1 ; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function
2 ; 2021-02-23: Igor Pavlov : Public domain
4 ; The suffix 3 is the code-compatibility version of the LzmaDec_DecodeReal_*()
5 ; function; it is checked at link time.
6 ; This code is tightly coupled with LzmaDec_TryDummy()
7 ; and with other functions in the LzmaDec.c file.
8 ; CLzmaDec structure, (probs) array layout, input and output of
9 ; LzmaDec_DecodeReal_*() must be equal in both versions (C / ASM).
13 ; .err <x64_IS_REQUIRED>
20 _TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE'
22 MY_ALIGN macro num:req
39 ; _LZMA_SIZE_OPT equ 1
46 mov dest, dword ptr [mem]
49 mov dword ptr [mem], src
54 movzx dest, word ptr [mem]
57 mov word ptr [mem], @CatStr(src, _W)
61 PMULT equ (1 SHL PSHIFT)
62 PMULT_HALF equ (1 SHL (PSHIFT - 1))
63 PMULT_2 equ (1 SHL (PSHIFT + 1))
65 kMatchSpecLen_Error_Data equ (1 SHL 9)
68 ; x1 pbPos / (prob) TREE
69 ; x2 probBranch / prm (MATCHED) / pbPos / cnt
73 ; x6 t1 NORM_CALC / probs_state / dist
74 ; x7 t0 NORM_CALC / prob2 IF_BIT_1
76 ; x9 match (MATCHED) / sym2 / dist2 / lpMask_reg
77 ; x10 kBitModelTotal_reg
79 ; x12 offs (MATCHED) / dic / len_temp
81 ; x14 bit (MATCHED) / dicPos
92 kBitModelTotal_reg equ x10
125 probs_state_R equ t1_R
145 kNumBitModelTotalBits equ 11
146 kBitModelTotal equ (1 SHL kNumBitModelTotalBits)
148 kBitModelOffset equ ((1 SHL kNumMoveBits) - 1)
149 kTopValue equ (1 SHL 24)
152 ; movzx t0, BYTE PTR [buf]
154 mov cod_L, BYTE PTR [buf]
169 ; ---------- Branch MACROS ----------
; UPDATE_0: adapts probBranch toward kBitModelTotal_reg and stores it back.
;   probsArray, probOffset, probDisp - together select the probability slot:
;                                      probOffset * 1 + probsArray + probDisp * PMULT
; Uses prob2 as scratch. IF_BIT_1 expands this macro right after IF_BIT_1_NOUP.
171 UPDATE_0 macro probsArray:req, probOffset:req, probDisp:req
172 mov prob2, kBitModelTotal_reg ; prob2 = kBitModelTotal_reg
173 sub prob2, probBranch ; prob2 = kBitModelTotal_reg - probBranch
174 shr prob2, kNumMoveBits ; prob2 >>= kNumMoveBits (size of the adaptation step)
175 add probBranch, prob2 ; probBranch += step
176 PSTORE probBranch, probOffset * 1 + probsArray + probDisp * PMULT ; store the adapted value into the slot
; UPDATE_1: counterpart of UPDATE_0. The instructions below reduce the
; probability by (probBranch >> kNumMoveBits) and store the result to the slot
;   probOffset * 1 + probsArray + probDisp * PMULT
180 UPDATE_1 macro probsArray:req, probOffset:req, probDisp:req
184 mov prob2, probBranch ; prob2 = probBranch
185 shr probBranch, kNumMoveBits ; probBranch = probBranch >> kNumMoveBits (the step)
186 sub prob2, probBranch ; prob2 = original probBranch - step
187 PSTORE prob2, probOffset * 1 + probsArray + probDisp * PMULT ; store the adapted value into the slot
; CMP_COD: loads the probability of the selected slot into probBranch and
; scales range by it.
;   probsArray, probOffset, probDisp - select the slot, same addressing as UPDATE_0 / UPDATE_1
191 CMP_COD macro probsArray:req, probOffset:req, probDisp:req
192 PLOAD probBranch, probOffset * 1 + probsArray + probDisp * PMULT ; probBranch = current probability of the slot
195 shr range, kNumBitModelTotalBits ; range >>= kNumBitModelTotalBits
196 imul range, probBranch ; range = (range >> kNumBitModelTotalBits) * probBranch
; IF_BIT_1_NOUP: expands CMP_COD for the selected slot; toLabel is the branch target.
; NOTE(review): "NOUP" presumably means "no probability update" - IF_BIT_1 is this
; macro followed by UPDATE_0 - confirm against the complete macro body.
201 IF_BIT_1_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
202 CMP_COD probsArray, probOffset, probDisp
; IF_BIT_1: IF_BIT_1_NOUP followed by UPDATE_0 on the same probability slot.
; All four arguments are forwarded unchanged; toLabel is only used by IF_BIT_1_NOUP.
207 IF_BIT_1 macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
208 IF_BIT_1_NOUP probsArray, probOffset, probDisp, toLabel
209 UPDATE_0 probsArray, probOffset, probDisp ; probability update for the path that continues here
; IF_BIT_0_NOUP: expands CMP_COD for the selected slot; toLabel is the branch target.
; NOTE(review): by its name this is the bit-0 counterpart of IF_BIT_1_NOUP and
; presumably leaves the probability update to the caller - confirm.
213 IF_BIT_0_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
214 CMP_COD probsArray, probOffset, probDisp
219 ; ---------- CMOV MACROS ----------
221 NORM_CALC macro prob:req
224 shr range, kNumBitModelTotalBits
232 PUP macro prob:req, probPtr:req
234 ; only sar works for both 16/32 bit prob modes
241 PUP_SUB macro prob:req, probPtr:req, symSub:req
; PUP_COD: selects the update constant in t0 and then expands PUP_SUB.
;   prob, probPtr, symSub - forwarded unchanged to PUP_SUB
247 PUP_COD macro prob:req, probPtr:req, symSub:req
248 mov t0, kBitModelOffset ; default constant (mov leaves the flags untouched)
251 cmovb t0, kBitModelTotal_reg ; if CF is set: t0 = kBitModelTotal_reg instead
252 PUP_SUB prob, probPtr, symSub
; BIT_0: first step over a probability table at probs; works on entries 1, 2 and 3.
;   prob     - register that receives entry 1
;   probNext - register that receives entry 2
256 BIT_0 macro prob:req, probNext:req
257 PLOAD prob, probs + 1 * PMULT ; prob = entry 1
258 PLOAD probNext, probs + 1 * PMULT_2 ; probNext = entry 2 (PMULT_2 is 2 * PMULT)
263 PLOAD t0, probs + 1 * PMULT_2 + PMULT ; t0 = entry 3
265 mov t0, kBitModelOffset ; default update constant (mov leaves the flags untouched)
267 cmovb t0, kBitModelTotal_reg ; if CF is set: t0 = kBitModelTotal_reg instead
269 PUP_SUB prob, probs + 1 * PMULT, 0 - 1 ; update entry 1
; BIT_1: intermediate step over the probability table at probs, indexed by sym_R.
;   prob     - register holding the probability being decoded
;   probNext - register that receives a following entry
273 BIT_1 macro prob:req, probNext:req
274 PLOAD probNext, probs + sym_R * PMULT_2 ; probNext = entry (2 * sym_R)
280 PLOAD t0, probs + sym_R * PMULT + PMULT ; t0 = entry (sym_R + 1), using sym_R's value at this point
282 PUP_COD prob, probs + t1_R * PMULT_HALF, 0 - 1 ; update the entry addressed through t1_R
; BIT_2: step that takes no probNext register; symSub is forwarded to PUP_COD.
286 BIT_2 macro prob:req, symSub:req
292 PUP_COD prob, probs + t1_R * PMULT_HALF, symSub ; update the entry addressed through t1_R
296 ; ---------- MATCHED LITERAL ----------
; The register table at the top of the file assigns match, offs, bit and prm
; to this (MATCHED) code.
299 mov offs, 256 * PMULT ; offs = 256 probability entries, as a byte offset
300 shl match, (PSHIFT + 1) ; bit 7 of the match value now lines up with the single bit set in offs
303 PLOAD x1, probs + 256 * PMULT + bit_R * 1 + 1 * PMULT ; x1 = entry 1 of the sub-table selected by bit_R
304 lea prm, [probs + 256 * PMULT + bit_R * 1 + 1 * PMULT] ; prm = address of the entry just loaded
305 ; lea prm, [probs + 256 * PMULT + 1 * PMULT]
315 mov t0, kBitModelOffset ; default update constant (mov leaves the flags untouched)
317 cmovb t0, kBitModelTotal_reg ; if CF is set: t0 = kBitModelTotal_reg instead
319 PUP_SUB x1, prm, -2-1
325 lea prm, [probs + offs_R * 1] ; prm = base of the sub-table selected by offs_R
327 PLOAD x1, prm + sym_R * PMULT ; x1 = entry sym_R of that sub-table
337 PUP_COD x1, prm + t1_R * PMULT_HALF, - 1
343 lea prm, [probs + offs_R * 1] ; prm = base of the sub-table selected by offs_R
345 PLOAD x1, prm + sym_R * PMULT ; x1 = entry sym_R of that sub-table
351 PUP_COD x1, prm + t1_R * PMULT_HALF, 256 - 1
355 ; ---------- REVERSE BITS ----------
; REV_0: first step of the reverse-bit decode over the table at probs.
;   prob     - register holding entry 1; the load is commented out below
;   probNext - register that receives the entry addressed by sym2_R
; NOTE(review): prob and sym2_R are presumably prepared by the caller; the
; DECODE DISTANCE code does "PLOAD x2, probs + 1 * PMULT" and
; "lea sym2_R, [probs + 2 * PMULT]" - confirm.
357 REV_0 macro prob:req, probNext:req
358 ; PLOAD prob, probs + 1 * PMULT
359 ; lea sym2_R, [probs + 2 * PMULT]
360 ; PLOAD probNext, probs + 2 * PMULT
361 PLOAD probNext, sym2_R ; probNext = entry addressed by sym2_R
366 PLOAD t0, probs + 3 * PMULT ; t0 = entry 3
369 mov t0, kBitModelOffset ; default update constant (mov leaves the flags untouched)
370 cmovb t0, kBitModelTotal_reg ; if CF is set: t0 = kBitModelTotal_reg instead
371 lea t1_R, [probs + 3 * PMULT] ; t1_R = address of entry 3
373 PUP prob, probs + 1 * PMULT ; update entry 1
; REV_1: reverse-bit decode step with a fixed stride.
;   prob     - register holding the probability being decoded
;   probNext - register that receives the entry at the advanced sym2_R
;   step     - stride in probability entries; scaled by PMULT in the addresses
377 REV_1 macro prob:req, probNext:req, step:req
378 add sym2_R, step * PMULT ; advance sym2_R by step entries
379 PLOAD probNext, sym2_R ; probNext = entry at the new sym2_R
384 PLOAD t0, sym2_R + step * PMULT ; t0 = entry one further step ahead
387 mov t0, kBitModelOffset ; default update constant (mov leaves the flags untouched)
388 cmovb t0, kBitModelTotal_reg ; if CF is set: t0 = kBitModelTotal_reg instead
389 lea t1_R, [sym2_R + step * PMULT] ; t1_R = sym2_R + step entries
391 PUP prob, t1_R - step * PMULT_2 ; address is 2 * step entries below t1_R
; REV_2: reverse-bit decode step with no probNext register.
;   prob - register holding the probability being decoded
;   step - stride parameter; not referenced by the instructions shown below
395 REV_2 macro prob:req, step:req
406 mov t0, kBitModelOffset ; default update constant (mov leaves the flags untouched)
407 cmovb t0, kBitModelTotal_reg ; if CF is set: t0 = kBitModelTotal_reg instead
408 PUP prob, probs + sym2_R * PMULT ; update the entry indexed by sym2_R
; REV_1_VAR: reverse-bit decode step whose stride comes from a register (sym2_R)
; rather than from a macro argument.
412 REV_1_VAR macro prob:req
420 lea t0_R, [sym_R + 1 * sym2_R] ; t0_R = sym_R + sym2_R
422 mov t0, kBitModelOffset ; default update constant (mov leaves the flags untouched)
424 ; mov t1, kBitModelTotal
426 cmovb t0, kBitModelTotal_reg ; if CF is set: t0 = kBitModelTotal_reg instead
; LIT_PROBS: moves probs to the Literal area and updates the IsMatch slot.
;   lpMaskParam - operand holding lpMask (see the formula below)
434 LIT_PROBS macro lpMaskParam:req
435 ; prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc);
440 add probs_state_R, pbPos_R ; fold pbPos into the pointer, so UPDATE_0 below is given offset 0
442 lea sym, dword ptr[sym_R + 2 * sym_R] ; sym = sym * 3 (the "(UInt32)3 *" factor of the formula)
443 add probs, Literal * PMULT ; probs -> start of the Literal area
446 UPDATE_0 probs_state_R, 0, IsMatch ; update the IsMatch slot (pbPos already added above)
; Size constants and layout of the probability array.
; The layout values (SpecPos ... Literal) are offsets in probability entries;
; the code multiplies them by PMULT to get byte offsets.
453 kNumPosStatesMax equ (1 SHL kNumPosBitsMax)
456 kLenNumLowSymbols equ (1 SHL kLenNumLowBits)
457 kLenNumHighBits equ 8
458 kLenNumHighSymbols equ (1 SHL kLenNumHighBits) ; = 256
459 kNumLenProbs equ (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols) ; entries in one length coder
463 LenChoice2 equ (LenLow + kLenNumLowSymbols)
464 LenHigh equ (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax)
470 kStartPosModelIndex equ 4
471 kEndPosModelIndex equ 14
472 kNumFullDistances equ (1 SHL (kEndPosModelIndex SHR 1)) ; = 1 SHL 7 = 128
474 kNumPosSlotBits equ 6
475 kNumLenToPosStates equ 4
478 kAlignTableSize equ (1 SHL kNumAlignBits)
481 kMatchSpecLenStart equ (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols)
; Layout offsets start at -kStartOffset. The procedure loads probs from the
; probs_1664 field, whose name matches kStartOffset.
483 kStartOffset equ 1664
484 SpecPos equ (-kStartOffset)
485 IsRep0Long equ (SpecPos + kNumFullDistances)
486 RepLenCoder equ (IsRep0Long + (kNumStates2 SHL kNumPosBitsMax))
487 LenCoder equ (RepLenCoder + kNumLenProbs)
488 IsMatch equ (LenCoder + kNumLenProbs)
489 kAlign equ (IsMatch + (kNumStates2 SHL kNumPosBitsMax))
490 IsRep equ (kAlign + kAlignTableSize)
491 IsRepG0 equ (IsRep + kNumStates)
492 IsRepG1 equ (IsRepG0 + kNumStates)
493 IsRepG2 equ (IsRepG1 + kNumStates)
494 PosSlot equ (IsRepG2 + kNumStates)
495 Literal equ (PosSlot + (kNumLenToPosStates SHL kNumPosSlotBits)) ; PosSlot area is 4 SHL 6 = 256 entries
496 NUM_BASE_PROBS equ (Literal + kStartOffset) ; entries from the start of the layout up to Literal
499 .err <Stop_Compiling_Bad_LZMA_kAlign>
; Assemble-time check: stop if the layout does not give 1984 base entries.
502 if NUM_BASE_PROBS ne 1984
503 .err <Stop_Compiling_Bad_LZMA_PROBS>
520 dicPos_Spec PTR_FIELD
525 processedPos_Spec dd ?
536 CLzmaDec_Asm_Loc struct
555 dicPos_Spec PTR_FIELD
560 CLzmaDec_Asm_Loc ends
; Field-access prefixes; each is followed by a field name at the point of use.
563 GLOB_2 equ [sym_R].CLzmaDec_Asm. ; CLzmaDec_Asm fields, base pointer in sym_R
564 GLOB equ [r1].CLzmaDec_Asm. ; CLzmaDec_Asm fields, base pointer in r1
565 LOC_0 equ [r0].CLzmaDec_Asm_Loc. ; CLzmaDec_Asm_Loc fields, base pointer in r0
566 LOC equ [RSP].CLzmaDec_Asm_Loc. ; CLzmaDec_Asm_Loc fields, base pointer in RSP
575 RESTORE_VAR macro name
; IsMatchBranch_Pre: prepares pbPos and probs_state_R, the two address
; components that IsMatchBranch passes to IF_BIT_1 for the IsMatch slot.
582 IsMatchBranch_Pre macro reg
583 ; prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;
584 mov pbPos, LOC pbMask
585 and pbPos, processedPos ; pbPos = processedPos & pbMask
586 shl pbPos, (kLenNumLowBits + 1 + PSHIFT) ; scale it into a byte offset
587 lea probs_state_R, [probs + 1 * state_R] ; probs_state_R = probs + state_R (state_R is added unscaled)
; IsMatchBranch: decodes the IsMatch bit for the slot addressed by
; probs_state_R + pbPos_R; IsMatch_label is the target passed to IF_BIT_1.
591 IsMatchBranch macro reg
593 IF_BIT_1 probs_state_R, pbPos_R, IsMatch, IsMatch_label
; CheckLimits: compares the input pointer and the output position against the
; limits kept in the local frame (bufLimit and limit).
597 CheckLimits macro reg
598 cmp buf, LOC bufLimit ; input pointer vs. bufLimit
600 cmp dicPos, LOC limit ; output position vs. limit
606 ; RSP is (16x + 8) bytes aligned in WIN64-x64
607 ; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8)
; The three arguments of LzmaDec_DecodeReal_3, mapped to the ABI parameter registers.
609 PARAM_lzma equ REG_ABI_PARAM_0 ; stored to lzmaPtr and used as the GLOB_2 base at procedure entry
610 PARAM_limit equ REG_ABI_PARAM_1 ; stored to the local field limit
611 PARAM_bufLimit equ REG_ABI_PARAM_2 ; stored to the local field bufLimit
; LzmaDec_DecodeReal_3 entry: saves the ABI-preserved registers, fills the
; CLzmaDec_Asm_Loc frame addressed through r0 (LOC_0), and loads the decoder
; state from the CLzmaDec_Asm structure addressed through sym_R (GLOB_2).
614 MY_PROC LzmaDec_DecodeReal_3, 3
615 MY_PUSH_PRESERVED_ABI_REGS
617 lea r0, [RSP - (SIZEOF CLzmaDec_Asm_Loc)] ; r0 = RSP minus the size of the local frame
621 mov LOC_0 Old_RSP, r5 ; Old_RSP is reloaded into RSP before the preserved registers are popped
622 mov LOC_0 lzmaPtr, PARAM_lzma
624 mov LOC_0 remainLen, 0 ; remainLen must be ZERO
626 mov LOC_0 bufLimit, PARAM_bufLimit
627 mov sym_R, PARAM_lzma ; CLzmaDec_Asm pointer for GLOB_2
628 mov dic, GLOB_2 dic_Spec
630 mov LOC_0 limit, PARAM_limit
637 mov dicPos, GLOB_2 dicPos_Spec
639 mov LOC_0 dicPos_Spec, dicPos
640 mov LOC_0 dic_Spec, dic
648 ; unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1;
649 ; unsigned lc = p->prop.lc;
650 ; unsigned lpMask = ((unsigned)0x100 << p->prop.lp) - ((unsigned)0x100 >> lc);
665 ; mov probs, GLOB_2 probs_Spec
666 ; add probs, kStartOffset SHL PSHIFT
667 mov probs, GLOB_2 probs_1664 ; NOTE(review): presumably already biased by kStartOffset entries, replacing the two commented-out lines above - confirm
668 mov LOC_0 probs_Spec, probs ; saved copy; the decode paths reload probs from it
670 mov t0_R, GLOB_2 dicBufSize
671 mov LOC_0 dicBufSize, t0_R
673 mov x1, GLOB_2 checkDicSize
674 mov LOC_0 checkDicSize, x1
676 mov processedPos, GLOB_2 processedPos_Spec
678 mov state, GLOB_2 state_Spec
681 mov buf, GLOB_2 buf_Spec
682 mov range, GLOB_2 range_Spec
683 mov cod, GLOB_2 code_Spec
684 mov kBitModelTotal_reg, kBitModelTotal ; kept in a register: it is the source operand of the cmovb instructions and of UPDATE_0
687 ; if (processedPos != 0 || checkDicSize != 0)
694 movzx sym, byte ptr[t0_R - 1] ; sym = byte just below the address in t0_R
700 cmp state, kNumLitStates * PMULT ; state is compared against constants scaled by PMULT
707 ; ---------- LITERAL ----------
716 PLOAD x1, probs + 1 * PMULT ; x1 = entry 1 of the table at probs
739 ; mov dic, LOC dic_Spec
740 mov probs, LOC probs_Spec ; restore the base probs pointer from the local frame
742 mov byte ptr[dicPos], sym_L ; write the low byte of sym to the address in dicPos
747 IF_BIT_0_NOUP probs_state_R, pbPos_R, IsMatch, lit_start ; IsMatch test without update; lit_start is the branch target
751 ; ---------- MATCHES ----------
754 UPDATE_1 probs_state_R, pbPos_R, IsMatch ; update the IsMatch slot
755 IF_BIT_1 probs_state_R, 0, IsRep, IsRep_label ; IsRep test; IsRep_label is the branch target
757 add probs, LenCoder * PMULT ; probs -> LenCoder area
758 add state, kNumStates * PMULT ; state += kNumStates (scaled by PMULT)
760 ; ---------- LEN DECODE ----------
; len_temp is loaded with a different constant before each of the three length paths.
762 mov len_temp, 8 - 1 - kMatchMinLen
763 IF_BIT_0_NOUP probs, 0, 0, len_mid_0 ; first choice bit, slot at probs
765 add probs, (1 SHL (kLenNumLowBits + PSHIFT)) ; advance by kLenNumLowSymbols entries
766 mov len_temp, -1 - kMatchMinLen
767 IF_BIT_0_NOUP probs, 0, 0, len_mid_0 ; second choice bit, slot at the advanced probs
769 add probs, LenHigh * PMULT - (1 SHL (kLenNumLowBits + PSHIFT)) ; undo the advance above and move to LenHigh
771 PLOAD x1, probs + 1 * PMULT ; x1 = entry 1 of the table at probs
780 mov len_temp, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen
781 jmp short len_mid_2 ; "short" is written explicitly because MASM does not shorten this jump by itself as other assemblers do
791 mov probs, LOC probs_Spec ; restore the base probs pointer from the local frame
792 cmp state, kNumStates * PMULT
796 ; ---------- DECODE DISTANCE ----------
797 ; probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);
799 mov t0, 3 + kMatchMinLen ; cap value: (kNumLenToPosStates - 1) + kMatchMinLen
800 cmp sym, 3 + kMatchMinLen
802 add probs, PosSlot * PMULT - (kMatchMinLen SHL (kNumPosSlotBits + PSHIFT)) ; PosSlot base, minus the scaled kMatchMinLen bias
803 shl t0, (kNumPosSlotBits + PSHIFT) ; scale by the size of one PosSlot table
807 ; mov LOC remainLen, sym
812 PLOAD x1, probs + 1 * PMULT ; x1 = entry 1 of the table at probs
835 mov probs, LOC probs_Spec ; restore the base probs pointer from the local frame
836 cmp x1, 32 + kEndPosModelIndex / 2
839 ; unsigned numDirectBits = (unsigned)(((distance >> 1) - 1));
840 sub x1, (32 + 1 + kNumAlignBits)
841 ; distance = (2 | (distance & 1));
843 PLOAD x2, probs + 1 * PMULT ; x2 = entry 1 of the table at probs
844 shl sym, kNumAlignBits + 1
845 lea sym2_R, [probs + 2 * PMULT] ; sym2_R = address of entry 2
848 ; lea t1, [sym_R + (1 SHL kNumAlignBits)]
849 ; cmp range, kTopValue
852 ; ---------- DIRECT DISTANCE ----------
865 lea sym, dword ptr [r2 + sym_R * 2 + 1] ; sym = r2 + 2 * sym + 1
874 lea t1, [sym_R + (1 SHL kNumAlignBits)]
876 jae near ptr direct_loop
877 ; the "near ptr" form of the jump above is used to get 32-byte alignment here
884 ; distance <<= kNumAlignBits;
892 ; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize))
898 mov t0, LOC checkDicSize
900 cmove t0, processedPos ; if ZF is set: t0 = processedPos instead
903 ; jmp end_of_payload ; for debug
908 ; rep0 = distance + 1;
912 ; mov sym, LOC remainLen
918 ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
919 cmp state, (kNumStates + kNumLitStates) * PMULT
920 mov state, kNumLitStates * PMULT ; first alternative; mov leaves the flags of the cmp intact
921 mov t0, (kNumLitStates + 3) * PMULT ; second alternative, held in t0
925 ; ---------- COPY MATCH ----------
928 ; len += kMatchMinLen;
929 ; add sym, kMatchMinLen
931 ; if ((rem = limit - dicPos) == 0)
933 ; p->dicPos = dicPos;
934 ; return SZ_ERROR_DATA;
940 ; curLen = ((rem < len) ? (unsigned)rem : len);
942 ; cmovae cnt_R, sym_R ; 64-bit
943 cmovae cnt, sym ; 32-bit
945 mov dic, LOC dic_Spec ; reload the dictionary base from the local frame
950 ; processedPos += curLen;
951 add processedPos, cnt
954 mov LOC remainLen, sym ; store sym to the remainLen local
958 ; pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0);
962 mov r1, LOC dicBufSize
968 ; if (curLen <= dicBufSize - pos)
970 ; ---------- COPY MATCH FAST ----------
971 ; Byte *dest = dic + dicPos;
973 ; ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos;
977 ; const Byte *lim = dest + curLen;
979 movzx sym, byte ptr[t0_R] ; load a source byte through t0_R
982 ; lea r1, [dicPos - 1]
997 mov byte ptr[cnt_R * 1 + dicPos], sym_L ; store the byte at dicPos + cnt_R
998 movzx sym, byte ptr[cnt_R * 1 + t0_R] ; load the next byte from t0_R + cnt_R
1004 mov byte ptr[dicPos], sym_L ; store the low byte of sym at dicPos
1010 IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label ; IsMatch test without update
1014 ; ---------- LITERAL MATCHED ----------
1016 LIT_PROBS LOC lpMask ; lpMask is taken from the local frame here
1018 ; matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
1020 ; mov dic, LOC dic_Spec
1021 mov LOC dicPos_Spec, dicPos ; save dicPos; it is reloaded from dicPos_Spec further down
1023 ; state -= (state < 10) ? 3 : 6;
1024 lea t0, [state_R - 6 * PMULT] ; t0 = state - 6 (scaled by PMULT)
1025 sub state, 3 * PMULT ; state -= 3 (scaled by PMULT)
1026 cmp state, 7 * PMULT ; the original "state < 10" test, after the subtraction of 3
1032 add dicPos, LOC dicBufSize
1037 cmovb t0_R, LOC dicBufSize ; if CF is set: t0_R = dicBufSize
1040 movzx match, byte ptr[dic + dicPos * 1] ; match = byte at dic + dicPos
1042 ifdef _LZMA_SIZE_OPT
1044 mov offs, 256 * PMULT
1045 shl match, (PSHIFT + 1)
1068 mov probs, LOC probs_Spec ; restore the base probs pointer from the local frame
1070 ; mov dic, LOC dic_Spec
1071 mov dicPos, LOC dicPos_Spec ; restore dicPos saved above
1072 mov byte ptr[dicPos], sym_L ; store the low byte of sym at dicPos
1077 IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label ; IsMatch test without update
1079 mov lpMask_reg, LOC lpMask
1080 sub state, 3 * PMULT
1085 ; ---------- REP 0 LITERAL ----------
1088 UPDATE_0 probs_state_R, pbPos_R, IsRep0Long ; update the IsRep0Long slot
1090 ; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
1091 mov dic, LOC dic_Spec ; reload the dictionary base from the local frame
1093 mov probBranch, LOC rep0 ; probBranch is reused here to hold rep0
1096 sub probs, RepLenCoder * PMULT ; remove the RepLenCoder offset from probs
1098 ; state = state < kNumLitStates ? 9 : 11;
1101 ; the caller doesn't allow (dicPos >= limit) case for REP_SHORT
1102 ; so we don't need the following (dicPos == limit) check here:
1103 ; cmp dicPos, LOC limit
1104 ; jae fin_dicPos_LIMIT_REP_SHORT
1111 ; sub t0_R, probBranch_R
1112 ; cmovb sym_R, LOC dicBufSize
1114 sub t0_R, probBranch_R ; t0_R -= rep0
1116 add t0_R, LOC dicBufSize
1118 movzx sym, byte ptr[dic + t0_R * 1] ; sym = byte at dic + t0_R
1124 UPDATE_1 probs_state_R, 0, IsRep ; update the IsRep slot
1126 ; The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode.
1127 ; So we don't check it here.
1129 ; mov t0, processedPos
1130 ; or t0, LOC checkDicSize
1133 ; state = state < kNumLitStates ? 8 : 11;
1134 cmp state, kNumLitStates * PMULT
1135 mov state, 8 * PMULT ; value for state < kNumLitStates; mov leaves the flags of the cmp intact
1136 mov probBranch, 11 * PMULT ; other value, held in probBranch as scratch
1137 cmovae state, probBranch ; original state >= kNumLitStates: state = 11 (scaled)
1139 ; prob = probs + RepLenCoder;
1140 add probs, RepLenCoder * PMULT
1142 IF_BIT_1 probs_state_R, 0, IsRepG0, IsRepG0_label
1143 IF_BIT_0_NOUP probs_state_R, pbPos_R, IsRep0Long, IsRep0Short_label
1144 UPDATE_1 probs_state_R, pbPos_R, IsRep0Long
1149 UPDATE_1 probs_state_R, 0, IsRepG0
1154 IF_BIT_1 probs_state_R, 0, IsRepG1, IsRepG1_label
1160 UPDATE_1 probs_state_R, 0, IsRepG1
1164 IF_BIT_1 probs_state_R, 0, IsRepG2, IsRepG2_label
1170 UPDATE_1 probs_state_R, 0, IsRepG2
1178 ; ---------- SPEC SHORT DISTANCE ----------
1186 lea sym_R, [probs + sym_R * PMULT + SpecPos * PMULT + 1 * PMULT] ; sym_R = address of entry (SpecPos + sym + 1)
1187 mov sym2, PMULT ; step
1194 mov probs, LOC probs_Spec ; restore the base probs pointer from the local frame
1196 sub sym, SpecPos * PMULT ; remove the SpecPos * PMULT bias from sym
1203 ; ---------- COPY MATCH CROSS ----------
1206 ; r1 - len to dicBufSize
1207 ; cnt_R - total copy len
1209 mov t1_R, t0_R ; srcPos
1211 mov r1, LOC dicBufSize ;
1214 movzx sym, byte ptr[t1_R * 1 + t0_R] ; load a source byte from t0_R + t1_R
1216 mov byte ptr[cnt_R * 1 + dicPos], sym_L ; store the byte at dicPos + cnt_R
1221 movzx sym, byte ptr[t0_R] ; load a source byte through t0_R
1228 ; fin_dicPos_LIMIT_REP_SHORT:
1232 mov LOC remainLen, sym ; store sym to the remainLen local
1234 ; For more strict mode we can stop decoding with error
1239 fin_ERROR_MATCH_DIST:
1244 ; rep0 = distance + 1;
1246 add len_temp, kMatchSpecLen_Error_Data ; add the kMatchSpecLen_Error_Data value (1 SHL 9) to the length
1247 mov LOC remainLen, len_temp
1254 ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
1255 cmp state, (kNumStates + kNumLitStates) * PMULT
1256 mov state, kNumLitStates * PMULT ; first alternative; mov leaves the flags of the cmp intact
1257 mov t0, (kNumLitStates + 3) * PMULT ; second alternative, held in t0
1266 jnz fin_ERROR_MATCH_DIST
1268 mov LOC remainLen, kMatchSpecLenStart
1269 sub state, kNumStates * PMULT ; state -= kNumStates (scaled by PMULT)
; Exit path: write the register-held decoder state back to the CLzmaDec_Asm
; structure (GLOB, addressed through r1), restore RSP and the preserved registers.
1279 sub dicPos, LOC dic_Spec ; subtract the saved dic_Spec value from dicPos before it is stored
1280 mov GLOB dicPos_Spec, dicPos
1281 mov GLOB buf_Spec, buf
1282 mov GLOB range_Spec, range
1283 mov GLOB code_Spec, cod
1285 mov GLOB state_Spec, state
1286 mov GLOB processedPos_Spec, processedPos
1288 RESTORE_VAR(remainLen)
1296 mov RSP, LOC Old_RSP ; restore the stack pointer saved at entry
1298 MY_POP_PRESERVED_ABI_REGS
1301 _TEXT$LZMADECOPT ENDS
1301 _TEXT$LZMADECOPT ENDS