--- /dev/null
+; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function\r
+; 2021-02-23: Igor Pavlov : Public domain\r
+;\r
+; 3 is the code compatibility version of the LzmaDec_DecodeReal_*()\r
+; function, checked at link time.\r
+; This code is tightly coupled with LzmaDec_TryDummy()\r
+; and with other functions in the LzmaDec.c file.\r
+; The CLzmaDec structure, the (probs) array layout, and the input and output\r
+; of LzmaDec_DecodeReal_*() must be identical in both versions (C / ASM).\r
+\r
+ifndef x64\r
+; x64=1\r
+; .err <x64_IS_REQUIRED>\r
+endif\r
+\r
+include 7zAsm.asm\r
+\r
+MY_ASM_START\r
+\r
+_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE'\r
+\r
+MY_ALIGN macro num:req\r
+ align num\r
+endm\r
+\r
+MY_ALIGN_16 macro\r
+ MY_ALIGN 16\r
+endm\r
+\r
+MY_ALIGN_32 macro\r
+ MY_ALIGN 32\r
+endm\r
+\r
+MY_ALIGN_64 macro\r
+ MY_ALIGN 64\r
+endm\r
+\r
+\r
+; _LZMA_SIZE_OPT equ 1\r
+\r
+; _LZMA_PROB32 equ 1\r
+\r
+ifdef _LZMA_PROB32\r
+ PSHIFT equ 2\r
+ PLOAD macro dest, mem\r
+ mov dest, dword ptr [mem]\r
+ endm\r
+ PSTORE macro src, mem\r
+ mov dword ptr [mem], src\r
+ endm\r
+else\r
+ PSHIFT equ 1\r
+ PLOAD macro dest, mem\r
+ movzx dest, word ptr [mem]\r
+ endm\r
+ PSTORE macro src, mem\r
+ mov word ptr [mem], @CatStr(src, _W)\r
+ endm\r
+endif\r
+\r
+PMULT equ (1 SHL PSHIFT)\r
+PMULT_HALF equ (1 SHL (PSHIFT - 1))\r
+PMULT_2 equ (1 SHL (PSHIFT + 1))\r
+\r
+kMatchSpecLen_Error_Data equ (1 SHL 9)\r
+\r
+; x0 range\r
+; x1 pbPos / (prob) TREE\r
+; x2 probBranch / prm (MATCHED) / pbPos / cnt\r
+; x3 sym\r
+;====== r4 === RSP\r
+; x5 cod\r
+; x6 t1 NORM_CALC / probs_state / dist\r
+; x7 t0 NORM_CALC / prob2 IF_BIT_1\r
+; x8 state\r
+; x9 match (MATCHED) / sym2 / dist2 / lpMask_reg\r
+; x10 kBitModelTotal_reg\r
+; r11 probs\r
+; x12 offs (MATCHED) / dic / len_temp\r
+; x13 processedPos\r
+; x14 bit (MATCHED) / dicPos\r
+; r15 buf\r
+\r
+\r
+cod equ x5\r
+cod_L equ x5_L\r
+range equ x0\r
+state equ x8\r
+state_R equ r8\r
+buf equ r15\r
+processedPos equ x13\r
+kBitModelTotal_reg equ x10\r
+\r
+probBranch equ x2\r
+probBranch_R equ r2\r
+probBranch_W equ x2_W\r
+\r
+pbPos equ x1\r
+pbPos_R equ r1\r
+\r
+cnt equ x2\r
+cnt_R equ r2\r
+\r
+lpMask_reg equ x9\r
+dicPos equ r14\r
+\r
+sym equ x3\r
+sym_R equ r3\r
+sym_L equ x3_L\r
+\r
+probs equ r11\r
+dic equ r12\r
+\r
+t0 equ x7\r
+t0_W equ x7_W\r
+t0_R equ r7\r
+\r
+prob2 equ t0\r
+prob2_W equ t0_W\r
+\r
+t1 equ x6\r
+t1_R equ r6\r
+\r
+probs_state equ t1\r
+probs_state_R equ t1_R\r
+\r
+prm equ r2\r
+match equ x9\r
+match_R equ r9\r
+offs equ x12\r
+offs_R equ r12\r
+bit equ x14\r
+bit_R equ r14\r
+\r
+sym2 equ x9\r
+sym2_R equ r9\r
+\r
+len_temp equ x12\r
+\r
+dist equ sym\r
+dist2 equ x9\r
+\r
+\r
+\r
+kNumBitModelTotalBits equ 11\r
+kBitModelTotal equ (1 SHL kNumBitModelTotalBits)\r
+kNumMoveBits equ 5\r
+kBitModelOffset equ ((1 SHL kNumMoveBits) - 1)\r
+kTopValue equ (1 SHL 24)\r
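+\r
+; NORM_2 / NORM below are the range-coder normalization step; roughly, in C (cf. NORMALIZE in LzmaDec.c):\r
+;   if (range < kTopValue) { range <<= 8; code = (code << 8) | (*buf++); }\r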
+\r
+NORM_2 macro\r
+ ; movzx t0, BYTE PTR [buf]\r
+ shl cod, 8\r
+ mov cod_L, BYTE PTR [buf]\r
+ shl range, 8\r
+ ; or cod, t0\r
+ inc buf\r
+endm\r
+\r
+\r
+NORM macro\r
+ cmp range, kTopValue\r
+ jae SHORT @F\r
+ NORM_2\r
+@@:\r
+endm\r
+\r
+\r
+; ---------- Branch MACROS ----------\r
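+; Branch-based bit decode; roughly, in C (cf. IF_BIT_0 / UPDATE_0 / UPDATE_1 in LzmaDec.c):\r
+;   bound = (range >> kNumBitModelTotalBits) * prob;\r
+;   bit 0 (code <  bound): range  = bound;                 prob += (kBitModelTotal - prob) >> kNumMoveBits;\r
+;   bit 1 (code >= bound): range -= bound; code -= bound;  prob -= prob >> kNumMoveBits;\r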
+\r
+UPDATE_0 macro probsArray:req, probOffset:req, probDisp:req\r
+ mov prob2, kBitModelTotal_reg\r
+ sub prob2, probBranch\r
+ shr prob2, kNumMoveBits\r
+ add probBranch, prob2\r
+ PSTORE probBranch, probOffset * 1 + probsArray + probDisp * PMULT\r
+endm\r
+\r
+\r
+UPDATE_1 macro probsArray:req, probOffset:req, probDisp:req\r
+ sub prob2, range\r
+ sub cod, range\r
+ mov range, prob2\r
+ mov prob2, probBranch\r
+ shr probBranch, kNumMoveBits\r
+ sub prob2, probBranch\r
+ PSTORE prob2, probOffset * 1 + probsArray + probDisp * PMULT\r
+endm\r
+\r
+\r
+CMP_COD macro probsArray:req, probOffset:req, probDisp:req\r
+ PLOAD probBranch, probOffset * 1 + probsArray + probDisp * PMULT\r
+ NORM\r
+ mov prob2, range\r
+ shr range, kNumBitModelTotalBits\r
+ imul range, probBranch\r
+ cmp cod, range\r
+endm\r
+\r
+\r
+IF_BIT_1_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req\r
+ CMP_COD probsArray, probOffset, probDisp\r
+ jae toLabel\r
+endm\r
+\r
+\r
+IF_BIT_1 macro probsArray:req, probOffset:req, probDisp:req, toLabel:req\r
+ IF_BIT_1_NOUP probsArray, probOffset, probDisp, toLabel\r
+ UPDATE_0 probsArray, probOffset, probDisp\r
+endm\r
+\r
+\r
+IF_BIT_0_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req\r
+ CMP_COD probsArray, probOffset, probDisp\r
+ jb toLabel\r
+endm\r
+\r
+\r
+; ---------- CMOV MACROS ----------\r
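+; Branchless variant of the same bit decode: NORM_CALC leaves range = bound, t0 = old_range - bound,\r
+; t1 = old code and cod = code - bound, so the carry of (code - bound) drives the cmov chains.\r
+; PUP* then add ((kBitModelTotal or kBitModelOffset) - prob) >> kNumMoveBits, which equals the usual\r
+; +((kBitModelTotal - prob) >> kNumMoveBits) for a 0 bit and -(prob >> kNumMoveBits) for a 1 bit.\r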
+\r
+NORM_CALC macro prob:req\r
+ NORM\r
+ mov t0, range\r
+ shr range, kNumBitModelTotalBits\r
+ imul range, prob\r
+ sub t0, range\r
+ mov t1, cod\r
+ sub cod, range\r
+endm\r
+\r
+\r
+PUP macro prob:req, probPtr:req\r
+ sub t0, prob\r
+ ; only sar works for both 16/32 bit prob modes\r
+ sar t0, kNumMoveBits\r
+ add t0, prob\r
+ PSTORE t0, probPtr\r
+endm\r
+\r
+\r
+PUP_SUB macro prob:req, probPtr:req, symSub:req\r
+ sbb sym, symSub\r
+ PUP prob, probPtr\r
+endm\r
+\r
+\r
+PUP_COD macro prob:req, probPtr:req, symSub:req\r
+ mov t0, kBitModelOffset\r
+ cmovb cod, t1\r
+ mov t1, sym\r
+ cmovb t0, kBitModelTotal_reg\r
+ PUP_SUB prob, probPtr, symSub\r
+endm\r
+\r
+\r
+BIT_0 macro prob:req, probNext:req\r
+ PLOAD prob, probs + 1 * PMULT\r
+ PLOAD probNext, probs + 1 * PMULT_2\r
+\r
+ NORM_CALC prob\r
+ \r
+ cmovae range, t0\r
+ PLOAD t0, probs + 1 * PMULT_2 + PMULT\r
+ cmovae probNext, t0\r
+ mov t0, kBitModelOffset\r
+ cmovb cod, t1\r
+ cmovb t0, kBitModelTotal_reg\r
+ mov sym, 2\r
+ PUP_SUB prob, probs + 1 * PMULT, 0 - 1\r
+endm\r
+\r
+\r
+BIT_1 macro prob:req, probNext:req\r
+ PLOAD probNext, probs + sym_R * PMULT_2\r
+ add sym, sym\r
+ \r
+ NORM_CALC prob\r
+ \r
+ cmovae range, t0\r
+ PLOAD t0, probs + sym_R * PMULT + PMULT\r
+ cmovae probNext, t0\r
+ PUP_COD prob, probs + t1_R * PMULT_HALF, 0 - 1\r
+endm\r
+\r
+\r
+BIT_2 macro prob:req, symSub:req\r
+ add sym, sym\r
+\r
+ NORM_CALC prob\r
+ \r
+ cmovae range, t0\r
+ PUP_COD prob, probs + t1_R * PMULT_HALF, symSub\r
+endm\r
+\r
+\r
+; ---------- MATCHED LITERAL ----------\r
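+; Literal decoded with the match byte as context: while the decoded bits still agree with the match\r
+; byte, the literal prob index gets an extra +0x100 entries (+0x200 when the current match-byte bit\r
+; is 1); after the first mismatch, offs becomes 0 and decoding continues in the plain literal table\r
+; (cf. the matched-literal loop in LzmaDec.c).\r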
+\r
+LITM_0 macro\r
+ mov offs, 256 * PMULT\r
+ shl match, (PSHIFT + 1)\r
+ mov bit, offs\r
+ and bit, match\r
+ PLOAD x1, probs + 256 * PMULT + bit_R * 1 + 1 * PMULT\r
+ lea prm, [probs + 256 * PMULT + bit_R * 1 + 1 * PMULT]\r
+ ; lea prm, [probs + 256 * PMULT + 1 * PMULT]\r
+ ; add prm, bit_R\r
+ xor offs, bit\r
+ add match, match\r
+\r
+ NORM_CALC x1\r
+\r
+ cmovae offs, bit\r
+ mov bit, match\r
+ cmovae range, t0\r
+ mov t0, kBitModelOffset\r
+ cmovb cod, t1\r
+ cmovb t0, kBitModelTotal_reg\r
+ mov sym, 0\r
+ PUP_SUB x1, prm, -2-1\r
+endm\r
+\r
+\r
+LITM macro\r
+ and bit, offs\r
+ lea prm, [probs + offs_R * 1]\r
+ add prm, bit_R\r
+ PLOAD x1, prm + sym_R * PMULT\r
+ xor offs, bit\r
+ add sym, sym\r
+ add match, match\r
+\r
+ NORM_CALC x1\r
+\r
+ cmovae offs, bit\r
+ mov bit, match\r
+ cmovae range, t0\r
+ PUP_COD x1, prm + t1_R * PMULT_HALF, - 1\r
+endm\r
+\r
+\r
+LITM_2 macro\r
+ and bit, offs\r
+ lea prm, [probs + offs_R * 1]\r
+ add prm, bit_R\r
+ PLOAD x1, prm + sym_R * PMULT\r
+ add sym, sym\r
+\r
+ NORM_CALC x1\r
+\r
+ cmovae range, t0\r
+ PUP_COD x1, prm + t1_R * PMULT_HALF, 256 - 1\r
+endm\r
+\r
+\r
+; ---------- REVERSE BITS ----------\r
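+; Reverse bit-tree decode (used for the kAlign probs and the low SpecPos distance bits): bits are\r
+; decoded low-to-high, each decoded bit selecting the next prob and entering the result at increasing\r
+; significance (cf. REV_BIT_VAR / REV_BIT_CONST / REV_BIT_LAST in LzmaDec.c).\r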
+\r
+REV_0 macro prob:req, probNext:req\r
+ ; PLOAD prob, probs + 1 * PMULT\r
+ ; lea sym2_R, [probs + 2 * PMULT]\r
+ ; PLOAD probNext, probs + 2 * PMULT\r
+ PLOAD probNext, sym2_R\r
+\r
+ NORM_CALC prob\r
+\r
+ cmovae range, t0\r
+ PLOAD t0, probs + 3 * PMULT\r
+ cmovae probNext, t0\r
+ cmovb cod, t1\r
+ mov t0, kBitModelOffset\r
+ cmovb t0, kBitModelTotal_reg\r
+ lea t1_R, [probs + 3 * PMULT]\r
+ cmovae sym2_R, t1_R\r
+ PUP prob, probs + 1 * PMULT\r
+endm\r
+\r
+\r
+REV_1 macro prob:req, probNext:req, step:req\r
+ add sym2_R, step * PMULT\r
+ PLOAD probNext, sym2_R\r
+\r
+ NORM_CALC prob\r
+\r
+ cmovae range, t0\r
+ PLOAD t0, sym2_R + step * PMULT\r
+ cmovae probNext, t0\r
+ cmovb cod, t1\r
+ mov t0, kBitModelOffset\r
+ cmovb t0, kBitModelTotal_reg\r
+ lea t1_R, [sym2_R + step * PMULT]\r
+ cmovae sym2_R, t1_R\r
+ PUP prob, t1_R - step * PMULT_2\r
+endm\r
+\r
+\r
+REV_2 macro prob:req, step:req\r
+ sub sym2_R, probs\r
+ shr sym2, PSHIFT\r
+ or sym, sym2\r
+\r
+ NORM_CALC prob\r
+\r
+ cmovae range, t0\r
+ lea t0, [sym - step]\r
+ cmovb sym, t0\r
+ cmovb cod, t1\r
+ mov t0, kBitModelOffset\r
+ cmovb t0, kBitModelTotal_reg\r
+ PUP prob, probs + sym2_R * PMULT\r
+endm\r
+\r
+\r
+REV_1_VAR macro prob:req\r
+ PLOAD prob, sym_R\r
+ mov probs, sym_R\r
+ add sym_R, sym2_R\r
+\r
+ NORM_CALC prob\r
+\r
+ cmovae range, t0\r
+ lea t0_R, [sym_R + 1 * sym2_R]\r
+ cmovae sym_R, t0_R\r
+ mov t0, kBitModelOffset\r
+ cmovb cod, t1\r
+ ; mov t1, kBitModelTotal\r
+ ; cmovb t0, t1\r
+ cmovb t0, kBitModelTotal_reg\r
+ add sym2, sym2\r
+ PUP prob, probs\r
+endm\r
+\r
+\r
+\r
+\r
+LIT_PROBS macro lpMaskParam:req\r
+ ; prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc);\r
+ mov t0, processedPos\r
+ shl t0, 8\r
+ add sym, t0\r
+ and sym, lpMaskParam\r
+ add probs_state_R, pbPos_R\r
+ mov x1, LOC lc2\r
+ lea sym, dword ptr[sym_R + 2 * sym_R]\r
+ add probs, Literal * PMULT\r
+ shl sym, x1_L\r
+ add probs, sym_R\r
+ UPDATE_0 probs_state_R, 0, IsMatch\r
+ inc processedPos\r
+endm\r
+\r
+\r
+\r
+kNumPosBitsMax equ 4\r
+kNumPosStatesMax equ (1 SHL kNumPosBitsMax)\r
+\r
+kLenNumLowBits equ 3\r
+kLenNumLowSymbols equ (1 SHL kLenNumLowBits)\r
+kLenNumHighBits equ 8\r
+kLenNumHighSymbols equ (1 SHL kLenNumHighBits)\r
+kNumLenProbs equ (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols)\r
+\r
+LenLow equ 0\r
+LenChoice equ LenLow\r
+LenChoice2 equ (LenLow + kLenNumLowSymbols)\r
+LenHigh equ (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax)\r
+\r
+kNumStates equ 12\r
+kNumStates2 equ 16\r
+kNumLitStates equ 7\r
+\r
+kStartPosModelIndex equ 4\r
+kEndPosModelIndex equ 14\r
+kNumFullDistances equ (1 SHL (kEndPosModelIndex SHR 1))\r
+\r
+kNumPosSlotBits equ 6\r
+kNumLenToPosStates equ 4\r
+\r
+kNumAlignBits equ 4\r
+kAlignTableSize equ (1 SHL kNumAlignBits)\r
+\r
+kMatchMinLen equ 2\r
+kMatchSpecLenStart equ (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols)\r
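+\r
+; The offsets below are relative to p->probs_1664 (the probs pointer advanced by kStartOffset entries)\r
+; and must match the probs-array layout in LzmaDec.c; the two .err checks further down guard that at\r
+; assembly time.\r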
+\r
+kStartOffset equ 1664\r
+SpecPos equ (-kStartOffset)\r
+IsRep0Long equ (SpecPos + kNumFullDistances)\r
+RepLenCoder equ (IsRep0Long + (kNumStates2 SHL kNumPosBitsMax))\r
+LenCoder equ (RepLenCoder + kNumLenProbs)\r
+IsMatch equ (LenCoder + kNumLenProbs)\r
+kAlign equ (IsMatch + (kNumStates2 SHL kNumPosBitsMax))\r
+IsRep equ (kAlign + kAlignTableSize)\r
+IsRepG0 equ (IsRep + kNumStates)\r
+IsRepG1 equ (IsRepG0 + kNumStates)\r
+IsRepG2 equ (IsRepG1 + kNumStates)\r
+PosSlot equ (IsRepG2 + kNumStates)\r
+Literal equ (PosSlot + (kNumLenToPosStates SHL kNumPosSlotBits))\r
+NUM_BASE_PROBS equ (Literal + kStartOffset)\r
+\r
+if kAlign ne 0\r
+ .err <Stop_Compiling_Bad_LZMA_kAlign>\r
+endif\r
+\r
+if NUM_BASE_PROBS ne 1984\r
+ .err <Stop_Compiling_Bad_LZMA_PROBS>\r
+endif\r
+\r
+\r
+PTR_FIELD equ dq ?\r
+\r
+CLzmaDec_Asm struct\r
+ lc db ?\r
+ lp db ?\r
+ pb db ?\r
+ _pad_ db ?\r
+ dicSize dd ?\r
+\r
+ probs_Spec PTR_FIELD\r
+ probs_1664 PTR_FIELD\r
+ dic_Spec PTR_FIELD\r
+ dicBufSize PTR_FIELD\r
+ dicPos_Spec PTR_FIELD\r
+ buf_Spec PTR_FIELD\r
+\r
+ range_Spec dd ?\r
+ code_Spec dd ?\r
+ processedPos_Spec dd ?\r
+ checkDicSize dd ?\r
+ rep0 dd ?\r
+ rep1 dd ?\r
+ rep2 dd ?\r
+ rep3 dd ?\r
+ state_Spec dd ?\r
+ remainLen dd ?\r
+CLzmaDec_Asm ends\r
+\r
+\r
+CLzmaDec_Asm_Loc struct\r
+ Old_RSP PTR_FIELD\r
+ lzmaPtr PTR_FIELD\r
+ _pad0_ PTR_FIELD\r
+ _pad1_ PTR_FIELD\r
+ _pad2_ PTR_FIELD\r
+ dicBufSize PTR_FIELD\r
+ probs_Spec PTR_FIELD\r
+ dic_Spec PTR_FIELD\r
+ \r
+ limit PTR_FIELD\r
+ bufLimit PTR_FIELD\r
+ lc2 dd ?\r
+ lpMask dd ?\r
+ pbMask dd ?\r
+ checkDicSize dd ?\r
+\r
+ _pad_ dd ?\r
+ remainLen dd ?\r
+ dicPos_Spec PTR_FIELD\r
+ rep0 dd ?\r
+ rep1 dd ?\r
+ rep2 dd ?\r
+ rep3 dd ?\r
+CLzmaDec_Asm_Loc ends\r
+\r
+\r
+GLOB_2 equ [sym_R].CLzmaDec_Asm.\r
+GLOB equ [r1].CLzmaDec_Asm.\r
+LOC_0 equ [r0].CLzmaDec_Asm_Loc.\r
+LOC equ [RSP].CLzmaDec_Asm_Loc.\r
+\r
+\r
+COPY_VAR macro name\r
+ mov t0, GLOB_2 name\r
+ mov LOC_0 name, t0\r
+endm\r
+\r
+\r
+RESTORE_VAR macro name\r
+ mov t0, LOC name\r
+ mov GLOB name, t0\r
+endm\r
+\r
+\r
+\r
+IsMatchBranch_Pre macro reg\r
+ ; prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;\r
+ mov pbPos, LOC pbMask\r
+ and pbPos, processedPos\r
+ shl pbPos, (kLenNumLowBits + 1 + PSHIFT)\r
+ lea probs_state_R, [probs + 1 * state_R]\r
+endm\r
+\r
+\r
+IsMatchBranch macro reg\r
+ IsMatchBranch_Pre\r
+ IF_BIT_1 probs_state_R, pbPos_R, IsMatch, IsMatch_label\r
+endm\r
+ \r
+\r
+CheckLimits macro reg\r
+ cmp buf, LOC bufLimit\r
+ jae fin_OK\r
+ cmp dicPos, LOC limit\r
+ jae fin_OK\r
+endm\r
+\r
+\r
+\r
+; RSP is (16x + 8) bytes aligned in WIN64-x64\r
+; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8)\r
+\r
+PARAM_lzma equ REG_ABI_PARAM_0\r
+PARAM_limit equ REG_ABI_PARAM_1\r
+PARAM_bufLimit equ REG_ABI_PARAM_2\r
+\r
+; MY_ALIGN_64\r
+MY_PROC LzmaDec_DecodeReal_3, 3\r
+MY_PUSH_PRESERVED_ABI_REGS\r
+\r
+ lea r0, [RSP - (SIZEOF CLzmaDec_Asm_Loc)]\r
+ and r0, -128\r
+ mov r5, RSP\r
+ mov RSP, r0\r
+ mov LOC_0 Old_RSP, r5\r
+ mov LOC_0 lzmaPtr, PARAM_lzma\r
+ \r
+ mov LOC_0 remainLen, 0 ; remainLen must be ZERO\r
+\r
+ mov LOC_0 bufLimit, PARAM_bufLimit\r
+ mov sym_R, PARAM_lzma ; CLzmaDec_Asm pointer for GLOB_2\r
+ mov dic, GLOB_2 dic_Spec\r
+ add PARAM_limit, dic\r
+ mov LOC_0 limit, PARAM_limit\r
+\r
+ COPY_VAR(rep0)\r
+ COPY_VAR(rep1)\r
+ COPY_VAR(rep2)\r
+ COPY_VAR(rep3)\r
+ \r
+ mov dicPos, GLOB_2 dicPos_Spec\r
+ add dicPos, dic\r
+ mov LOC_0 dicPos_Spec, dicPos\r
+ mov LOC_0 dic_Spec, dic\r
+ \r
+ mov x1_L, GLOB_2 pb\r
+ mov t0, 1\r
+ shl t0, x1_L\r
+ dec t0\r
+ mov LOC_0 pbMask, t0\r
+\r
+ ; unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1;\r
+ ; unsigned lc = p->prop.lc;\r
+ ; unsigned lpMask = ((unsigned)0x100 << p->prop.lp) - ((unsigned)0x100 >> lc);\r
+\r
+ mov x1_L, GLOB_2 lc\r
+ mov x2, 100h\r
+ mov t0, x2\r
+ shr x2, x1_L\r
+ ; inc x1\r
+ add x1_L, PSHIFT\r
+ mov LOC_0 lc2, x1\r
+ mov x1_L, GLOB_2 lp\r
+ shl t0, x1_L\r
+ sub t0, x2\r
+ mov LOC_0 lpMask, t0\r
+ mov lpMask_reg, t0\r
+ \r
+ ; mov probs, GLOB_2 probs_Spec\r
+ ; add probs, kStartOffset SHL PSHIFT\r
+ mov probs, GLOB_2 probs_1664\r
+ mov LOC_0 probs_Spec, probs\r
+\r
+ mov t0_R, GLOB_2 dicBufSize\r
+ mov LOC_0 dicBufSize, t0_R\r
+ \r
+ mov x1, GLOB_2 checkDicSize\r
+ mov LOC_0 checkDicSize, x1\r
+\r
+ mov processedPos, GLOB_2 processedPos_Spec\r
+\r
+ mov state, GLOB_2 state_Spec\r
+ shl state, PSHIFT\r
+\r
+ mov buf, GLOB_2 buf_Spec\r
+ mov range, GLOB_2 range_Spec\r
+ mov cod, GLOB_2 code_Spec\r
+ mov kBitModelTotal_reg, kBitModelTotal\r
+ xor sym, sym\r
+\r
+ ; if (processedPos != 0 || checkDicSize != 0)\r
+ or x1, processedPos\r
+ jz @f\r
+ \r
+ add t0_R, dic\r
+ cmp dicPos, dic\r
+ cmovnz t0_R, dicPos\r
+ movzx sym, byte ptr[t0_R - 1]\r
+\r
+@@:\r
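+ ; re-enter the decode loop at the point implied by the saved state:\r
+ ; state < 4: after a plain literal; state < kNumLitStates: after a matched literal; otherwise: after a match / rep\r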
+ IsMatchBranch_Pre\r
+ cmp state, 4 * PMULT\r
+ jb lit_end\r
+ cmp state, kNumLitStates * PMULT\r
+ jb lit_matched_end\r
+ jmp lz_end\r
+ \r
+\r
+ \r
+\r
+; ---------- LITERAL ----------\r
+MY_ALIGN_64\r
+lit_start:\r
+ xor state, state\r
+lit_start_2:\r
+ LIT_PROBS lpMask_reg\r
+\r
+ ifdef _LZMA_SIZE_OPT\r
+\r
+ PLOAD x1, probs + 1 * PMULT\r
+ mov sym, 1\r
+MY_ALIGN_16\r
+lit_loop:\r
+ BIT_1 x1, x2\r
+ mov x1, x2\r
+ cmp sym, 127\r
+ jbe lit_loop\r
+ \r
+ else\r
+ \r
+ BIT_0 x1, x2\r
+ BIT_1 x2, x1\r
+ BIT_1 x1, x2\r
+ BIT_1 x2, x1\r
+ BIT_1 x1, x2\r
+ BIT_1 x2, x1\r
+ BIT_1 x1, x2\r
+ \r
+ endif\r
+\r
+ BIT_2 x2, 256 - 1\r
+ \r
+ ; mov dic, LOC dic_Spec\r
+ mov probs, LOC probs_Spec\r
+ IsMatchBranch_Pre\r
+ mov byte ptr[dicPos], sym_L\r
+ inc dicPos\r
+ \r
+ CheckLimits\r
+lit_end:\r
+ IF_BIT_0_NOUP probs_state_R, pbPos_R, IsMatch, lit_start\r
+\r
+ ; jmp IsMatch_label\r
+ \r
+; ---------- MATCHES ----------\r
+; MY_ALIGN_32\r
+IsMatch_label:\r
+ UPDATE_1 probs_state_R, pbPos_R, IsMatch\r
+ IF_BIT_1 probs_state_R, 0, IsRep, IsRep_label\r
+\r
+ add probs, LenCoder * PMULT\r
+ add state, kNumStates * PMULT\r
+\r
+; ---------- LEN DECODE ----------\r
+len_decode:\r
+ mov len_temp, 8 - 1 - kMatchMinLen\r
+ IF_BIT_0_NOUP probs, 0, 0, len_mid_0\r
+ UPDATE_1 probs, 0, 0\r
+ add probs, (1 SHL (kLenNumLowBits + PSHIFT))\r
+ mov len_temp, -1 - kMatchMinLen\r
+ IF_BIT_0_NOUP probs, 0, 0, len_mid_0\r
+ UPDATE_1 probs, 0, 0\r
+ add probs, LenHigh * PMULT - (1 SHL (kLenNumLowBits + PSHIFT))\r
+ mov sym, 1\r
+ PLOAD x1, probs + 1 * PMULT\r
+\r
+MY_ALIGN_32\r
+len8_loop:\r
+ BIT_1 x1, x2\r
+ mov x1, x2\r
+ cmp sym, 64\r
+ jb len8_loop\r
+ \r
+ mov len_temp, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen\r
+ jmp short len_mid_2 ; we use "short" here because MASM doesn't optimize this jump as other assemblers do\r
+ \r
+MY_ALIGN_32\r
+len_mid_0:\r
+ UPDATE_0 probs, 0, 0\r
+ add probs, pbPos_R\r
+ BIT_0 x2, x1\r
+len_mid_2:\r
+ BIT_1 x1, x2\r
+ BIT_2 x2, len_temp\r
+ mov probs, LOC probs_Spec\r
+ cmp state, kNumStates * PMULT\r
+ jb copy_match\r
+ \r
+\r
+; ---------- DECODE DISTANCE ----------\r
+ ; probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);\r
+\r
+ mov t0, 3 + kMatchMinLen\r
+ cmp sym, 3 + kMatchMinLen\r
+ cmovb t0, sym\r
+ add probs, PosSlot * PMULT - (kMatchMinLen SHL (kNumPosSlotBits + PSHIFT))\r
+ shl t0, (kNumPosSlotBits + PSHIFT)\r
+ add probs, t0_R\r
+ \r
+ ; sym = Len\r
+ ; mov LOC remainLen, sym\r
+ mov len_temp, sym\r
+\r
+ ifdef _LZMA_SIZE_OPT\r
+\r
+ PLOAD x1, probs + 1 * PMULT\r
+ mov sym, 1\r
+MY_ALIGN_16\r
+slot_loop:\r
+ BIT_1 x1, x2\r
+ mov x1, x2\r
+ cmp sym, 32\r
+ jb slot_loop\r
+ \r
+ else\r
+ \r
+ BIT_0 x1, x2\r
+ BIT_1 x2, x1\r
+ BIT_1 x1, x2\r
+ BIT_1 x2, x1\r
+ BIT_1 x1, x2\r
+ \r
+ endif\r
+ \r
+ mov x1, sym\r
+ BIT_2 x2, 64-1\r
+\r
+ and sym, 3\r
+ mov probs, LOC probs_Spec\r
+ cmp x1, 32 + kEndPosModelIndex / 2\r
+ jb short_dist\r
+\r
+ ; unsigned numDirectBits = (unsigned)(((distance >> 1) - 1));\r
+ sub x1, (32 + 1 + kNumAlignBits)\r
+ ; distance = (2 | (distance & 1));\r
+ or sym, 2\r
+ PLOAD x2, probs + 1 * PMULT\r
+ shl sym, kNumAlignBits + 1\r
+ lea sym2_R, [probs + 2 * PMULT]\r
+ \r
+ jmp direct_norm\r
+ ; lea t1, [sym_R + (1 SHL kNumAlignBits)]\r
+ ; cmp range, kTopValue\r
+ ; jb direct_norm\r
+ \r
+; ---------- DIRECT DISTANCE ----------\r
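+; One direct distance bit per iteration: range >>= 1; if (code >= range) { code -= range; bit = 1; }\r
+; done branchlessly via cmovs / cmovns on the sign of (code - range). The low kNumAlignBits of the\r
+; distance are decoded afterwards from the kAlign reverse bit tree (direct_end below).\r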
+MY_ALIGN_32\r
+direct_loop:\r
+ shr range, 1\r
+ mov t0, cod\r
+ sub cod, range\r
+ cmovs cod, t0\r
+ cmovns sym, t1\r
+ \r
+ comment ~\r
+ sub cod, range\r
+ mov x2, cod\r
+ sar x2, 31\r
+ lea sym, dword ptr [r2 + sym_R * 2 + 1]\r
+ and x2, range\r
+ add cod, x2\r
+ ~\r
+ dec x1\r
+ je direct_end\r
+\r
+ add sym, sym\r
+direct_norm:\r
+ lea t1, [sym_R + (1 SHL kNumAlignBits)]\r
+ cmp range, kTopValue\r
+ jae near ptr direct_loop\r
+ ; the "near ptr" form of the jump above is used to keep the 32-byte alignment here\r
+ NORM_2\r
+ jmp direct_loop\r
+\r
+MY_ALIGN_32\r
+direct_end:\r
+ ; prob = probs + kAlign;\r
+ ; distance <<= kNumAlignBits;\r
+ REV_0 x2, x1\r
+ REV_1 x1, x2, 2\r
+ REV_1 x2, x1, 4\r
+ REV_2 x1, 8\r
+\r
+decode_dist_end:\r
+\r
+ ; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize))\r
+\r
+ mov t1, LOC rep0\r
+ mov x1, LOC rep1\r
+ mov x2, LOC rep2\r
+ \r
+ mov t0, LOC checkDicSize\r
+ test t0, t0\r
+ cmove t0, processedPos\r
+ cmp sym, t0\r
+ jae end_of_payload\r
+ ; jmp end_of_payload ; for debug\r
+ \r
+ ; rep3 = rep2;\r
+ ; rep2 = rep1;\r
+ ; rep1 = rep0;\r
+ ; rep0 = distance + 1;\r
+\r
+ inc sym\r
+ mov LOC rep0, sym\r
+ ; mov sym, LOC remainLen\r
+ mov sym, len_temp\r
+ mov LOC rep1, t1\r
+ mov LOC rep2, x1\r
+ mov LOC rep3, x2\r
+ \r
+ ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;\r
+ cmp state, (kNumStates + kNumLitStates) * PMULT\r
+ mov state, kNumLitStates * PMULT\r
+ mov t0, (kNumLitStates + 3) * PMULT\r
+ cmovae state, t0\r
+\r
+ \r
+; ---------- COPY MATCH ----------\r
+copy_match:\r
+\r
+ ; len += kMatchMinLen;\r
+ ; add sym, kMatchMinLen\r
+\r
+ ; if ((rem = limit - dicPos) == 0)\r
+ ; {\r
+ ; p->dicPos = dicPos;\r
+ ; return SZ_ERROR_DATA;\r
+ ; }\r
+ mov cnt_R, LOC limit\r
+ sub cnt_R, dicPos\r
+ jz fin_dicPos_LIMIT\r
+\r
+ ; curLen = ((rem < len) ? (unsigned)rem : len);\r
+ cmp cnt_R, sym_R\r
+ ; cmovae cnt_R, sym_R ; 64-bit\r
+ cmovae cnt, sym ; 32-bit\r
+\r
+ mov dic, LOC dic_Spec\r
+ mov x1, LOC rep0\r
+\r
+ mov t0_R, dicPos\r
+ add dicPos, cnt_R\r
+ ; processedPos += curLen;\r
+ add processedPos, cnt\r
+ ; len -= curLen;\r
+ sub sym, cnt\r
+ mov LOC remainLen, sym\r
+\r
+ sub t0_R, dic\r
+ \r
+ ; pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0);\r
+ sub t0_R, r1\r
+ jae @f\r
+\r
+ mov r1, LOC dicBufSize\r
+ add t0_R, r1\r
+ sub r1, t0_R\r
+ cmp cnt_R, r1\r
+ ja copy_match_cross\r
+@@:\r
+ ; if (curLen <= dicBufSize - pos)\r
+\r
+; ---------- COPY MATCH FAST ----------\r
+ ; Byte *dest = dic + dicPos;\r
+ ; mov r1, dic\r
+ ; ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos;\r
+ ; sub t0_R, dicPos\r
+ ; dicPos += curLen;\r
+\r
+ ; const Byte *lim = dest + curLen;\r
+ add t0_R, dic\r
+ movzx sym, byte ptr[t0_R]\r
+ add t0_R, cnt_R\r
+ neg cnt_R\r
+ ; lea r1, [dicPos - 1]\r
+copy_common:\r
+ dec dicPos\r
+ ; cmp LOC rep0, 1\r
+ ; je rep0Label\r
+\r
+ ; t0_R - src_lim\r
+ ; r1 - dest_lim - 1\r
+ ; cnt_R - (-cnt)\r
+\r
+ IsMatchBranch_Pre\r
+ inc cnt_R\r
+ jz copy_end\r
+MY_ALIGN_16\r
+@@:\r
+ mov byte ptr[cnt_R * 1 + dicPos], sym_L\r
+ movzx sym, byte ptr[cnt_R * 1 + t0_R]\r
+ inc cnt_R\r
+ jnz @b\r
+\r
+copy_end:\r
+lz_end_match:\r
+ mov byte ptr[dicPos], sym_L\r
+ inc dicPos\r
+ \r
+ ; IsMatchBranch_Pre\r
+ CheckLimits\r
+lz_end:\r
+ IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label\r
+\r
+\r
+\r
+; ---------- LITERAL MATCHED ----------\r
+ \r
+ LIT_PROBS LOC lpMask\r
+ \r
+ ; matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];\r
+ mov x1, LOC rep0\r
+ ; mov dic, LOC dic_Spec\r
+ mov LOC dicPos_Spec, dicPos\r
+ \r
+ ; state -= (state < 10) ? 3 : 6;\r
+ lea t0, [state_R - 6 * PMULT]\r
+ sub state, 3 * PMULT\r
+ cmp state, 7 * PMULT\r
+ cmovae state, t0\r
+ \r
+ sub dicPos, dic\r
+ sub dicPos, r1\r
+ jae @f\r
+ add dicPos, LOC dicBufSize\r
+@@:\r
+ comment ~\r
+ xor t0, t0\r
+ sub dicPos, r1\r
+ cmovb t0_R, LOC dicBufSize\r
+ ~\r
+ \r
+ movzx match, byte ptr[dic + dicPos * 1]\r
+\r
+ ifdef _LZMA_SIZE_OPT\r
+\r
+ mov offs, 256 * PMULT\r
+ shl match, (PSHIFT + 1)\r
+ mov bit, match\r
+ mov sym, 1\r
+MY_ALIGN_16\r
+litm_loop:\r
+ LITM\r
+ cmp sym, 256\r
+ jb litm_loop\r
+ sub sym, 256\r
+ \r
+ else\r
+ \r
+ LITM_0\r
+ LITM\r
+ LITM\r
+ LITM\r
+ LITM\r
+ LITM\r
+ LITM\r
+ LITM_2\r
+ \r
+ endif\r
+ \r
+ mov probs, LOC probs_Spec\r
+ IsMatchBranch_Pre\r
+ ; mov dic, LOC dic_Spec\r
+ mov dicPos, LOC dicPos_Spec\r
+ mov byte ptr[dicPos], sym_L\r
+ inc dicPos\r
+ \r
+ CheckLimits\r
+lit_matched_end:\r
+ IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label\r
+ ; IsMatchBranch\r
+ mov lpMask_reg, LOC lpMask\r
+ sub state, 3 * PMULT\r
+ jmp lit_start_2\r
+ \r
+\r
+\r
+; ---------- REP 0 LITERAL ----------\r
+MY_ALIGN_32\r
+IsRep0Short_label:\r
+ UPDATE_0 probs_state_R, pbPos_R, IsRep0Long\r
+\r
+ ; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];\r
+ mov dic, LOC dic_Spec\r
+ mov t0_R, dicPos\r
+ mov probBranch, LOC rep0\r
+ sub t0_R, dic\r
+ \r
+ sub probs, RepLenCoder * PMULT\r
+ \r
+ ; state = state < kNumLitStates ? 9 : 11;\r
+ or state, 1 * PMULT\r
+ \r
+ ; the caller doesn't allow (dicPos >= limit) case for REP_SHORT\r
+ ; so we don't need the following (dicPos == limit) check here:\r
+ ; cmp dicPos, LOC limit\r
+ ; jae fin_dicPos_LIMIT_REP_SHORT\r
+\r
+ inc processedPos\r
+\r
+ IsMatchBranch_Pre\r
+ \r
+; xor sym, sym\r
+; sub t0_R, probBranch_R\r
+; cmovb sym_R, LOC dicBufSize\r
+; add t0_R, sym_R\r
+ sub t0_R, probBranch_R\r
+ jae @f\r
+ add t0_R, LOC dicBufSize\r
+@@:\r
+ movzx sym, byte ptr[dic + t0_R * 1]\r
+ jmp lz_end_match\r
+ \r
+ \r
+MY_ALIGN_32\r
+IsRep_label:\r
+ UPDATE_1 probs_state_R, 0, IsRep\r
+\r
+ ; The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode.\r
+ ; So we don't check it here.\r
+ \r
+ ; mov t0, processedPos\r
+ ; or t0, LOC checkDicSize\r
+ ; jz fin_ERROR_2\r
+\r
+ ; state = state < kNumLitStates ? 8 : 11;\r
+ cmp state, kNumLitStates * PMULT\r
+ mov state, 8 * PMULT\r
+ mov probBranch, 11 * PMULT\r
+ cmovae state, probBranch\r
+\r
+ ; prob = probs + RepLenCoder;\r
+ add probs, RepLenCoder * PMULT\r
+ \r
+ IF_BIT_1 probs_state_R, 0, IsRepG0, IsRepG0_label\r
+ IF_BIT_0_NOUP probs_state_R, pbPos_R, IsRep0Long, IsRep0Short_label\r
+ UPDATE_1 probs_state_R, pbPos_R, IsRep0Long\r
+ jmp len_decode\r
+\r
+MY_ALIGN_32\r
+IsRepG0_label:\r
+ UPDATE_1 probs_state_R, 0, IsRepG0\r
+ mov dist2, LOC rep0\r
+ mov dist, LOC rep1\r
+ mov LOC rep1, dist2\r
+ \r
+ IF_BIT_1 probs_state_R, 0, IsRepG1, IsRepG1_label\r
+ mov LOC rep0, dist\r
+ jmp len_decode\r
+ \r
+; MY_ALIGN_32\r
+IsRepG1_label:\r
+ UPDATE_1 probs_state_R, 0, IsRepG1\r
+ mov dist2, LOC rep2\r
+ mov LOC rep2, dist\r
+ \r
+ IF_BIT_1 probs_state_R, 0, IsRepG2, IsRepG2_label\r
+ mov LOC rep0, dist2\r
+ jmp len_decode\r
+\r
+; MY_ALIGN_32\r
+IsRepG2_label:\r
+ UPDATE_1 probs_state_R, 0, IsRepG2\r
+ mov dist, LOC rep3\r
+ mov LOC rep3, dist2\r
+ mov LOC rep0, dist\r
+ jmp len_decode\r
+\r
+ \r
+\r
+; ---------- SPEC SHORT DISTANCE ----------\r
+\r
+MY_ALIGN_32\r
+short_dist:\r
+ sub x1, 32 + 1\r
+ jbe decode_dist_end\r
+ or sym, 2\r
+ shl sym, x1_L\r
+ lea sym_R, [probs + sym_R * PMULT + SpecPos * PMULT + 1 * PMULT]\r
+ mov sym2, PMULT ; step\r
+MY_ALIGN_32\r
+spec_loop:\r
+ REV_1_VAR x2\r
+ dec x1\r
+ jnz spec_loop\r
+\r
+ mov probs, LOC probs_Spec\r
+ sub sym, sym2\r
+ sub sym, SpecPos * PMULT\r
+ sub sym_R, probs\r
+ shr sym, PSHIFT\r
+ \r
+ jmp decode_dist_end\r
+\r
+\r
+; ---------- COPY MATCH CROSS ----------\r
+copy_match_cross:\r
+ ; t0_R - src pos\r
+ ; r1 - len to dicBufSize\r
+ ; cnt_R - total copy len\r
+\r
+ mov t1_R, t0_R ; srcPos\r
+ mov t0_R, dic\r
+ mov r1, LOC dicBufSize ;\r
+ neg cnt_R\r
+@@:\r
+ movzx sym, byte ptr[t1_R * 1 + t0_R]\r
+ inc t1_R\r
+ mov byte ptr[cnt_R * 1 + dicPos], sym_L\r
+ inc cnt_R\r
+ cmp t1_R, r1\r
+ jne @b\r
+ \r
+ movzx sym, byte ptr[t0_R]\r
+ sub t0_R, cnt_R\r
+ jmp copy_common\r
+\r
+\r
+\r
+\r
+; fin_dicPos_LIMIT_REP_SHORT:\r
+ ; mov sym, 1\r
+\r
+fin_dicPos_LIMIT:\r
+ mov LOC remainLen, sym\r
+ jmp fin_OK\r
+ ; For a stricter mode we could stop decoding with an error:\r
+ ; mov sym, 1\r
+ ; jmp fin\r
+\r
+\r
+fin_ERROR_MATCH_DIST:\r
+\r
+ ; rep3 = rep2;\r
+ ; rep2 = rep1;\r
+ ; rep1 = rep0;\r
+ ; rep0 = distance + 1;\r
+ \r
+ add len_temp, kMatchSpecLen_Error_Data\r
+ mov LOC remainLen, len_temp\r
+\r
+ mov LOC rep0, sym\r
+ mov LOC rep1, t1\r
+ mov LOC rep2, x1\r
+ mov LOC rep3, x2\r
+ \r
+ ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;\r
+ cmp state, (kNumStates + kNumLitStates) * PMULT\r
+ mov state, kNumLitStates * PMULT\r
+ mov t0, (kNumLitStates + 3) * PMULT\r
+ cmovae state, t0\r
+\r
+ ; jmp fin_OK\r
+ mov sym, 1\r
+ jmp fin\r
+\r
+end_of_payload:\r
+ inc sym\r
+ jnz fin_ERROR_MATCH_DIST\r
+\r
+ mov LOC remainLen, kMatchSpecLenStart\r
+ sub state, kNumStates * PMULT\r
+\r
+fin_OK:\r
+ xor sym, sym\r
+\r
+fin:\r
+ NORM\r
+\r
+ mov r1, LOC lzmaPtr\r
+\r
+ sub dicPos, LOC dic_Spec\r
+ mov GLOB dicPos_Spec, dicPos\r
+ mov GLOB buf_Spec, buf\r
+ mov GLOB range_Spec, range\r
+ mov GLOB code_Spec, cod\r
+ shr state, PSHIFT\r
+ mov GLOB state_Spec, state\r
+ mov GLOB processedPos_Spec, processedPos\r
+\r
+ RESTORE_VAR(remainLen)\r
+ RESTORE_VAR(rep0)\r
+ RESTORE_VAR(rep1)\r
+ RESTORE_VAR(rep2)\r
+ RESTORE_VAR(rep3)\r
+\r
+ mov x0, sym\r
+ \r
+ mov RSP, LOC Old_RSP\r
+\r
+MY_POP_PRESERVED_ABI_REGS\r
+MY_ENDP\r
+\r
+_TEXT$LZMADECOPT ENDS\r
+\r
+end\r