; LzmaDecOpt.asm -- ASM version of the LzmaDec_DecodeReal_3() function
; 2021-02-23: Igor Pavlov : Public domain
;
; 3 is the code compatibility version of the LzmaDec_DecodeReal_*()
; function, used for checking at link time.
; This code is tightly coupled with LzmaDec_TryDummy()
; and with other functions in the LzmaDec.c file.
; The CLzmaDec structure, the (probs) array layout, and the input and output
; of LzmaDec_DecodeReal_*() must be identical in both versions (C / ASM).

ifndef x64
; x64=1
; .err <x64_IS_REQUIRED>
endif

include 7zAsm.asm

MY_ASM_START

_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE'

MY_ALIGN macro num:req
        align   num
endm

MY_ALIGN_16 macro
        MY_ALIGN 16
endm

MY_ALIGN_32 macro
        MY_ALIGN 32
endm

MY_ALIGN_64 macro
        MY_ALIGN 64
endm


; _LZMA_SIZE_OPT equ 1

; _LZMA_PROB32 equ 1

ifdef _LZMA_PROB32
        PSHIFT  equ 2
        PLOAD macro dest, mem
                mov     dest, dword ptr [mem]
        endm
        PSTORE macro src, mem
                mov     dword ptr [mem], src
        endm
else
        PSHIFT  equ 1
        PLOAD macro dest, mem
                movzx   dest, word ptr [mem]
        endm
        PSTORE macro src, mem
                mov     word ptr [mem], @CatStr(src, _W)
        endm
endif

PMULT           equ (1 SHL PSHIFT)
PMULT_HALF      equ (1 SHL (PSHIFT - 1))
PMULT_2         equ (1 SHL (PSHIFT + 1))

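; With the default 16-bit probs (PSHIFT = 1), PMULT is sizeof(CLzmaProb), so
; multiplying a prob index by PMULT turns it into a byte offset; PMULT_HALF
; and PMULT_2 are the half and double scales used by the macros below.
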
kMatchSpecLen_Error_Data equ (1 SHL 9)

; x0   range
; x1   pbPos / (prob) TREE
; x2   probBranch / prm (MATCHED) / pbPos / cnt
; x3   sym
;====== r4 ===  RSP
; x5   cod
; x6   t1 NORM_CALC / probs_state / dist
; x7   t0 NORM_CALC / prob2 IF_BIT_1
; x8   state
; x9   match (MATCHED) / sym2 / dist2 / lpMask_reg
; x10  kBitModelTotal_reg
; r11  probs
; x12  offs (MATCHED) / dic / len_temp
; x13  processedPos
; x14  bit (MATCHED) / dicPos
; r15  buf


cod equ x5
cod_L equ x5_L
range equ x0
state equ x8
state_R equ r8
buf equ r15
processedPos equ x13
kBitModelTotal_reg equ x10

probBranch equ x2
probBranch_R equ r2
probBranch_W equ x2_W

pbPos equ x1
pbPos_R equ r1

cnt equ x2
cnt_R equ r2

lpMask_reg equ x9
dicPos equ r14

sym equ x3
sym_R equ r3
sym_L equ x3_L

probs equ r11
dic equ r12

t0 equ x7
t0_W equ x7_W
t0_R equ r7

prob2 equ t0
prob2_W equ t0_W

t1 equ x6
t1_R equ r6

probs_state equ t1
probs_state_R equ t1_R

prm equ r2
match equ x9
match_R equ r9
offs equ x12
offs_R equ r12
bit equ x14
bit_R equ r14

sym2 equ x9
sym2_R equ r9

len_temp equ x12

dist equ sym
dist2 equ x9



kNumBitModelTotalBits equ 11
kBitModelTotal equ (1 SHL kNumBitModelTotalBits)
kNumMoveBits equ 5
kBitModelOffset equ ((1 SHL kNumMoveBits) - 1)
kTopValue equ (1 SHL 24)

NORM_2 macro
        ; movzx t0, BYTE PTR [buf]
        shl     cod, 8
        mov     cod_L, BYTE PTR [buf]
        shl     range, 8
        ; or    cod, t0
        inc     buf
endm


NORM macro
        cmp     range, kTopValue
        jae     SHORT @F
        NORM_2
@@:
endm
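
; Rough C equivalent of NORM (an illustrative sketch, not part of the build):
;   if (range < kTopValue) { range <<= 8; code = (code << 8) | *buf++; }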


; ---------- Branch MACROS ----------

UPDATE_0 macro probsArray:req, probOffset:req, probDisp:req
        mov     prob2, kBitModelTotal_reg
        sub     prob2, probBranch
        shr     prob2, kNumMoveBits
        add     probBranch, prob2
        PSTORE  probBranch, probOffset * 1 + probsArray + probDisp * PMULT
endm


UPDATE_1 macro probsArray:req, probOffset:req, probDisp:req
        sub     prob2, range
        sub     cod, range
        mov     range, prob2
        mov     prob2, probBranch
        shr     probBranch, kNumMoveBits
        sub     prob2, probBranch
        PSTORE  prob2, probOffset * 1 + probsArray + probDisp * PMULT
endm


CMP_COD macro probsArray:req, probOffset:req, probDisp:req
        PLOAD   probBranch, probOffset * 1 + probsArray + probDisp * PMULT
        NORM
        mov     prob2, range
        shr     range, kNumBitModelTotalBits
        imul    range, probBranch
        cmp     cod, range
endm


IF_BIT_1_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
        CMP_COD probsArray, probOffset, probDisp
        jae     toLabel
endm


IF_BIT_1 macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
        IF_BIT_1_NOUP probsArray, probOffset, probDisp, toLabel
        UPDATE_0 probsArray, probOffset, probDisp
endm


IF_BIT_0_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
        CMP_COD probsArray, probOffset, probDisp
        jb      toLabel
endm
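
; Together these macros implement the standard binary range-decoder step.
; A rough C sketch (illustrative only):
;   bound = (range >> kNumBitModelTotalBits) * prob;
;   if (code < bound) { range = bound;                  // bit 0 (UPDATE_0)
;                       prob += (kBitModelTotal - prob) >> kNumMoveBits; }
;   else { code -= bound; range -= bound;               // bit 1 (UPDATE_1)
;          prob -= prob >> kNumMoveBits; }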


; ---------- CMOV MACROS ----------

NORM_CALC macro prob:req
        NORM
        mov     t0, range
        shr     range, kNumBitModelTotalBits
        imul    range, prob
        sub     t0, range
        mov     t1, cod
        sub     cod, range
endm
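
; After NORM_CALC: range = bound, t0 = old_range - bound, t1 = old_code and
; code = old_code - bound; the carry flag from that last subtraction
; (CF = 1 iff code < bound) then selects the bit-0 / bit-1 results via the
; CMOVB / CMOVAE instructions in the callers.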


PUP macro prob:req, probPtr:req
        sub     t0, prob
        ; t0 - prob can be negative here, so only SAR (arithmetic shift)
        ; works for both the 16-bit and 32-bit prob modes
        sar     t0, kNumMoveBits
        add     t0, prob
        PSTORE  t0, probPtr
endm
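
; With t0 preloaded to kBitModelOffset (decoded bit 1) or kBitModelTotal
; (decoded bit 0), PUP is equivalent to the usual updates:
;   bit 0: prob += (kBitModelTotal - prob) >> kNumMoveBits;
;   bit 1: prob -= prob >> kNumMoveBits;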


PUP_SUB macro prob:req, probPtr:req, symSub:req
        sbb     sym, symSub
        PUP     prob, probPtr
endm


PUP_COD macro prob:req, probPtr:req, symSub:req
        mov     t0, kBitModelOffset
        cmovb   cod, t1
        mov     t1, sym
        cmovb   t0, kBitModelTotal_reg
        PUP_SUB prob, probPtr, symSub
endm


BIT_0 macro prob:req, probNext:req
        PLOAD   prob, probs + 1 * PMULT
        PLOAD   probNext, probs + 1 * PMULT_2

        NORM_CALC prob

        cmovae  range, t0
        PLOAD   t0, probs + 1 * PMULT_2 + PMULT
        cmovae  probNext, t0
        mov     t0, kBitModelOffset
        cmovb   cod, t1
        cmovb   t0, kBitModelTotal_reg
        mov     sym, 2
        PUP_SUB prob, probs + 1 * PMULT, 0 - 1
endm


BIT_1 macro prob:req, probNext:req
        PLOAD   probNext, probs + sym_R * PMULT_2
        add     sym, sym

        NORM_CALC prob

        cmovae  range, t0
        PLOAD   t0, probs + sym_R * PMULT + PMULT
        cmovae  probNext, t0
        PUP_COD prob, probs + t1_R * PMULT_HALF, 0 - 1
endm


BIT_2 macro prob:req, symSub:req
        add     sym, sym

        NORM_CALC prob

        cmovae  range, t0
        PUP_COD prob, probs + t1_R * PMULT_HALF, symSub
endm
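
; BIT_0 / BIT_1 / BIT_2 each decode one level of a bit tree; the unrolled
; sequences used later (lit_start, len8_loop, slot_loop) correspond roughly
; to this C sketch, where rc_decode_bit() is a hypothetical helper that
; decodes one range-coded bit:
;   sym = 1;
;   do { sym = (sym << 1) | rc_decode_bit(probs + sym); } while (sym < 0x100);
;   sym -= 0x100;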


; ---------- MATCHED LITERAL ----------

LITM_0 macro
        mov     offs, 256 * PMULT
        shl     match, (PSHIFT + 1)
        mov     bit, offs
        and     bit, match
        PLOAD   x1, probs + 256 * PMULT + bit_R * 1 + 1 * PMULT
        lea     prm, [probs + 256 * PMULT + bit_R * 1 + 1 * PMULT]
        ; lea   prm, [probs + 256 * PMULT + 1 * PMULT]
        ; add   prm, bit_R
        xor     offs, bit
        add     match, match

        NORM_CALC x1

        cmovae  offs, bit
        mov     bit, match
        cmovae  range, t0
        mov     t0, kBitModelOffset
        cmovb   cod, t1
        cmovb   t0, kBitModelTotal_reg
        mov     sym, 0
        PUP_SUB x1, prm, -2-1
endm


LITM macro
        and     bit, offs
        lea     prm, [probs + offs_R * 1]
        add     prm, bit_R
        PLOAD   x1, prm + sym_R * PMULT
        xor     offs, bit
        add     sym, sym
        add     match, match

        NORM_CALC x1

        cmovae  offs, bit
        mov     bit, match
        cmovae  range, t0
        PUP_COD x1, prm + t1_R * PMULT_HALF, - 1
endm


LITM_2 macro
        and     bit, offs
        lea     prm, [probs + offs_R * 1]
        add     prm, bit_R
        PLOAD   x1, prm + sym_R * PMULT
        add     sym, sym

        NORM_CALC x1

        cmovae  range, t0
        PUP_COD x1, prm + t1_R * PMULT_HALF, 256 - 1
endm
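
; Matched-literal decoding follows the match byte bit by bit; per step,
; ignoring the PMULT scaling (illustrative C, rc_decode_bit as above):
;   matchBit = offs & (matchByte <<= 1);          // offs starts at 0x100
;   bit = rc_decode_bit(probs + offs + matchBit + sym);
;   sym = (sym << 1) | bit;
;   offs = bit ? matchBit : (offs ^ matchBit);    // drops to 0 on divergence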


; ---------- REVERSE BITS ----------

REV_0 macro prob:req, probNext:req
        ; PLOAD prob, probs + 1 * PMULT
        ; lea   sym2_R, [probs + 2 * PMULT]
        ; PLOAD probNext, probs + 2 * PMULT
        PLOAD   probNext, sym2_R

        NORM_CALC prob

        cmovae  range, t0
        PLOAD   t0, probs + 3 * PMULT
        cmovae  probNext, t0
        cmovb   cod, t1
        mov     t0, kBitModelOffset
        cmovb   t0, kBitModelTotal_reg
        lea     t1_R, [probs + 3 * PMULT]
        cmovae  sym2_R, t1_R
        PUP     prob, probs + 1 * PMULT
endm


REV_1 macro prob:req, probNext:req, step:req
        add     sym2_R, step * PMULT
        PLOAD   probNext, sym2_R

        NORM_CALC prob

        cmovae  range, t0
        PLOAD   t0, sym2_R + step * PMULT
        cmovae  probNext, t0
        cmovb   cod, t1
        mov     t0, kBitModelOffset
        cmovb   t0, kBitModelTotal_reg
        lea     t1_R, [sym2_R + step * PMULT]
        cmovae  sym2_R, t1_R
        PUP     prob, t1_R - step * PMULT_2
endm


REV_2 macro prob:req, step:req
        sub     sym2_R, probs
        shr     sym2, PSHIFT
        or      sym, sym2

        NORM_CALC prob

        cmovae  range, t0
        lea     t0, [sym - step]
        cmovb   sym, t0
        cmovb   cod, t1
        mov     t0, kBitModelOffset
        cmovb   t0, kBitModelTotal_reg
        PUP     prob, probs + sym2_R * PMULT
endm
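
; REV_0 / REV_1 / REV_2 decode the kNumAlignBits align bits of a distance as
; a reverse bit tree (LSB first); roughly (illustrative C, kAlign is 0 here):
;   m = 1;
;   for (i = 0; i < kNumAlignBits; i++)
;   { bit = rc_decode_bit(probs + kAlign + m); m = (m << 1) + bit; sym |= bit << i; }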


REV_1_VAR macro prob:req
        PLOAD   prob, sym_R
        mov     probs, sym_R
        add     sym_R, sym2_R

        NORM_CALC prob

        cmovae  range, t0
        lea     t0_R, [sym_R + 1 * sym2_R]
        cmovae  sym_R, t0_R
        mov     t0, kBitModelOffset
        cmovb   cod, t1
        ; mov   t1, kBitModelTotal
        ; cmovb t0, t1
        cmovb   t0, kBitModelTotal_reg
        add     sym2, sym2
        PUP     prob, probs
endm




LIT_PROBS macro lpMaskParam:req
        ; prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc);
        mov     t0, processedPos
        shl     t0, 8
        add     sym, t0
        and     sym, lpMaskParam
        add     probs_state_R, pbPos_R
        mov     x1, LOC lc2
        lea     sym, dword ptr[sym_R + 2 * sym_R]
        add     probs, Literal * PMULT
        shl     sym, x1_L
        add     probs, sym_R
        UPDATE_0 probs_state_R, 0, IsMatch
        inc     processedPos
endm
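
; Note: LOC lc2 holds (lc + PSHIFT), so the single "shl sym, x1_L" above both
; applies the "<< lc" from the C expression and scales the result to a byte
; offset into the probs array.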



kNumPosBitsMax equ 4
kNumPosStatesMax equ (1 SHL kNumPosBitsMax)

kLenNumLowBits equ 3
kLenNumLowSymbols equ (1 SHL kLenNumLowBits)
kLenNumHighBits equ 8
kLenNumHighSymbols equ (1 SHL kLenNumHighBits)
kNumLenProbs equ (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols)

LenLow equ 0
LenChoice equ LenLow
LenChoice2 equ (LenLow + kLenNumLowSymbols)
LenHigh equ (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax)

kNumStates equ 12
kNumStates2 equ 16
kNumLitStates equ 7

kStartPosModelIndex equ 4
kEndPosModelIndex equ 14
kNumFullDistances equ (1 SHL (kEndPosModelIndex SHR 1))

kNumPosSlotBits equ 6
kNumLenToPosStates equ 4

kNumAlignBits equ 4
kAlignTableSize equ (1 SHL kNumAlignBits)

kMatchMinLen equ 2
kMatchSpecLenStart equ (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols)

kStartOffset equ 1664
SpecPos equ (-kStartOffset)
IsRep0Long equ (SpecPos + kNumFullDistances)
RepLenCoder equ (IsRep0Long + (kNumStates2 SHL kNumPosBitsMax))
LenCoder equ (RepLenCoder + kNumLenProbs)
IsMatch equ (LenCoder + kNumLenProbs)
kAlign equ (IsMatch + (kNumStates2 SHL kNumPosBitsMax))
IsRep equ (kAlign + kAlignTableSize)
IsRepG0 equ (IsRep + kNumStates)
IsRepG1 equ (IsRepG0 + kNumStates)
IsRepG2 equ (IsRepG1 + kNumStates)
PosSlot equ (IsRepG2 + kNumStates)
Literal equ (PosSlot + (kNumLenToPosStates SHL kNumPosSlotBits))
NUM_BASE_PROBS equ (Literal + kStartOffset)
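
; At run time probs points at probs_1664 = probs_Spec + kStartOffset entries,
; so the table offsets above are signed displacements around that base
; (SpecPos is negative); the build-time checks below pin this exact layout.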

if kAlign ne 0
  .err <Stop_Compiling_Bad_LZMA_kAlign>
endif

if NUM_BASE_PROBS ne 1984
  .err <Stop_Compiling_Bad_LZMA_PROBS>
endif


PTR_FIELD equ dq ?

CLzmaDec_Asm struct
        lc      db ?
        lp      db ?
        pb      db ?
        _pad_   db ?
        dicSize dd ?

        probs_Spec      PTR_FIELD
        probs_1664      PTR_FIELD
        dic_Spec        PTR_FIELD
        dicBufSize      PTR_FIELD
        dicPos_Spec     PTR_FIELD
        buf_Spec        PTR_FIELD

        range_Spec      dd ?
        code_Spec       dd ?
        processedPos_Spec dd ?
        checkDicSize    dd ?
        rep0    dd ?
        rep1    dd ?
        rep2    dd ?
        rep3    dd ?
        state_Spec      dd ?
        remainLen       dd ?
CLzmaDec_Asm ends


CLzmaDec_Asm_Loc struct
        Old_RSP         PTR_FIELD
        lzmaPtr         PTR_FIELD
        _pad0_          PTR_FIELD
        _pad1_          PTR_FIELD
        _pad2_          PTR_FIELD
        dicBufSize      PTR_FIELD
        probs_Spec      PTR_FIELD
        dic_Spec        PTR_FIELD

        limit           PTR_FIELD
        bufLimit        PTR_FIELD
        lc2             dd ?
        lpMask          dd ?
        pbMask          dd ?
        checkDicSize    dd ?

        _pad_           dd ?
        remainLen       dd ?
        dicPos_Spec     PTR_FIELD
        rep0    dd ?
        rep1    dd ?
        rep2    dd ?
        rep3    dd ?
CLzmaDec_Asm_Loc ends


GLOB_2 equ [sym_R].CLzmaDec_Asm.
GLOB equ [r1].CLzmaDec_Asm.
LOC_0 equ [r0].CLzmaDec_Asm_Loc.
LOC equ [RSP].CLzmaDec_Asm_Loc.


COPY_VAR macro name
        mov     t0, GLOB_2 name
        mov     LOC_0 name, t0
endm


RESTORE_VAR macro name
        mov     t0, LOC name
        mov     GLOB name, t0
endm
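
; GLOB_2 (base sym_R) is used while loading the CLzmaDec fields at entry,
; GLOB (base r1) when storing them back at exit; LOC_0 (base r0) addresses
; the locals while the frame is being set up, LOC (base RSP) afterwards.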



IsMatchBranch_Pre macro reg
        ; prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;
        mov     pbPos, LOC pbMask
        and     pbPos, processedPos
        shl     pbPos, (kLenNumLowBits + 1 + PSHIFT)
        lea     probs_state_R, [probs + 1 * state_R]
endm


IsMatchBranch macro reg
        IsMatchBranch_Pre
        IF_BIT_1 probs_state_R, pbPos_R, IsMatch, IsMatch_label
endm


CheckLimits macro reg
        cmp     buf, LOC bufLimit
        jae     fin_OK
        cmp     dicPos, LOC limit
        jae     fin_OK
endm
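
; buf and dicPos are only compared against the conservative limits prepared
; by the caller in LzmaDec.c; reaching either limit leaves the decoder
; through fin_OK with a success status.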



; RSP is (16x + 8) bytes aligned in WIN64-x64
; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8)

PARAM_lzma      equ REG_ABI_PARAM_0
PARAM_limit     equ REG_ABI_PARAM_1
PARAM_bufLimit  equ REG_ABI_PARAM_2

; MY_ALIGN_64
MY_PROC LzmaDec_DecodeReal_3, 3
MY_PUSH_PRESERVED_ABI_REGS

        lea     r0, [RSP - (SIZEOF CLzmaDec_Asm_Loc)]
        and     r0, -128
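        ; r0 = local frame, aligned down to a 128-byte boundary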
        mov     r5, RSP
        mov     RSP, r0
        mov     LOC_0 Old_RSP, r5
        mov     LOC_0 lzmaPtr, PARAM_lzma

        mov     LOC_0 remainLen, 0  ; remainLen must be ZERO

        mov     LOC_0 bufLimit, PARAM_bufLimit
        mov     sym_R, PARAM_lzma   ; CLzmaDec_Asm pointer for GLOB_2
        mov     dic, GLOB_2 dic_Spec
        add     PARAM_limit, dic
        mov     LOC_0 limit, PARAM_limit

        COPY_VAR(rep0)
        COPY_VAR(rep1)
        COPY_VAR(rep2)
        COPY_VAR(rep3)

        mov     dicPos, GLOB_2 dicPos_Spec
        add     dicPos, dic
        mov     LOC_0 dicPos_Spec, dicPos
        mov     LOC_0 dic_Spec, dic

        mov     x1_L, GLOB_2 pb
        mov     t0, 1
        shl     t0, x1_L
        dec     t0
        mov     LOC_0 pbMask, t0

        ; unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1;
        ; unsigned lc = p->prop.lc;
        ; unsigned lpMask = ((unsigned)0x100 << p->prop.lp) - ((unsigned)0x100 >> lc);

        mov     x1_L, GLOB_2 lc
        mov     x2, 100h
        mov     t0, x2
        shr     x2, x1_L
        ; inc   x1
        add     x1_L, PSHIFT
        mov     LOC_0 lc2, x1
        mov     x1_L, GLOB_2 lp
        shl     t0, x1_L
        sub     t0, x2
        mov     LOC_0 lpMask, t0
        mov     lpMask_reg, t0

        ; mov   probs, GLOB_2 probs_Spec
        ; add   probs, kStartOffset SHL PSHIFT
        mov     probs, GLOB_2 probs_1664
        mov     LOC_0 probs_Spec, probs

        mov     t0_R, GLOB_2 dicBufSize
        mov     LOC_0 dicBufSize, t0_R

        mov     x1, GLOB_2 checkDicSize
        mov     LOC_0 checkDicSize, x1

        mov     processedPos, GLOB_2 processedPos_Spec

        mov     state, GLOB_2 state_Spec
        shl     state, PSHIFT

        mov     buf, GLOB_2 buf_Spec
        mov     range, GLOB_2 range_Spec
        mov     cod, GLOB_2 code_Spec
        mov     kBitModelTotal_reg, kBitModelTotal
        xor     sym, sym

        ; if (processedPos != 0 || checkDicSize != 0)
        or      x1, processedPos
        jz      @f

        add     t0_R, dic
        cmp     dicPos, dic
        cmovnz  t0_R, dicPos
        movzx   sym, byte ptr[t0_R - 1]

@@:
        IsMatchBranch_Pre
        cmp     state, 4 * PMULT
        jb      lit_end
        cmp     state, kNumLitStates * PMULT
        jb      lit_matched_end
        jmp     lz_end



; ---------- LITERAL ----------
MY_ALIGN_64
lit_start:
        xor     state, state
lit_start_2:
        LIT_PROBS lpMask_reg

    ifdef _LZMA_SIZE_OPT

        PLOAD   x1, probs + 1 * PMULT
        mov     sym, 1
MY_ALIGN_16
lit_loop:
        BIT_1   x1, x2
        mov     x1, x2
        cmp     sym, 127
        jbe     lit_loop

    else

        BIT_0   x1, x2
        BIT_1   x2, x1
        BIT_1   x1, x2
        BIT_1   x2, x1
        BIT_1   x1, x2
        BIT_1   x2, x1
        BIT_1   x1, x2

    endif

        BIT_2   x2, 256 - 1

        ; mov   dic, LOC dic_Spec
        mov     probs, LOC probs_Spec
        IsMatchBranch_Pre
        mov     byte ptr[dicPos], sym_L
        inc     dicPos

        CheckLimits
lit_end:
        IF_BIT_0_NOUP probs_state_R, pbPos_R, IsMatch, lit_start

        ; jmp   IsMatch_label

; ---------- MATCHES ----------
; MY_ALIGN_32
IsMatch_label:
        UPDATE_1 probs_state_R, pbPos_R, IsMatch
        IF_BIT_1 probs_state_R, 0, IsRep, IsRep_label

        add     probs, LenCoder * PMULT
        add     state, kNumStates * PMULT

; ---------- LEN DECODE ----------
len_decode:
        mov     len_temp, 8 - 1 - kMatchMinLen
        IF_BIT_0_NOUP probs, 0, 0, len_mid_0
        UPDATE_1 probs, 0, 0
        add     probs, (1 SHL (kLenNumLowBits + PSHIFT))
        mov     len_temp, -1 - kMatchMinLen
        IF_BIT_0_NOUP probs, 0, 0, len_mid_0
        UPDATE_1 probs, 0, 0
        add     probs, LenHigh * PMULT - (1 SHL (kLenNumLowBits + PSHIFT))
        mov     sym, 1
        PLOAD   x1, probs + 1 * PMULT

MY_ALIGN_32
len8_loop:
        BIT_1   x1, x2
        mov     x1, x2
        cmp     sym, 64
        jb      len8_loop

        mov     len_temp, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen
        ; we use "short" here because MASM doesn't optimize this jump
        ; the way other assemblers do
        jmp     short len_mid_2

MY_ALIGN_32
len_mid_0:
        UPDATE_0 probs, 0, 0
        add     probs, pbPos_R
        BIT_0   x2, x1
len_mid_2:
        BIT_1   x1, x2
        BIT_2   x2, len_temp
        mov     probs, LOC probs_Spec
        cmp     state, kNumStates * PMULT
        jb      copy_match


; ---------- DECODE DISTANCE ----------
        ; probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);

        mov     t0, 3 + kMatchMinLen
        cmp     sym, 3 + kMatchMinLen
        cmovb   t0, sym
        add     probs, PosSlot * PMULT - (kMatchMinLen SHL (kNumPosSlotBits + PSHIFT))
        shl     t0, (kNumPosSlotBits + PSHIFT)
        add     probs, t0_R

        ; sym = Len
        ; mov   LOC remainLen, sym
        mov     len_temp, sym

    ifdef _LZMA_SIZE_OPT

        PLOAD   x1, probs + 1 * PMULT
        mov     sym, 1
MY_ALIGN_16
slot_loop:
        BIT_1   x1, x2
        mov     x1, x2
        cmp     sym, 32
        jb      slot_loop

    else

        BIT_0   x1, x2
        BIT_1   x2, x1
        BIT_1   x1, x2
        BIT_1   x2, x1
        BIT_1   x1, x2

    endif

        mov     x1, sym
        BIT_2   x2, 64-1

        and     sym, 3
        mov     probs, LOC probs_Spec
        cmp     x1, 32 + kEndPosModelIndex / 2
        jb      short_dist

        ; unsigned numDirectBits = (unsigned)(((distance >> 1) - 1));
        sub     x1, (32 + 1 + kNumAlignBits)
        ; distance = (2 | (distance & 1));
        or      sym, 2
        PLOAD   x2, probs + 1 * PMULT
        shl     sym, kNumAlignBits + 1
        lea     sym2_R, [probs + 2 * PMULT]

        jmp     direct_norm
        ; lea   t1, [sym_R + (1 SHL kNumAlignBits)]
        ; cmp   range, kTopValue
        ; jb    direct_norm

; ---------- DIRECT DISTANCE ----------
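
; Each step halves range and compares code against it branch-free: after
; "sub cod, range", SF set means the direct bit is 0 (cod is restored via
; cmovs); otherwise the bit is 1 and sym takes t1 = sym + (1 SHL
; kNumAlignBits), i.e. the decoded bit lands just above the kNumAlignBits
; align bits that are filled in later.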
MY_ALIGN_32
direct_loop:
        shr     range, 1
        mov     t0, cod
        sub     cod, range
        cmovs   cod, t0
        cmovns  sym, t1

        comment ~
        sub     cod, range
        mov     x2, cod
        sar     x2, 31
        lea     sym, dword ptr [r2 + sym_R * 2 + 1]
        and     x2, range
        add     cod, x2
        ~
        dec     x1
        je      direct_end

        add     sym, sym
direct_norm:
        lea     t1, [sym_R + (1 SHL kNumAlignBits)]
        cmp     range, kTopValue
        jae     near ptr direct_loop
        ; the "near ptr" form of the jump above is what keeps direct_loop
        ; 32-byte aligned here
        NORM_2
        jmp     direct_loop

MY_ALIGN_32
direct_end:
        ; prob = + kAlign;
        ; distance <<= kNumAlignBits;
        REV_0   x2, x1
        REV_1   x1, x2, 2
        REV_1   x2, x1, 4
        REV_2   x1, 8

decode_dist_end:

        ; if (distance >= (checkDicSize == 0 ? processedPos : checkDicSize))

        mov     t1, LOC rep0
        mov     x1, LOC rep1
        mov     x2, LOC rep2

        mov     t0, LOC checkDicSize
        test    t0, t0
        cmove   t0, processedPos
        cmp     sym, t0
        jae     end_of_payload
        ; jmp   end_of_payload ; for debug

        ; rep3 = rep2;
        ; rep2 = rep1;
        ; rep1 = rep0;
        ; rep0 = distance + 1;

        inc     sym
        mov     LOC rep0, sym
        ; mov   sym, LOC remainLen
        mov     sym, len_temp
        mov     LOC rep1, t1
        mov     LOC rep2, x1
        mov     LOC rep3, x2

        ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
        cmp     state, (kNumStates + kNumLitStates) * PMULT
        mov     state, kNumLitStates * PMULT
        mov     t0, (kNumLitStates + 3) * PMULT
        cmovae  state, t0


; ---------- COPY MATCH ----------
copy_match:

        ; len += kMatchMinLen;
        ; add   sym, kMatchMinLen

        ; if ((rem = limit - dicPos) == 0)
        ; {
        ;   p->dicPos = dicPos;
        ;   return SZ_ERROR_DATA;
        ; }
        mov     cnt_R, LOC limit
        sub     cnt_R, dicPos
        jz      fin_dicPos_LIMIT

        ; curLen = ((rem < len) ? (unsigned)rem : len);
        cmp     cnt_R, sym_R
        ; cmovae cnt_R, sym_R ; 64-bit
        cmovae  cnt, sym      ; 32-bit

        mov     dic, LOC dic_Spec
        mov     x1, LOC rep0

        mov     t0_R, dicPos
        add     dicPos, cnt_R
        ; processedPos += curLen;
        add     processedPos, cnt
        ; len -= curLen;
        sub     sym, cnt
        mov     LOC remainLen, sym

        sub     t0_R, dic

        ; pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0);
        sub     t0_R, r1
        jae     @f

        mov     r1, LOC dicBufSize
        add     t0_R, r1
        sub     r1, t0_R
        cmp     cnt_R, r1
        ja      copy_match_cross
@@:
        ; if (curLen <= dicBufSize - pos)

; ---------- COPY MATCH FAST ----------
        ; Byte *dest = dic + dicPos;
        ; mov   r1, dic
        ; ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos;
        ; sub   t0_R, dicPos
        ; dicPos += curLen;

        ; const Byte *lim = dest + curLen;
        add     t0_R, dic
        movzx   sym, byte ptr[t0_R]
        add     t0_R, cnt_R
        neg     cnt_R
        ; lea   r1, [dicPos - 1]
copy_common:
        dec     dicPos
        ; cmp   LOC rep0, 1
        ; je    rep0Label

        ; t0_R  - src_lim
        ; r1    - dest_lim - 1
        ; cnt_R - (-cnt)

        IsMatchBranch_Pre
        inc     cnt_R
        jz      copy_end
MY_ALIGN_16
@@:
        mov     byte ptr[cnt_R * 1 + dicPos], sym_L
        movzx   sym, byte ptr[cnt_R * 1 + t0_R]
        inc     cnt_R
        jnz     @b

copy_end:
lz_end_match:
        mov     byte ptr[dicPos], sym_L
        inc     dicPos

        ; IsMatchBranch_Pre
        CheckLimits
lz_end:
        IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label



; ---------- LITERAL MATCHED ----------

        LIT_PROBS LOC lpMask

        ; matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
        mov     x1, LOC rep0
        ; mov   dic, LOC dic_Spec
        mov     LOC dicPos_Spec, dicPos

        ; state -= (state < 10) ? 3 : 6;
        lea     t0, [state_R - 6 * PMULT]
        sub     state, 3 * PMULT
        cmp     state, 7 * PMULT
        cmovae  state, t0

        sub     dicPos, dic
        sub     dicPos, r1
        jae     @f
        add     dicPos, LOC dicBufSize
@@:
        comment ~
        xor     t0, t0
        sub     dicPos, r1
        cmovb   t0_R, LOC dicBufSize
        ~

        movzx   match, byte ptr[dic + dicPos * 1]

    ifdef _LZMA_SIZE_OPT

        mov     offs, 256 * PMULT
        shl     match, (PSHIFT + 1)
        mov     bit, match
        mov     sym, 1
MY_ALIGN_16
litm_loop:
        LITM
        cmp     sym, 256
        jb      litm_loop
        sub     sym, 256

    else

        LITM_0
        LITM
        LITM
        LITM
        LITM
        LITM
        LITM
        LITM_2

    endif

        mov     probs, LOC probs_Spec
        IsMatchBranch_Pre
        ; mov   dic, LOC dic_Spec
        mov     dicPos, LOC dicPos_Spec
        mov     byte ptr[dicPos], sym_L
        inc     dicPos

        CheckLimits
lit_matched_end:
        IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label
        ; IsMatchBranch
        mov     lpMask_reg, LOC lpMask
        sub     state, 3 * PMULT
        jmp     lit_start_2



; ---------- REP 0 LITERAL ----------
MY_ALIGN_32
IsRep0Short_label:
        UPDATE_0 probs_state_R, pbPos_R, IsRep0Long

        ; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
        mov     dic, LOC dic_Spec
        mov     t0_R, dicPos
        mov     probBranch, LOC rep0
        sub     t0_R, dic

        sub     probs, RepLenCoder * PMULT

        ; state = state < kNumLitStates ? 9 : 11;
        or      state, 1 * PMULT

        ; the caller doesn't allow the (dicPos >= limit) case for REP_SHORT,
        ; so we don't need the following (dicPos == limit) check here:
        ;       cmp     dicPos, LOC limit
        ;       jae     fin_dicPos_LIMIT_REP_SHORT

        inc     processedPos

        IsMatchBranch_Pre

;       xor     sym, sym
;       sub     t0_R, probBranch_R
;       cmovb   sym_R, LOC dicBufSize
;       add     t0_R, sym_R
        sub     t0_R, probBranch_R
        jae     @f
        add     t0_R, LOC dicBufSize
@@:
        movzx   sym, byte ptr[dic + t0_R * 1]
        jmp     lz_end_match


MY_ALIGN_32
IsRep_label:
        UPDATE_1 probs_state_R, 0, IsRep

        ; The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode.
        ; So we don't check it here.

        ; mov   t0, processedPos
        ; or    t0, LOC checkDicSize
        ; jz    fin_ERROR_2

        ; state = state < kNumLitStates ? 8 : 11;
        cmp     state, kNumLitStates * PMULT
        mov     state, 8 * PMULT
        mov     probBranch, 11 * PMULT
        cmovae  state, probBranch

        ; prob = probs + RepLenCoder;
        add     probs, RepLenCoder * PMULT

        IF_BIT_1 probs_state_R, 0, IsRepG0, IsRepG0_label
        IF_BIT_0_NOUP probs_state_R, pbPos_R, IsRep0Long, IsRep0Short_label
        UPDATE_1 probs_state_R, pbPos_R, IsRep0Long
        jmp     len_decode

MY_ALIGN_32
IsRepG0_label:
        UPDATE_1 probs_state_R, 0, IsRepG0
        mov     dist2, LOC rep0
        mov     dist, LOC rep1
        mov     LOC rep1, dist2

        IF_BIT_1 probs_state_R, 0, IsRepG1, IsRepG1_label
        mov     LOC rep0, dist
        jmp     len_decode

; MY_ALIGN_32
IsRepG1_label:
        UPDATE_1 probs_state_R, 0, IsRepG1
        mov     dist2, LOC rep2
        mov     LOC rep2, dist

        IF_BIT_1 probs_state_R, 0, IsRepG2, IsRepG2_label
        mov     LOC rep0, dist2
        jmp     len_decode

; MY_ALIGN_32
IsRepG2_label:
        UPDATE_1 probs_state_R, 0, IsRepG2
        mov     dist, LOC rep3
        mov     LOC rep3, dist2
        mov     LOC rep0, dist
        jmp     len_decode



; ---------- SPEC SHORT DISTANCE ----------

MY_ALIGN_32
short_dist:
        sub     x1, 32 + 1
        jbe     decode_dist_end
        or      sym, 2
        shl     sym, x1_L
        lea     sym_R, [probs + sym_R * PMULT + SpecPos * PMULT + 1 * PMULT]
        mov     sym2, PMULT ; step
MY_ALIGN_32
spec_loop:
        REV_1_VAR x2
        dec     x1
        jnz     spec_loop

        mov     probs, LOC probs_Spec
        sub     sym, sym2
        sub     sym, SpecPos * PMULT
        sub     sym_R, probs
        shr     sym, PSHIFT

        jmp     decode_dist_end


; ---------- COPY MATCH CROSS ----------
copy_match_cross:
        ; t0_R  - src pos
        ; r1    - len to dicBufSize
        ; cnt_R - total copy len
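
        ; copy forward until the source reaches dicBufSize, then wrap to the
        ; start of dic and finish the remaining bytes via copy_common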

        mov     t1_R, t0_R ; srcPos
        mov     t0_R, dic
        mov     r1, LOC dicBufSize
        neg     cnt_R
@@:
        movzx   sym, byte ptr[t1_R * 1 + t0_R]
        inc     t1_R
        mov     byte ptr[cnt_R * 1 + dicPos], sym_L
        inc     cnt_R
        cmp     t1_R, r1
        jne     @b

        movzx   sym, byte ptr[t0_R]
        sub     t0_R, cnt_R
        jmp     copy_common




; fin_dicPos_LIMIT_REP_SHORT:
        ; mov   sym, 1

fin_dicPos_LIMIT:
        mov     LOC remainLen, sym
        jmp     fin_OK
        ; in a stricter mode we could stop decoding with an error instead:
        ; mov   sym, 1
        ; jmp   fin


fin_ERROR_MATCH_DIST:

        ; rep3 = rep2;
        ; rep2 = rep1;
        ; rep1 = rep0;
        ; rep0 = distance + 1;

        add     len_temp, kMatchSpecLen_Error_Data
        mov     LOC remainLen, len_temp

        mov     LOC rep0, sym
        mov     LOC rep1, t1
        mov     LOC rep2, x1
        mov     LOC rep3, x2

        ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
        cmp     state, (kNumStates + kNumLitStates) * PMULT
        mov     state, kNumLitStates * PMULT
        mov     t0, (kNumLitStates + 3) * PMULT
        cmovae  state, t0

        ; jmp   fin_OK
        mov     sym, 1
        jmp     fin

end_of_payload:
        inc     sym
        jnz     fin_ERROR_MATCH_DIST
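        ; sym was 0xFFFFFFFF (the increment wrapped it to zero), which is the
        ; end-of-stream marker; any other out-of-window distance is a data error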

        mov     LOC remainLen, kMatchSpecLenStart
        sub     state, kNumStates * PMULT

fin_OK:
        xor     sym, sym

fin:
        NORM

        mov     r1, LOC lzmaPtr

        sub     dicPos, LOC dic_Spec
        mov     GLOB dicPos_Spec, dicPos
        mov     GLOB buf_Spec, buf
        mov     GLOB range_Spec, range
        mov     GLOB code_Spec, cod
        shr     state, PSHIFT
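        ; (state was kept premultiplied by PMULT inside the decoder loop; the
        ; shift above restores the plain index before it is stored back)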
        mov     GLOB state_Spec, state
        mov     GLOB processedPos_Spec, processedPos

        RESTORE_VAR(remainLen)
        RESTORE_VAR(rep0)
        RESTORE_VAR(rep1)
        RESTORE_VAR(rep2)
        RESTORE_VAR(rep3)

        mov     x0, sym

        mov     RSP, LOC Old_RSP

MY_POP_PRESERVED_ABI_REGS
MY_ENDP

_TEXT$LZMADECOPT ENDS

end