X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=deps%2Flibchdr%2Fdeps%2Flzma-24.05%2FAsm%2Fx86%2FLzmaDecOpt.asm;fp=deps%2Flibchdr%2Fdeps%2Flzma-22.01%2Fsrc%2FAsm%2Fx86%2FLzmaDecOpt.asm;h=f2818e77b30a703b1a1cf6d5510eeb65aded4bd7;hb=f535537faaef474f7d31e50949eea1f15a58ee09;hp=7e08acc6397435183750dcb39849c4c6037e8b64;hpb=db02598e737b8d50cd347fe2ef13cb85ade051dd;p=pcsx_rearmed.git diff --git a/deps/libchdr/deps/lzma-22.01/src/Asm/x86/LzmaDecOpt.asm b/deps/libchdr/deps/lzma-24.05/Asm/x86/LzmaDecOpt.asm similarity index 96% rename from deps/libchdr/deps/lzma-22.01/src/Asm/x86/LzmaDecOpt.asm rename to deps/libchdr/deps/lzma-24.05/Asm/x86/LzmaDecOpt.asm index 7e08acc6..f2818e77 100644 --- a/deps/libchdr/deps/lzma-22.01/src/Asm/x86/LzmaDecOpt.asm +++ b/deps/libchdr/deps/lzma-24.05/Asm/x86/LzmaDecOpt.asm @@ -1,1303 +1,1303 @@ -; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function -; 2021-02-23: Igor Pavlov : Public domain -; -; 3 - is the code compatibility version of LzmaDec_DecodeReal_*() -; function for check at link time. -; That code is tightly coupled with LzmaDec_TryDummy() -; and with another functions in LzmaDec.c file. -; CLzmaDec structure, (probs) array layout, input and output of -; LzmaDec_DecodeReal_*() must be equal in both versions (C / ASM). 
- -ifndef x64 -; x64=1 -; .err -endif - -include 7zAsm.asm - -MY_ASM_START - -_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE' - -MY_ALIGN macro num:req - align num -endm - -MY_ALIGN_16 macro - MY_ALIGN 16 -endm - -MY_ALIGN_32 macro - MY_ALIGN 32 -endm - -MY_ALIGN_64 macro - MY_ALIGN 64 -endm - - -; _LZMA_SIZE_OPT equ 1 - -; _LZMA_PROB32 equ 1 - -ifdef _LZMA_PROB32 - PSHIFT equ 2 - PLOAD macro dest, mem - mov dest, dword ptr [mem] - endm - PSTORE macro src, mem - mov dword ptr [mem], src - endm -else - PSHIFT equ 1 - PLOAD macro dest, mem - movzx dest, word ptr [mem] - endm - PSTORE macro src, mem - mov word ptr [mem], @CatStr(src, _W) - endm -endif - -PMULT equ (1 SHL PSHIFT) -PMULT_HALF equ (1 SHL (PSHIFT - 1)) -PMULT_2 equ (1 SHL (PSHIFT + 1)) - -kMatchSpecLen_Error_Data equ (1 SHL 9) - -; x0 range -; x1 pbPos / (prob) TREE -; x2 probBranch / prm (MATCHED) / pbPos / cnt -; x3 sym -;====== r4 === RSP -; x5 cod -; x6 t1 NORM_CALC / probs_state / dist -; x7 t0 NORM_CALC / prob2 IF_BIT_1 -; x8 state -; x9 match (MATCHED) / sym2 / dist2 / lpMask_reg -; x10 kBitModelTotal_reg -; r11 probs -; x12 offs (MATCHED) / dic / len_temp -; x13 processedPos -; x14 bit (MATCHED) / dicPos -; r15 buf - - -cod equ x5 -cod_L equ x5_L -range equ x0 -state equ x8 -state_R equ r8 -buf equ r15 -processedPos equ x13 -kBitModelTotal_reg equ x10 - -probBranch equ x2 -probBranch_R equ r2 -probBranch_W equ x2_W - -pbPos equ x1 -pbPos_R equ r1 - -cnt equ x2 -cnt_R equ r2 - -lpMask_reg equ x9 -dicPos equ r14 - -sym equ x3 -sym_R equ r3 -sym_L equ x3_L - -probs equ r11 -dic equ r12 - -t0 equ x7 -t0_W equ x7_W -t0_R equ r7 - -prob2 equ t0 -prob2_W equ t0_W - -t1 equ x6 -t1_R equ r6 - -probs_state equ t1 -probs_state_R equ t1_R - -prm equ r2 -match equ x9 -match_R equ r9 -offs equ x12 -offs_R equ r12 -bit equ x14 -bit_R equ r14 - -sym2 equ x9 -sym2_R equ r9 - -len_temp equ x12 - -dist equ sym -dist2 equ x9 - - - -kNumBitModelTotalBits equ 11 -kBitModelTotal equ (1 SHL kNumBitModelTotalBits) -kNumMoveBits 
equ 5 -kBitModelOffset equ ((1 SHL kNumMoveBits) - 1) -kTopValue equ (1 SHL 24) - -NORM_2 macro - ; movzx t0, BYTE PTR [buf] - shl cod, 8 - mov cod_L, BYTE PTR [buf] - shl range, 8 - ; or cod, t0 - inc buf -endm - - -NORM macro - cmp range, kTopValue - jae SHORT @F - NORM_2 -@@: -endm - - -; ---------- Branch MACROS ---------- - -UPDATE_0 macro probsArray:req, probOffset:req, probDisp:req - mov prob2, kBitModelTotal_reg - sub prob2, probBranch - shr prob2, kNumMoveBits - add probBranch, prob2 - PSTORE probBranch, probOffset * 1 + probsArray + probDisp * PMULT -endm - - -UPDATE_1 macro probsArray:req, probOffset:req, probDisp:req - sub prob2, range - sub cod, range - mov range, prob2 - mov prob2, probBranch - shr probBranch, kNumMoveBits - sub prob2, probBranch - PSTORE prob2, probOffset * 1 + probsArray + probDisp * PMULT -endm - - -CMP_COD macro probsArray:req, probOffset:req, probDisp:req - PLOAD probBranch, probOffset * 1 + probsArray + probDisp * PMULT - NORM - mov prob2, range - shr range, kNumBitModelTotalBits - imul range, probBranch - cmp cod, range -endm - - -IF_BIT_1_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req - CMP_COD probsArray, probOffset, probDisp - jae toLabel -endm - - -IF_BIT_1 macro probsArray:req, probOffset:req, probDisp:req, toLabel:req - IF_BIT_1_NOUP probsArray, probOffset, probDisp, toLabel - UPDATE_0 probsArray, probOffset, probDisp -endm - - -IF_BIT_0_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req - CMP_COD probsArray, probOffset, probDisp - jb toLabel -endm - - -; ---------- CMOV MACROS ---------- - -NORM_CALC macro prob:req - NORM - mov t0, range - shr range, kNumBitModelTotalBits - imul range, prob - sub t0, range - mov t1, cod - sub cod, range -endm - - -PUP macro prob:req, probPtr:req - sub t0, prob - ; only sar works for both 16/32 bit prob modes - sar t0, kNumMoveBits - add t0, prob - PSTORE t0, probPtr -endm - - -PUP_SUB macro prob:req, probPtr:req, symSub:req - sbb sym, symSub - PUP 
prob, probPtr -endm - - -PUP_COD macro prob:req, probPtr:req, symSub:req - mov t0, kBitModelOffset - cmovb cod, t1 - mov t1, sym - cmovb t0, kBitModelTotal_reg - PUP_SUB prob, probPtr, symSub -endm - - -BIT_0 macro prob:req, probNext:req - PLOAD prob, probs + 1 * PMULT - PLOAD probNext, probs + 1 * PMULT_2 - - NORM_CALC prob - - cmovae range, t0 - PLOAD t0, probs + 1 * PMULT_2 + PMULT - cmovae probNext, t0 - mov t0, kBitModelOffset - cmovb cod, t1 - cmovb t0, kBitModelTotal_reg - mov sym, 2 - PUP_SUB prob, probs + 1 * PMULT, 0 - 1 -endm - - -BIT_1 macro prob:req, probNext:req - PLOAD probNext, probs + sym_R * PMULT_2 - add sym, sym - - NORM_CALC prob - - cmovae range, t0 - PLOAD t0, probs + sym_R * PMULT + PMULT - cmovae probNext, t0 - PUP_COD prob, probs + t1_R * PMULT_HALF, 0 - 1 -endm - - -BIT_2 macro prob:req, symSub:req - add sym, sym - - NORM_CALC prob - - cmovae range, t0 - PUP_COD prob, probs + t1_R * PMULT_HALF, symSub -endm - - -; ---------- MATCHED LITERAL ---------- - -LITM_0 macro - mov offs, 256 * PMULT - shl match, (PSHIFT + 1) - mov bit, offs - and bit, match - PLOAD x1, probs + 256 * PMULT + bit_R * 1 + 1 * PMULT - lea prm, [probs + 256 * PMULT + bit_R * 1 + 1 * PMULT] - ; lea prm, [probs + 256 * PMULT + 1 * PMULT] - ; add prm, bit_R - xor offs, bit - add match, match - - NORM_CALC x1 - - cmovae offs, bit - mov bit, match - cmovae range, t0 - mov t0, kBitModelOffset - cmovb cod, t1 - cmovb t0, kBitModelTotal_reg - mov sym, 0 - PUP_SUB x1, prm, -2-1 -endm - - -LITM macro - and bit, offs - lea prm, [probs + offs_R * 1] - add prm, bit_R - PLOAD x1, prm + sym_R * PMULT - xor offs, bit - add sym, sym - add match, match - - NORM_CALC x1 - - cmovae offs, bit - mov bit, match - cmovae range, t0 - PUP_COD x1, prm + t1_R * PMULT_HALF, - 1 -endm - - -LITM_2 macro - and bit, offs - lea prm, [probs + offs_R * 1] - add prm, bit_R - PLOAD x1, prm + sym_R * PMULT - add sym, sym - - NORM_CALC x1 - - cmovae range, t0 - PUP_COD x1, prm + t1_R * PMULT_HALF, 256 - 1 
-endm - - -; ---------- REVERSE BITS ---------- - -REV_0 macro prob:req, probNext:req - ; PLOAD prob, probs + 1 * PMULT - ; lea sym2_R, [probs + 2 * PMULT] - ; PLOAD probNext, probs + 2 * PMULT - PLOAD probNext, sym2_R - - NORM_CALC prob - - cmovae range, t0 - PLOAD t0, probs + 3 * PMULT - cmovae probNext, t0 - cmovb cod, t1 - mov t0, kBitModelOffset - cmovb t0, kBitModelTotal_reg - lea t1_R, [probs + 3 * PMULT] - cmovae sym2_R, t1_R - PUP prob, probs + 1 * PMULT -endm - - -REV_1 macro prob:req, probNext:req, step:req - add sym2_R, step * PMULT - PLOAD probNext, sym2_R - - NORM_CALC prob - - cmovae range, t0 - PLOAD t0, sym2_R + step * PMULT - cmovae probNext, t0 - cmovb cod, t1 - mov t0, kBitModelOffset - cmovb t0, kBitModelTotal_reg - lea t1_R, [sym2_R + step * PMULT] - cmovae sym2_R, t1_R - PUP prob, t1_R - step * PMULT_2 -endm - - -REV_2 macro prob:req, step:req - sub sym2_R, probs - shr sym2, PSHIFT - or sym, sym2 - - NORM_CALC prob - - cmovae range, t0 - lea t0, [sym - step] - cmovb sym, t0 - cmovb cod, t1 - mov t0, kBitModelOffset - cmovb t0, kBitModelTotal_reg - PUP prob, probs + sym2_R * PMULT -endm - - -REV_1_VAR macro prob:req - PLOAD prob, sym_R - mov probs, sym_R - add sym_R, sym2_R - - NORM_CALC prob - - cmovae range, t0 - lea t0_R, [sym_R + 1 * sym2_R] - cmovae sym_R, t0_R - mov t0, kBitModelOffset - cmovb cod, t1 - ; mov t1, kBitModelTotal - ; cmovb t0, t1 - cmovb t0, kBitModelTotal_reg - add sym2, sym2 - PUP prob, probs -endm - - - - -LIT_PROBS macro lpMaskParam:req - ; prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? 
dicBufSize : dicPos) - 1]) & lpMask) << lc); - mov t0, processedPos - shl t0, 8 - add sym, t0 - and sym, lpMaskParam - add probs_state_R, pbPos_R - mov x1, LOC lc2 - lea sym, dword ptr[sym_R + 2 * sym_R] - add probs, Literal * PMULT - shl sym, x1_L - add probs, sym_R - UPDATE_0 probs_state_R, 0, IsMatch - inc processedPos -endm - - - -kNumPosBitsMax equ 4 -kNumPosStatesMax equ (1 SHL kNumPosBitsMax) - -kLenNumLowBits equ 3 -kLenNumLowSymbols equ (1 SHL kLenNumLowBits) -kLenNumHighBits equ 8 -kLenNumHighSymbols equ (1 SHL kLenNumHighBits) -kNumLenProbs equ (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols) - -LenLow equ 0 -LenChoice equ LenLow -LenChoice2 equ (LenLow + kLenNumLowSymbols) -LenHigh equ (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax) - -kNumStates equ 12 -kNumStates2 equ 16 -kNumLitStates equ 7 - -kStartPosModelIndex equ 4 -kEndPosModelIndex equ 14 -kNumFullDistances equ (1 SHL (kEndPosModelIndex SHR 1)) - -kNumPosSlotBits equ 6 -kNumLenToPosStates equ 4 - -kNumAlignBits equ 4 -kAlignTableSize equ (1 SHL kNumAlignBits) - -kMatchMinLen equ 2 -kMatchSpecLenStart equ (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols) - -kStartOffset equ 1664 -SpecPos equ (-kStartOffset) -IsRep0Long equ (SpecPos + kNumFullDistances) -RepLenCoder equ (IsRep0Long + (kNumStates2 SHL kNumPosBitsMax)) -LenCoder equ (RepLenCoder + kNumLenProbs) -IsMatch equ (LenCoder + kNumLenProbs) -kAlign equ (IsMatch + (kNumStates2 SHL kNumPosBitsMax)) -IsRep equ (kAlign + kAlignTableSize) -IsRepG0 equ (IsRep + kNumStates) -IsRepG1 equ (IsRepG0 + kNumStates) -IsRepG2 equ (IsRepG1 + kNumStates) -PosSlot equ (IsRepG2 + kNumStates) -Literal equ (PosSlot + (kNumLenToPosStates SHL kNumPosSlotBits)) -NUM_BASE_PROBS equ (Literal + kStartOffset) - -if kAlign ne 0 - .err -endif - -if NUM_BASE_PROBS ne 1984 - .err -endif - - -PTR_FIELD equ dq ? - -CLzmaDec_Asm struct - lc db ? - lp db ? - pb db ? - _pad_ db ? - dicSize dd ? 
- - probs_Spec PTR_FIELD - probs_1664 PTR_FIELD - dic_Spec PTR_FIELD - dicBufSize PTR_FIELD - dicPos_Spec PTR_FIELD - buf_Spec PTR_FIELD - - range_Spec dd ? - code_Spec dd ? - processedPos_Spec dd ? - checkDicSize dd ? - rep0 dd ? - rep1 dd ? - rep2 dd ? - rep3 dd ? - state_Spec dd ? - remainLen dd ? -CLzmaDec_Asm ends - - -CLzmaDec_Asm_Loc struct - OLD_RSP PTR_FIELD - lzmaPtr PTR_FIELD - _pad0_ PTR_FIELD - _pad1_ PTR_FIELD - _pad2_ PTR_FIELD - dicBufSize PTR_FIELD - probs_Spec PTR_FIELD - dic_Spec PTR_FIELD - - limit PTR_FIELD - bufLimit PTR_FIELD - lc2 dd ? - lpMask dd ? - pbMask dd ? - checkDicSize dd ? - - _pad_ dd ? - remainLen dd ? - dicPos_Spec PTR_FIELD - rep0 dd ? - rep1 dd ? - rep2 dd ? - rep3 dd ? -CLzmaDec_Asm_Loc ends - - -GLOB_2 equ [sym_R].CLzmaDec_Asm. -GLOB equ [r1].CLzmaDec_Asm. -LOC_0 equ [r0].CLzmaDec_Asm_Loc. -LOC equ [RSP].CLzmaDec_Asm_Loc. - - -COPY_VAR macro name - mov t0, GLOB_2 name - mov LOC_0 name, t0 -endm - - -RESTORE_VAR macro name - mov t0, LOC name - mov GLOB name, t0 -endm - - - -IsMatchBranch_Pre macro reg - ; prob = probs + IsMatch + (state << kNumPosBitsMax) + posState; - mov pbPos, LOC pbMask - and pbPos, processedPos - shl pbPos, (kLenNumLowBits + 1 + PSHIFT) - lea probs_state_R, [probs + 1 * state_R] -endm - - -IsMatchBranch macro reg - IsMatchBranch_Pre - IF_BIT_1 probs_state_R, pbPos_R, IsMatch, IsMatch_label -endm - - -CheckLimits macro reg - cmp buf, LOC bufLimit - jae fin_OK - cmp dicPos, LOC limit - jae fin_OK -endm - - - -; RSP is (16x + 8) bytes aligned in WIN64-x64 -; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8) - -PARAM_lzma equ REG_ABI_PARAM_0 -PARAM_limit equ REG_ABI_PARAM_1 -PARAM_bufLimit equ REG_ABI_PARAM_2 - -; MY_ALIGN_64 -MY_PROC LzmaDec_DecodeReal_3, 3 -MY_PUSH_PRESERVED_ABI_REGS - - lea r0, [RSP - (SIZEOF CLzmaDec_Asm_Loc)] - and r0, -128 - mov r5, RSP - mov RSP, r0 - mov LOC_0 Old_RSP, r5 - mov LOC_0 lzmaPtr, PARAM_lzma - - mov LOC_0 remainLen, 0 ; remainLen must be ZERO - - mov LOC_0 
bufLimit, PARAM_bufLimit - mov sym_R, PARAM_lzma ; CLzmaDec_Asm_Loc pointer for GLOB_2 - mov dic, GLOB_2 dic_Spec - add PARAM_limit, dic - mov LOC_0 limit, PARAM_limit - - COPY_VAR(rep0) - COPY_VAR(rep1) - COPY_VAR(rep2) - COPY_VAR(rep3) - - mov dicPos, GLOB_2 dicPos_Spec - add dicPos, dic - mov LOC_0 dicPos_Spec, dicPos - mov LOC_0 dic_Spec, dic - - mov x1_L, GLOB_2 pb - mov t0, 1 - shl t0, x1_L - dec t0 - mov LOC_0 pbMask, t0 - - ; unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1; - ; unsigned lc = p->prop.lc; - ; unsigned lpMask = ((unsigned)0x100 << p->prop.lp) - ((unsigned)0x100 >> lc); - - mov x1_L, GLOB_2 lc - mov x2, 100h - mov t0, x2 - shr x2, x1_L - ; inc x1 - add x1_L, PSHIFT - mov LOC_0 lc2, x1 - mov x1_L, GLOB_2 lp - shl t0, x1_L - sub t0, x2 - mov LOC_0 lpMask, t0 - mov lpMask_reg, t0 - - ; mov probs, GLOB_2 probs_Spec - ; add probs, kStartOffset SHL PSHIFT - mov probs, GLOB_2 probs_1664 - mov LOC_0 probs_Spec, probs - - mov t0_R, GLOB_2 dicBufSize - mov LOC_0 dicBufSize, t0_R - - mov x1, GLOB_2 checkDicSize - mov LOC_0 checkDicSize, x1 - - mov processedPos, GLOB_2 processedPos_Spec - - mov state, GLOB_2 state_Spec - shl state, PSHIFT - - mov buf, GLOB_2 buf_Spec - mov range, GLOB_2 range_Spec - mov cod, GLOB_2 code_Spec - mov kBitModelTotal_reg, kBitModelTotal - xor sym, sym - - ; if (processedPos != 0 || checkDicSize != 0) - or x1, processedPos - jz @f - - add t0_R, dic - cmp dicPos, dic - cmovnz t0_R, dicPos - movzx sym, byte ptr[t0_R - 1] - -@@: - IsMatchBranch_Pre - cmp state, 4 * PMULT - jb lit_end - cmp state, kNumLitStates * PMULT - jb lit_matched_end - jmp lz_end - - - - -; ---------- LITERAL ---------- -MY_ALIGN_64 -lit_start: - xor state, state -lit_start_2: - LIT_PROBS lpMask_reg - - ifdef _LZMA_SIZE_OPT - - PLOAD x1, probs + 1 * PMULT - mov sym, 1 -MY_ALIGN_16 -lit_loop: - BIT_1 x1, x2 - mov x1, x2 - cmp sym, 127 - jbe lit_loop - - else - - BIT_0 x1, x2 - BIT_1 x2, x1 - BIT_1 x1, x2 - BIT_1 x2, x1 - BIT_1 x1, x2 - BIT_1 x2, x1 - BIT_1 
x1, x2 - - endif - - BIT_2 x2, 256 - 1 - - ; mov dic, LOC dic_Spec - mov probs, LOC probs_Spec - IsMatchBranch_Pre - mov byte ptr[dicPos], sym_L - inc dicPos - - CheckLimits -lit_end: - IF_BIT_0_NOUP probs_state_R, pbPos_R, IsMatch, lit_start - - ; jmp IsMatch_label - -; ---------- MATCHES ---------- -; MY_ALIGN_32 -IsMatch_label: - UPDATE_1 probs_state_R, pbPos_R, IsMatch - IF_BIT_1 probs_state_R, 0, IsRep, IsRep_label - - add probs, LenCoder * PMULT - add state, kNumStates * PMULT - -; ---------- LEN DECODE ---------- -len_decode: - mov len_temp, 8 - 1 - kMatchMinLen - IF_BIT_0_NOUP probs, 0, 0, len_mid_0 - UPDATE_1 probs, 0, 0 - add probs, (1 SHL (kLenNumLowBits + PSHIFT)) - mov len_temp, -1 - kMatchMinLen - IF_BIT_0_NOUP probs, 0, 0, len_mid_0 - UPDATE_1 probs, 0, 0 - add probs, LenHigh * PMULT - (1 SHL (kLenNumLowBits + PSHIFT)) - mov sym, 1 - PLOAD x1, probs + 1 * PMULT - -MY_ALIGN_32 -len8_loop: - BIT_1 x1, x2 - mov x1, x2 - cmp sym, 64 - jb len8_loop - - mov len_temp, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen - jmp short len_mid_2 ; we use short here for MASM that doesn't optimize that code as another assembler programs - -MY_ALIGN_32 -len_mid_0: - UPDATE_0 probs, 0, 0 - add probs, pbPos_R - BIT_0 x2, x1 -len_mid_2: - BIT_1 x1, x2 - BIT_2 x2, len_temp - mov probs, LOC probs_Spec - cmp state, kNumStates * PMULT - jb copy_match - - -; ---------- DECODE DISTANCE ---------- - ; probs + PosSlot + ((len < kNumLenToPosStates ? 
len : kNumLenToPosStates - 1) << kNumPosSlotBits); - - mov t0, 3 + kMatchMinLen - cmp sym, 3 + kMatchMinLen - cmovb t0, sym - add probs, PosSlot * PMULT - (kMatchMinLen SHL (kNumPosSlotBits + PSHIFT)) - shl t0, (kNumPosSlotBits + PSHIFT) - add probs, t0_R - - ; sym = Len - ; mov LOC remainLen, sym - mov len_temp, sym - - ifdef _LZMA_SIZE_OPT - - PLOAD x1, probs + 1 * PMULT - mov sym, 1 -MY_ALIGN_16 -slot_loop: - BIT_1 x1, x2 - mov x1, x2 - cmp sym, 32 - jb slot_loop - - else - - BIT_0 x1, x2 - BIT_1 x2, x1 - BIT_1 x1, x2 - BIT_1 x2, x1 - BIT_1 x1, x2 - - endif - - mov x1, sym - BIT_2 x2, 64-1 - - and sym, 3 - mov probs, LOC probs_Spec - cmp x1, 32 + kEndPosModelIndex / 2 - jb short_dist - - ; unsigned numDirectBits = (unsigned)(((distance >> 1) - 1)); - sub x1, (32 + 1 + kNumAlignBits) - ; distance = (2 | (distance & 1)); - or sym, 2 - PLOAD x2, probs + 1 * PMULT - shl sym, kNumAlignBits + 1 - lea sym2_R, [probs + 2 * PMULT] - - jmp direct_norm - ; lea t1, [sym_R + (1 SHL kNumAlignBits)] - ; cmp range, kTopValue - ; jb direct_norm - -; ---------- DIRECT DISTANCE ---------- -MY_ALIGN_32 -direct_loop: - shr range, 1 - mov t0, cod - sub cod, range - cmovs cod, t0 - cmovns sym, t1 - - comment ~ - sub cod, range - mov x2, cod - sar x2, 31 - lea sym, dword ptr [r2 + sym_R * 2 + 1] - and x2, range - add cod, x2 - ~ - dec x1 - je direct_end - - add sym, sym -direct_norm: - lea t1, [sym_R + (1 SHL kNumAlignBits)] - cmp range, kTopValue - jae near ptr direct_loop - ; we align for 32 here with "near ptr" command above - NORM_2 - jmp direct_loop - -MY_ALIGN_32 -direct_end: - ; prob = + kAlign; - ; distance <<= kNumAlignBits; - REV_0 x2, x1 - REV_1 x1, x2, 2 - REV_1 x2, x1, 4 - REV_2 x1, 8 - -decode_dist_end: - - ; if (distance >= (checkDicSize == 0 ? 
processedPos: checkDicSize)) - - mov t1, LOC rep0 - mov x1, LOC rep1 - mov x2, LOC rep2 - - mov t0, LOC checkDicSize - test t0, t0 - cmove t0, processedPos - cmp sym, t0 - jae end_of_payload - ; jmp end_of_payload ; for debug - - ; rep3 = rep2; - ; rep2 = rep1; - ; rep1 = rep0; - ; rep0 = distance + 1; - - inc sym - mov LOC rep0, sym - ; mov sym, LOC remainLen - mov sym, len_temp - mov LOC rep1, t1 - mov LOC rep2, x1 - mov LOC rep3, x2 - - ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3; - cmp state, (kNumStates + kNumLitStates) * PMULT - mov state, kNumLitStates * PMULT - mov t0, (kNumLitStates + 3) * PMULT - cmovae state, t0 - - -; ---------- COPY MATCH ---------- -copy_match: - - ; len += kMatchMinLen; - ; add sym, kMatchMinLen - - ; if ((rem = limit - dicPos) == 0) - ; { - ; p->dicPos = dicPos; - ; return SZ_ERROR_DATA; - ; } - mov cnt_R, LOC limit - sub cnt_R, dicPos - jz fin_dicPos_LIMIT - - ; curLen = ((rem < len) ? (unsigned)rem : len); - cmp cnt_R, sym_R - ; cmovae cnt_R, sym_R ; 64-bit - cmovae cnt, sym ; 32-bit - - mov dic, LOC dic_Spec - mov x1, LOC rep0 - - mov t0_R, dicPos - add dicPos, cnt_R - ; processedPos += curLen; - add processedPos, cnt - ; len -= curLen; - sub sym, cnt - mov LOC remainLen, sym - - sub t0_R, dic - - ; pos = dicPos - rep0 + (dicPos < rep0 ? 
dicBufSize : 0); - sub t0_R, r1 - jae @f - - mov r1, LOC dicBufSize - add t0_R, r1 - sub r1, t0_R - cmp cnt_R, r1 - ja copy_match_cross -@@: - ; if (curLen <= dicBufSize - pos) - -; ---------- COPY MATCH FAST ---------- - ; Byte *dest = dic + dicPos; - ; mov r1, dic - ; ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos; - ; sub t0_R, dicPos - ; dicPos += curLen; - - ; const Byte *lim = dest + curLen; - add t0_R, dic - movzx sym, byte ptr[t0_R] - add t0_R, cnt_R - neg cnt_R - ; lea r1, [dicPos - 1] -copy_common: - dec dicPos - ; cmp LOC rep0, 1 - ; je rep0Label - - ; t0_R - src_lim - ; r1 - dest_lim - 1 - ; cnt_R - (-cnt) - - IsMatchBranch_Pre - inc cnt_R - jz copy_end -MY_ALIGN_16 -@@: - mov byte ptr[cnt_R * 1 + dicPos], sym_L - movzx sym, byte ptr[cnt_R * 1 + t0_R] - inc cnt_R - jnz @b - -copy_end: -lz_end_match: - mov byte ptr[dicPos], sym_L - inc dicPos - - ; IsMatchBranch_Pre - CheckLimits -lz_end: - IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label - - - -; ---------- LITERAL MATCHED ---------- - - LIT_PROBS LOC lpMask - - ; matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; - mov x1, LOC rep0 - ; mov dic, LOC dic_Spec - mov LOC dicPos_Spec, dicPos - - ; state -= (state < 10) ? 
3 : 6; - lea t0, [state_R - 6 * PMULT] - sub state, 3 * PMULT - cmp state, 7 * PMULT - cmovae state, t0 - - sub dicPos, dic - sub dicPos, r1 - jae @f - add dicPos, LOC dicBufSize -@@: - comment ~ - xor t0, t0 - sub dicPos, r1 - cmovb t0_R, LOC dicBufSize - ~ - - movzx match, byte ptr[dic + dicPos * 1] - - ifdef _LZMA_SIZE_OPT - - mov offs, 256 * PMULT - shl match, (PSHIFT + 1) - mov bit, match - mov sym, 1 -MY_ALIGN_16 -litm_loop: - LITM - cmp sym, 256 - jb litm_loop - sub sym, 256 - - else - - LITM_0 - LITM - LITM - LITM - LITM - LITM - LITM - LITM_2 - - endif - - mov probs, LOC probs_Spec - IsMatchBranch_Pre - ; mov dic, LOC dic_Spec - mov dicPos, LOC dicPos_Spec - mov byte ptr[dicPos], sym_L - inc dicPos - - CheckLimits -lit_matched_end: - IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label - ; IsMatchBranch - mov lpMask_reg, LOC lpMask - sub state, 3 * PMULT - jmp lit_start_2 - - - -; ---------- REP 0 LITERAL ---------- -MY_ALIGN_32 -IsRep0Short_label: - UPDATE_0 probs_state_R, pbPos_R, IsRep0Long - - ; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; - mov dic, LOC dic_Spec - mov t0_R, dicPos - mov probBranch, LOC rep0 - sub t0_R, dic - - sub probs, RepLenCoder * PMULT - - ; state = state < kNumLitStates ? 9 : 11; - or state, 1 * PMULT - - ; the caller doesn't allow (dicPos >= limit) case for REP_SHORT - ; so we don't need the following (dicPos == limit) check here: - ; cmp dicPos, LOC limit - ; jae fin_dicPos_LIMIT_REP_SHORT - - inc processedPos - - IsMatchBranch_Pre - -; xor sym, sym -; sub t0_R, probBranch_R -; cmovb sym_R, LOC dicBufSize -; add t0_R, sym_R - sub t0_R, probBranch_R - jae @f - add t0_R, LOC dicBufSize -@@: - movzx sym, byte ptr[dic + t0_R * 1] - jmp lz_end_match - - -MY_ALIGN_32 -IsRep_label: - UPDATE_1 probs_state_R, 0, IsRep - - ; The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode. - ; So we don't check it here. 
- - ; mov t0, processedPos - ; or t0, LOC checkDicSize - ; jz fin_ERROR_2 - - ; state = state < kNumLitStates ? 8 : 11; - cmp state, kNumLitStates * PMULT - mov state, 8 * PMULT - mov probBranch, 11 * PMULT - cmovae state, probBranch - - ; prob = probs + RepLenCoder; - add probs, RepLenCoder * PMULT - - IF_BIT_1 probs_state_R, 0, IsRepG0, IsRepG0_label - IF_BIT_0_NOUP probs_state_R, pbPos_R, IsRep0Long, IsRep0Short_label - UPDATE_1 probs_state_R, pbPos_R, IsRep0Long - jmp len_decode - -MY_ALIGN_32 -IsRepG0_label: - UPDATE_1 probs_state_R, 0, IsRepG0 - mov dist2, LOC rep0 - mov dist, LOC rep1 - mov LOC rep1, dist2 - - IF_BIT_1 probs_state_R, 0, IsRepG1, IsRepG1_label - mov LOC rep0, dist - jmp len_decode - -; MY_ALIGN_32 -IsRepG1_label: - UPDATE_1 probs_state_R, 0, IsRepG1 - mov dist2, LOC rep2 - mov LOC rep2, dist - - IF_BIT_1 probs_state_R, 0, IsRepG2, IsRepG2_label - mov LOC rep0, dist2 - jmp len_decode - -; MY_ALIGN_32 -IsRepG2_label: - UPDATE_1 probs_state_R, 0, IsRepG2 - mov dist, LOC rep3 - mov LOC rep3, dist2 - mov LOC rep0, dist - jmp len_decode - - - -; ---------- SPEC SHORT DISTANCE ---------- - -MY_ALIGN_32 -short_dist: - sub x1, 32 + 1 - jbe decode_dist_end - or sym, 2 - shl sym, x1_L - lea sym_R, [probs + sym_R * PMULT + SpecPos * PMULT + 1 * PMULT] - mov sym2, PMULT ; step -MY_ALIGN_32 -spec_loop: - REV_1_VAR x2 - dec x1 - jnz spec_loop - - mov probs, LOC probs_Spec - sub sym, sym2 - sub sym, SpecPos * PMULT - sub sym_R, probs - shr sym, PSHIFT - - jmp decode_dist_end - - -; ---------- COPY MATCH CROSS ---------- -copy_match_cross: - ; t0_R - src pos - ; r1 - len to dicBufSize - ; cnt_R - total copy len - - mov t1_R, t0_R ; srcPos - mov t0_R, dic - mov r1, LOC dicBufSize ; - neg cnt_R -@@: - movzx sym, byte ptr[t1_R * 1 + t0_R] - inc t1_R - mov byte ptr[cnt_R * 1 + dicPos], sym_L - inc cnt_R - cmp t1_R, r1 - jne @b - - movzx sym, byte ptr[t0_R] - sub t0_R, cnt_R - jmp copy_common - - - - -; fin_dicPos_LIMIT_REP_SHORT: - ; mov sym, 1 - 
-fin_dicPos_LIMIT: - mov LOC remainLen, sym - jmp fin_OK - ; For more strict mode we can stop decoding with error - ; mov sym, 1 - ; jmp fin - - -fin_ERROR_MATCH_DIST: - - ; rep3 = rep2; - ; rep2 = rep1; - ; rep1 = rep0; - ; rep0 = distance + 1; - - add len_temp, kMatchSpecLen_Error_Data - mov LOC remainLen, len_temp - - mov LOC rep0, sym - mov LOC rep1, t1 - mov LOC rep2, x1 - mov LOC rep3, x2 - - ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3; - cmp state, (kNumStates + kNumLitStates) * PMULT - mov state, kNumLitStates * PMULT - mov t0, (kNumLitStates + 3) * PMULT - cmovae state, t0 - - ; jmp fin_OK - mov sym, 1 - jmp fin - -end_of_payload: - inc sym - jnz fin_ERROR_MATCH_DIST - - mov LOC remainLen, kMatchSpecLenStart - sub state, kNumStates * PMULT - -fin_OK: - xor sym, sym - -fin: - NORM - - mov r1, LOC lzmaPtr - - sub dicPos, LOC dic_Spec - mov GLOB dicPos_Spec, dicPos - mov GLOB buf_Spec, buf - mov GLOB range_Spec, range - mov GLOB code_Spec, cod - shr state, PSHIFT - mov GLOB state_Spec, state - mov GLOB processedPos_Spec, processedPos - - RESTORE_VAR(remainLen) - RESTORE_VAR(rep0) - RESTORE_VAR(rep1) - RESTORE_VAR(rep2) - RESTORE_VAR(rep3) - - mov x0, sym - - mov RSP, LOC Old_RSP - -MY_POP_PRESERVED_ABI_REGS -MY_ENDP - -_TEXT$LZMADECOPT ENDS - -end +; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function +; 2021-02-23: Igor Pavlov : Public domain +; +; 3 - is the code compatibility version of LzmaDec_DecodeReal_*() +; function for check at link time. +; That code is tightly coupled with LzmaDec_TryDummy() +; and with another functions in LzmaDec.c file. +; CLzmaDec structure, (probs) array layout, input and output of +; LzmaDec_DecodeReal_*() must be equal in both versions (C / ASM). 
+ +ifndef x64 +; x64=1 +; .err +endif + +include 7zAsm.asm + +MY_ASM_START + +_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE' + +MY_ALIGN macro num:req + align num +endm + +MY_ALIGN_16 macro + MY_ALIGN 16 +endm + +MY_ALIGN_32 macro + MY_ALIGN 32 +endm + +MY_ALIGN_64 macro + MY_ALIGN 64 +endm + + +; _LZMA_SIZE_OPT equ 1 + +; _LZMA_PROB32 equ 1 + +ifdef _LZMA_PROB32 + PSHIFT equ 2 + PLOAD macro dest, mem + mov dest, dword ptr [mem] + endm + PSTORE macro src, mem + mov dword ptr [mem], src + endm +else + PSHIFT equ 1 + PLOAD macro dest, mem + movzx dest, word ptr [mem] + endm + PSTORE macro src, mem + mov word ptr [mem], @CatStr(src, _W) + endm +endif + +PMULT equ (1 SHL PSHIFT) +PMULT_HALF equ (1 SHL (PSHIFT - 1)) +PMULT_2 equ (1 SHL (PSHIFT + 1)) + +kMatchSpecLen_Error_Data equ (1 SHL 9) + +; x0 range +; x1 pbPos / (prob) TREE +; x2 probBranch / prm (MATCHED) / pbPos / cnt +; x3 sym +;====== r4 === RSP +; x5 cod +; x6 t1 NORM_CALC / probs_state / dist +; x7 t0 NORM_CALC / prob2 IF_BIT_1 +; x8 state +; x9 match (MATCHED) / sym2 / dist2 / lpMask_reg +; x10 kBitModelTotal_reg +; r11 probs +; x12 offs (MATCHED) / dic / len_temp +; x13 processedPos +; x14 bit (MATCHED) / dicPos +; r15 buf + + +cod equ x5 +cod_L equ x5_L +range equ x0 +state equ x8 +state_R equ r8 +buf equ r15 +processedPos equ x13 +kBitModelTotal_reg equ x10 + +probBranch equ x2 +probBranch_R equ r2 +probBranch_W equ x2_W + +pbPos equ x1 +pbPos_R equ r1 + +cnt equ x2 +cnt_R equ r2 + +lpMask_reg equ x9 +dicPos equ r14 + +sym equ x3 +sym_R equ r3 +sym_L equ x3_L + +probs equ r11 +dic equ r12 + +t0 equ x7 +t0_W equ x7_W +t0_R equ r7 + +prob2 equ t0 +prob2_W equ t0_W + +t1 equ x6 +t1_R equ r6 + +probs_state equ t1 +probs_state_R equ t1_R + +prm equ r2 +match equ x9 +match_R equ r9 +offs equ x12 +offs_R equ r12 +bit equ x14 +bit_R equ r14 + +sym2 equ x9 +sym2_R equ r9 + +len_temp equ x12 + +dist equ sym +dist2 equ x9 + + + +kNumBitModelTotalBits equ 11 +kBitModelTotal equ (1 SHL kNumBitModelTotalBits) +kNumMoveBits 
equ 5
+kBitModelOffset equ ((1 SHL kNumMoveBits) - 1)
+kTopValue equ (1 SHL 24)
+
+; NORM_2: unconditional normalization step.
+; Shifts cod and range left by 8 bits, loads the byte at [buf] into the
+; low byte of cod and advances buf by one.
+NORM_2 macro
+        ; movzx t0, BYTE PTR [buf]
+        shl     cod, 8
+        mov     cod_L, BYTE PTR [buf]
+        shl     range, 8
+        ; or cod, t0
+        inc     buf
+endm
+
+
+; NORM: runs NORM_2 only when range is below kTopValue.
+NORM macro
+        cmp     range, kTopValue
+        jae     SHORT @F
+        NORM_2
+@@:
+endm
+
+
+; ---------- Branch MACROS ----------
+; These macros decode one bit with a conditional jump.
+; After CMP_COD: probBranch = loaded probability, prob2 = range as it was
+; before the bound was computed, range = bound
+; (bound = (range >> kNumBitModelTotalBits) * probability).
+; The probability address is always probOffset * 1 + probsArray + probDisp * PMULT.
+
+; UPDATE_0: probability update for the (cod < bound) outcome:
+;   prob += (kBitModelTotal_reg - prob) >> kNumMoveBits
+; range is left as CMP_COD set it (the bound).
+UPDATE_0 macro probsArray:req, probOffset:req, probDisp:req
+        mov     prob2, kBitModelTotal_reg
+        sub     prob2, probBranch
+        shr     prob2, kNumMoveBits
+        add     probBranch, prob2
+        PSTORE  probBranch, probOffset * 1 + probsArray + probDisp * PMULT
+endm
+
+
+; UPDATE_1: update for the (cod >= bound) outcome.
+; Expects the register state left by CMP_COD (prob2 = previous range, range = bound):
+;   range = prob2 - bound, cod -= bound, prob -= prob >> kNumMoveBits
+UPDATE_1 macro probsArray:req, probOffset:req, probDisp:req
+        sub     prob2, range
+        sub     cod, range
+        mov     range, prob2
+        mov     prob2, probBranch
+        shr     probBranch, kNumMoveBits
+        sub     prob2, probBranch
+        PSTORE  prob2, probOffset * 1 + probsArray + probDisp * PMULT
+endm
+
+
+; CMP_COD: loads the probability into probBranch, normalizes, saves range in
+; prob2, replaces range with the bound and compares cod against it.
+; The caller branches on the flags of that final cmp.
+CMP_COD macro probsArray:req, probOffset:req, probDisp:req
+        PLOAD   probBranch, probOffset * 1 + probsArray + probDisp * PMULT
+        NORM
+        mov     prob2, range
+        shr     range, kNumBitModelTotalBits
+        imul    range, probBranch
+        cmp     cod, range
+endm
+
+
+; IF_BIT_1_NOUP: jumps to toLabel when cod >= bound; no probability update here.
+IF_BIT_1_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
+        CMP_COD probsArray, probOffset, probDisp
+        jae     toLabel
+endm
+
+
+; IF_BIT_1: like IF_BIT_1_NOUP, and applies UPDATE_0 on the fall-through path.
+IF_BIT_1 macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
+        IF_BIT_1_NOUP probsArray, probOffset, probDisp, toLabel
+        UPDATE_0 probsArray, probOffset, probDisp
+endm
+
+
+; IF_BIT_0_NOUP: jumps to toLabel when cod < bound; no probability update here.
+IF_BIT_0_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
+        CMP_COD probsArray, probOffset, probDisp
+        jb      toLabel
+endm
+
+
+; ---------- CMOV MACROS ----------
+; Branch-free bit decoding. The instructions that follow NORM_CALC in the
+; macros below (cmovb / cmovae / sbb) use the carry flag produced by its
+; final "sub cod, range"; only flag-preserving instructions sit in between.
+
+; NORM_CALC: normalizes, then leaves
+;   range = bound, t0 = previous range - bound,
+;   t1 = cod before the subtraction, cod = cod - bound,
+;   CF set when cod was below bound.
+NORM_CALC macro prob:req
+        NORM
+        mov     t0, range
+        shr     range, kNumBitModelTotalBits
+        imul    range, prob
+        sub     t0, range
+        mov     t1, cod
+        sub     cod, range
+endm
+
+
+; PUP: stores prob + ((t0 - prob) >> kNumMoveBits) at probPtr.
+; The callers below preload t0 with kBitModelOffset or kBitModelTotal_reg.
+PUP macro prob:req, probPtr:req
+        sub     t0, prob
+        ; only sar works for both 16/32 bit prob modes
+        sar     t0, kNumMoveBits
+        add     t0, prob
+        PSTORE  t0, probPtr
+endm
+
+
+; PUP_SUB: sym = sym - symSub - CF, followed by PUP.
+PUP_SUB macro prob:req, probPtr:req, symSub:req
+        sbb     sym, symSub
+        PUP     prob, probPtr
+endm
+
+
+; PUP_COD: when CF is set, restores cod from t1 and selects kBitModelTotal_reg
+; as the PUP input (otherwise kBitModelOffset); copies sym into t1 before
+; PUP_SUB changes sym, so callers can use t1_R inside probPtr.
+PUP_COD macro prob:req, probPtr:req, symSub:req
+        mov     t0, kBitModelOffset
+        cmovb   cod, t1
+        mov     t1, sym
+        cmovb   t0, kBitModelTotal_reg
+        PUP_SUB prob, probPtr, symSub
+endm
+
+
+; BIT_0: first bit of a bit-tree based at probs.
+; Reads prob from index 1; probNext ends up as index 2 (CF set) or
+; index 3 (CF clear); sym becomes 3 - CF.
+BIT_0 macro prob:req, probNext:req
+        PLOAD   prob, probs + 1 * PMULT
+        PLOAD   probNext, probs + 1 * PMULT_2
+
+        NORM_CALC prob
+
+        cmovae  range, t0
+        PLOAD   t0, probs + 1 * PMULT_2 + PMULT
+        cmovae  probNext, t0
+        mov     t0, kBitModelOffset
+        cmovb   cod, t1
+        cmovb   t0, kBitModelTotal_reg
+        mov     sym, 2
+        PUP_SUB prob, probs + 1 * PMULT, 0 - 1
+endm
+
+
+; BIT_1: middle bit of a bit-tree. prob must already hold the probability at
+; index sym. sym is doubled; probNext is taken from index (2 * old sym) or
+; (2 * old sym + 1); the updated prob is written back at index (old sym),
+; addressed as t1_R * PMULT_HALF because t1 holds the doubled sym.
+BIT_1 macro prob:req, probNext:req
+        PLOAD   probNext, probs + sym_R * PMULT_2
+        add     sym, sym
+
+        NORM_CALC prob
+
+        cmovae  range, t0
+        PLOAD   t0, probs + sym_R * PMULT + PMULT
+        cmovae  probNext, t0
+        PUP_COD prob, probs + t1_R * PMULT_HALF, 0 - 1
+endm
+
+
+; BIT_2: last bit of a bit-tree; no next probability is loaded.
+; sym = 2 * sym - symSub - CF.
+BIT_2 macro prob:req, symSub:req
+        add     sym, sym
+
+        NORM_CALC prob
+
+        cmovae  range, t0
+        PUP_COD prob, probs + t1_R * PMULT_HALF, symSub
+endm
+
+
+; ---------- MATCHED LITERAL ----------
+; LITM_0 / LITM / LITM_2 are the first / middle / last bit steps used in the
+; "LITERAL MATCHED" section of the decoder. In addition to sym they maintain
+; match, offs and bit, and address probabilities through prm.
+
+LITM_0 macro
+        mov     offs, 256 * PMULT
+        shl     match, (PSHIFT + 1)
+        mov     bit, offs
+        and     bit, match
+        PLOAD   x1, probs + 256 * PMULT + bit_R * 1 + 1 * PMULT
+        lea     prm, [probs + 256 * PMULT + bit_R * 1 + 1 * PMULT]
+        ; lea prm, [probs + 256 * PMULT + 1 * PMULT]
+        ; add prm, bit_R
+        xor     offs, bit
+        add     match, match
+
+        NORM_CALC x1
+
+        cmovae  offs, bit
+        mov     bit, match
+        cmovae  range, t0
+        mov     t0, kBitModelOffset
+        cmovb   cod, t1
+        cmovb   t0, kBitModelTotal_reg
+        mov     sym, 0
+        PUP_SUB x1, prm, -2-1
+endm
+
+
+LITM macro
+        and     bit, offs
+        lea     prm, [probs + offs_R * 1]
+        add     prm, bit_R
+        PLOAD   x1, prm + sym_R * PMULT
+        xor     offs, bit
+        add     sym, sym
+        add     match, match
+
+        NORM_CALC x1
+
+        cmovae  offs, bit
+        mov     bit, match
+        cmovae  range, t0
+        PUP_COD x1, prm + t1_R * PMULT_HALF, - 1
+endm
+
+
+LITM_2 macro
+        and     bit, offs
+        lea     prm, [probs + offs_R * 1]
+        add     prm, bit_R
+        PLOAD   x1, prm + sym_R * PMULT
+        add     sym, sym
+
+        NORM_CALC x1
+
+        cmovae  range, t0
+        PUP_COD x1, prm + t1_R * PMULT_HALF, 256 - 1
+endm
+
+
+; ---------- REVERSE BITS ----------
+; REV_0 / REV_1 / REV_2 are the first / middle / last steps used at direct_end.
+; Between steps sym2_R holds the address of the next probability; REV_2 turns
+; it back into an index (sub probs, shr PSHIFT) and ORs it into sym.
+
+; REV_0: prob must already be loaded by the caller and sym2_R must already
+; point at probs + 2 * PMULT (see the commented-out lines).
+REV_0 macro prob:req, probNext:req
+        ; PLOAD prob, probs + 1 * PMULT
+        ; lea sym2_R, [probs + 2 * PMULT]
+        ; PLOAD probNext, probs + 2 * PMULT
+        PLOAD   probNext, sym2_R
+
+        NORM_CALC prob
+
+        cmovae  range, t0
+        PLOAD   t0, probs + 3 * PMULT
+        cmovae  probNext, t0
+        cmovb   cod, t1
+        mov     t0, kBitModelOffset
+        cmovb   t0, kBitModelTotal_reg
+        lea     t1_R, [probs + 3 * PMULT]
+        cmovae  sym2_R, t1_R
+        PUP     prob, probs + 1 * PMULT
+endm
+
+
+REV_1 macro prob:req, probNext:req, step:req
+        add     sym2_R, step * PMULT
+        PLOAD   probNext, sym2_R
+
+        NORM_CALC prob
+
+        cmovae  range, t0
+        PLOAD   t0, sym2_R + step * PMULT
+        cmovae  probNext, t0
+        cmovb   cod, t1
+        mov     t0, kBitModelOffset
+        cmovb   t0, kBitModelTotal_reg
+        lea     t1_R, [sym2_R + step * PMULT]
+        cmovae  sym2_R, t1_R
+        PUP     prob, t1_R - step * PMULT_2
+endm
+
+
+REV_2 macro prob:req, step:req
+        sub     sym2_R, probs
+        shr     sym2, PSHIFT
+        or      sym, sym2
+
+        NORM_CALC prob
+
+        cmovae  range, t0
+        lea     t0, [sym - step]
+        cmovb   sym, t0
+        cmovb   cod, t1
+        mov     t0, kBitModelOffset
+        cmovb   t0, kBitModelTotal_reg
+        PUP     prob, probs + sym2_R * PMULT
+endm
+
+
+; REV_1_VAR: loop body of spec_loop. sym_R is the address of the current
+; probability and sym2 the step, which is doubled on every use.
+; probs is overwritten with that address; the caller reloads probs afterwards.
+REV_1_VAR macro prob:req
+        PLOAD   prob, sym_R
+        mov     probs, sym_R
+        add     sym_R, sym2_R
+
+        NORM_CALC prob
+
+        cmovae  range, t0
+        lea     t0_R, [sym_R + 1 * sym2_R]
+        cmovae  sym_R, t0_R
+        mov     t0, kBitModelOffset
+        cmovb   cod, t1
+        ; mov t1, kBitModelTotal
+        ; cmovb t0, t1
+        cmovb   t0, kBitModelTotal_reg
+        add     sym2, sym2
+        PUP     prob, probs
+endm
+
+
+
+
+; LIT_PROBS: advances probs to the literal probability set selected by
+; processedPos and the previous byte (expected in sym), see the C expression
+; below. It also adds pbPos_R to probs_state_R, applies UPDATE_0 to the
+; IsMatch probability and increments processedPos.
+LIT_PROBS macro lpMaskParam:req
+        ; prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc);
+        mov     t0, processedPos
+        shl     t0, 8
+        add     sym, t0
+        and     sym, lpMaskParam
+        add     probs_state_R, pbPos_R
+        mov     x1, LOC lc2
+        lea     sym, dword ptr[sym_R + 2 * sym_R]
+        add     probs, Literal * PMULT
+        shl     sym, x1_L
+        add     probs, sym_R
+        UPDATE_0 probs_state_R, 0, IsMatch
+        inc     processedPos
+endm
+
+
+
+; Model constants and the probability-table layout. Offsets are counted in
+; probabilities and are multiplied by PMULT where they are used.
+kNumPosBitsMax equ 4
+kNumPosStatesMax equ (1 SHL kNumPosBitsMax)
+
+kLenNumLowBits equ 3
+kLenNumLowSymbols equ (1 SHL kLenNumLowBits)
+kLenNumHighBits equ 8
+kLenNumHighSymbols equ (1 SHL kLenNumHighBits)
+kNumLenProbs equ (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols)
+
+LenLow equ 0
+LenChoice equ LenLow
+LenChoice2 equ (LenLow + kLenNumLowSymbols)
+LenHigh equ (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax)
+
+kNumStates equ 12
+kNumStates2 equ 16
+kNumLitStates equ 7
+
+kStartPosModelIndex equ 4
+kEndPosModelIndex equ 14
+kNumFullDistances equ (1 SHL (kEndPosModelIndex SHR 1))
+
+kNumPosSlotBits equ 6
+kNumLenToPosStates equ 4
+
+kNumAlignBits equ 4
+kAlignTableSize equ (1 SHL kNumAlignBits)
+
+kMatchMinLen equ 2
+kMatchSpecLenStart equ (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols)
+
+; The offsets start at -kStartOffset so that kAlign lands on 0 (checked below).
+kStartOffset equ 1664
+SpecPos equ (-kStartOffset)
+IsRep0Long equ (SpecPos + kNumFullDistances)
+RepLenCoder equ (IsRep0Long + (kNumStates2 SHL kNumPosBitsMax))
+LenCoder equ (RepLenCoder + kNumLenProbs)
+IsMatch equ (LenCoder + kNumLenProbs)
+kAlign equ (IsMatch + (kNumStates2 SHL kNumPosBitsMax))
+IsRep equ (kAlign + kAlignTableSize)
+IsRepG0 equ (IsRep + kNumStates)
+IsRepG1 equ (IsRepG0 + kNumStates)
+IsRepG2 equ (IsRepG1 + kNumStates)
+PosSlot equ (IsRepG2 + kNumStates)
+Literal equ (PosSlot + (kNumLenToPosStates SHL kNumPosSlotBits))
+NUM_BASE_PROBS equ (Literal + kStartOffset)
+
+; Assembly-time layout checks.
+if kAlign ne 0
+  .err
+endif
+
+if NUM_BASE_PROBS ne 1984
+  .err
+endif
+
+
+PTR_FIELD equ dq ?
+
+; Decoder object layout, accessed through GLOB / GLOB_2.
+CLzmaDec_Asm struct
+        lc      db ?
+        lp      db ?
+        pb      db ?
+        _pad_   db ?
+        dicSize dd ?
+
+        probs_Spec      PTR_FIELD
+        probs_1664      PTR_FIELD
+        dic_Spec        PTR_FIELD
+        dicBufSize      PTR_FIELD
+        dicPos_Spec     PTR_FIELD
+        buf_Spec        PTR_FIELD
+
+        range_Spec      dd ?
+        code_Spec       dd ?
+        processedPos_Spec dd ?
+        checkDicSize    dd ?
+        rep0    dd ?
+        rep1    dd ?
+        rep2    dd ?
+        rep3    dd ?
+        state_Spec      dd ?
+        remainLen       dd ?
+CLzmaDec_Asm ends
+
+
+; Local frame of LzmaDec_DecodeReal_3, accessed through LOC / LOC_0.
+CLzmaDec_Asm_Loc struct
+        OLD_RSP PTR_FIELD
+        lzmaPtr PTR_FIELD
+        _pad0_  PTR_FIELD
+        _pad1_  PTR_FIELD
+        _pad2_  PTR_FIELD
+        dicBufSize      PTR_FIELD
+        probs_Spec      PTR_FIELD
+        dic_Spec        PTR_FIELD
+
+        limit           PTR_FIELD
+        bufLimit        PTR_FIELD
+        lc2     dd ?
+        lpMask  dd ?
+        pbMask  dd ?
+        checkDicSize    dd ?
+
+        _pad_   dd ?
+        remainLen       dd ?
+        dicPos_Spec     PTR_FIELD
+        rep0    dd ?
+        rep1    dd ?
+        rep2    dd ?
+        rep3    dd ?
+CLzmaDec_Asm_Loc ends
+
+
+; Field-access prefixes; the field name is appended at the point of use.
+;   GLOB_2 : decoder object through sym_R (entry code)
+;   GLOB   : decoder object through r1    (exit code)
+;   LOC_0  : local frame through r0       (before RSP is switched)
+;   LOC    : local frame through RSP
+GLOB_2 equ [sym_R].CLzmaDec_Asm.
+GLOB equ [r1].CLzmaDec_Asm.
+LOC_0 equ [r0].CLzmaDec_Asm_Loc.
+LOC equ [RSP].CLzmaDec_Asm_Loc.
+
+
+; COPY_VAR: copies field "name" from the decoder object (GLOB_2) to the
+; local frame (LOC_0) through t0.
+COPY_VAR macro name
+        mov     t0, GLOB_2 name
+        mov     LOC_0 name, t0
+endm
+
+
+; RESTORE_VAR: copies field "name" from the local frame (LOC) back to the
+; decoder object (GLOB) through t0.
+RESTORE_VAR macro name
+        mov     t0, LOC name
+        mov     GLOB name, t0
+endm
+
+
+
+; IsMatchBranch_Pre: prepares the operands of the IsMatch test:
+;   pbPos         = (processedPos & LOC pbMask) << (kLenNumLowBits + 1 + PSHIFT)
+;   probs_state_R = probs + state_R
+; The "reg" parameter is not referenced in the body.
+IsMatchBranch_Pre macro reg
+        ; prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;
+        mov     pbPos, LOC pbMask
+        and     pbPos, processedPos
+        shl     pbPos, (kLenNumLowBits + 1 + PSHIFT)
+        lea     probs_state_R, [probs + 1 * state_R]
+endm
+
+
+; IsMatchBranch: IsMatchBranch_Pre followed by the IsMatch bit test.
+; The "reg" parameter is not referenced in the body.
+IsMatchBranch macro reg
+        IsMatchBranch_Pre
+        IF_BIT_1 probs_state_R, pbPos_R, IsMatch, IsMatch_label
+endm
+
+
+; CheckLimits: leaves through fin_OK when buf has reached LOC bufLimit or
+; dicPos has reached LOC limit. The "reg" parameter is not referenced.
+CheckLimits macro reg
+        cmp     buf, LOC bufLimit
+        jae     fin_OK
+        cmp     dicPos, LOC limit
+        jae     fin_OK
+endm
+
+
+
+; RSP is (16x + 8) bytes aligned in WIN64-x64
+; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8)
+
+PARAM_lzma equ REG_ABI_PARAM_0
+PARAM_limit equ REG_ABI_PARAM_1
+PARAM_bufLimit equ REG_ABI_PARAM_2
+
+; LzmaDec_DecodeReal_3
+;   PARAM_lzma     : pointer to the decoder object (CLzmaDec_Asm layout)
+;   PARAM_limit    : dictionary limit; dic is added to it before it is stored
+;   PARAM_bufLimit : input limit, compared against buf by CheckLimits
+; The result is returned in x0: 0 when leaving through fin_OK, 1 from the
+; fin_ERROR_MATCH_DIST path.
+; MY_PROC / MY_PUSH_PRESERVED_ABI_REGS / REG_ABI_PARAM_* are not defined in
+; this file (presumably they come from 7zAsm.asm - confirm there).
+; MY_ALIGN_64
+MY_PROC LzmaDec_DecodeReal_3, 3
+MY_PUSH_PRESERVED_ABI_REGS
+
+        ; carve the local frame below RSP, aligned down to 128 bytes,
+        ; and remember the previous RSP inside it
+        lea     r0, [RSP - (SIZEOF CLzmaDec_Asm_Loc)]
+        and     r0, -128
+        mov     r5, RSP
+        mov     RSP, r0
+        ; NOTE(review): the struct field is declared as OLD_RSP; the Old_RSP
+        ; spelling relies on case-insensitive symbol lookup - confirm for
+        ; case-sensitive assembler settings.
+        mov     LOC_0 Old_RSP, r5
+        mov     LOC_0 lzmaPtr, PARAM_lzma
+
+        mov     LOC_0 remainLen, 0 ; remainLen must be ZERO
+
+        mov     LOC_0 bufLimit, PARAM_bufLimit
+        mov     sym_R, PARAM_lzma ; CLzmaDec_Asm pointer for GLOB_2
+        mov     dic, GLOB_2 dic_Spec
+        add     PARAM_limit, dic
+        mov     LOC_0 limit, PARAM_limit
+
+        COPY_VAR(rep0)
+        COPY_VAR(rep1)
+        COPY_VAR(rep2)
+        COPY_VAR(rep3)
+
+        ; dicPos is kept as an absolute address (dic + dicPos_Spec)
+        mov     dicPos, GLOB_2 dicPos_Spec
+        add     dicPos, dic
+        mov     LOC_0 dicPos_Spec, dicPos
+        mov     LOC_0 dic_Spec, dic
+
+        mov     x1_L, GLOB_2 pb
+        mov     t0, 1
+        shl     t0, x1_L
+        dec     t0
+        mov     LOC_0 pbMask, t0
+
+        ; unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1;
+        ; unsigned lc = p->prop.lc;
+        ; unsigned lpMask = ((unsigned)0x100 << p->prop.lp) - ((unsigned)0x100 >> lc);
+
+        ; lc2 = lc + PSHIFT is the shift count used by LIT_PROBS
+        mov     x1_L, GLOB_2 lc
+        mov     x2, 100h
+        mov     t0, x2
+        shr     x2, x1_L
+        ; inc x1
+        add     x1_L, PSHIFT
+        mov     LOC_0 lc2, x1
+        mov     x1_L, GLOB_2 lp
+        shl     t0, x1_L
+        sub     t0, x2
+        mov     LOC_0 lpMask, t0
+        mov     lpMask_reg, t0
+
+        ; mov probs, GLOB_2 probs_Spec
+        ; add probs, kStartOffset SHL PSHIFT
+        mov     probs, GLOB_2 probs_1664
+        mov     LOC_0 probs_Spec, probs
+
+        mov     t0_R, GLOB_2 dicBufSize
+        mov     LOC_0 dicBufSize, t0_R
+
+        mov     x1, GLOB_2 checkDicSize
+        mov     LOC_0 checkDicSize, x1
+
+        mov     processedPos, GLOB_2 processedPos_Spec
+
+        ; state is kept multiplied by PMULT inside the function
+        ; (shifted back by PSHIFT at fin)
+        mov     state, GLOB_2 state_Spec
+        shl     state, PSHIFT
+
+        mov     buf, GLOB_2 buf_Spec
+        mov     range, GLOB_2 range_Spec
+        mov     cod, GLOB_2 code_Spec
+        mov     kBitModelTotal_reg, kBitModelTotal
+        xor     sym, sym
+
+        ; if (processedPos != 0 || checkDicSize != 0)
+        ; then sym = previous byte: dic[(dicPos == dic ? dicBufSize : dicPos) - 1]
+        or      x1, processedPos
+        jz      @f
+
+        add     t0_R, dic
+        cmp     dicPos, dic
+        cmovnz  t0_R, dicPos
+        movzx   sym, byte ptr[t0_R - 1]
+
+@@:
+        ; choose the entry point from the current state
+        IsMatchBranch_Pre
+        cmp     state, 4 * PMULT
+        jb      lit_end
+        cmp     state, kNumLitStates * PMULT
+        jb      lit_matched_end
+        jmp     lz_end
+
+
+
+
+; ---------- LITERAL ----------
+MY_ALIGN_64
+lit_start:
+        xor     state, state
+lit_start_2:
+        LIT_PROBS lpMask_reg
+
+  ifdef _LZMA_SIZE_OPT
+
+        PLOAD   x1, probs + 1 * PMULT
+        mov     sym, 1
+MY_ALIGN_16
+lit_loop:
+        BIT_1   x1, x2
+        mov     x1, x2
+        cmp     sym, 127
+        jbe     lit_loop
+
+  else
+
+        ; unrolled: x1 and x2 alternate as prob / probNext
+        BIT_0   x1, x2
+        BIT_1   x2, x1
+        BIT_1   x1, x2
+        BIT_1   x2, x1
+        BIT_1   x1, x2
+        BIT_1   x2, x1
+        BIT_1   x1, x2
+
+  endif
+
+        BIT_2   x2, 256 - 1
+
+        ; mov dic, LOC dic_Spec
+        mov     probs, LOC probs_Spec
+        IsMatchBranch_Pre
+        mov     byte ptr[dicPos], sym_L
+        inc     dicPos
+
+        CheckLimits
+lit_end:
+        IF_BIT_0_NOUP probs_state_R, pbPos_R, IsMatch, lit_start
+
+        ; jmp IsMatch_label
+
+; ---------- MATCHES ----------
+; MY_ALIGN_32
+IsMatch_label:
+        UPDATE_1 probs_state_R, pbPos_R, IsMatch
+        IF_BIT_1 probs_state_R, 0, IsRep, IsRep_label
+
+        add     probs, LenCoder * PMULT
+        add     state, kNumStates * PMULT
+
+; ---------- LEN DECODE ----------
+; len_temp is passed to BIT_2 as symSub at len_mid_2.
+len_decode:
+        mov     len_temp, 8 - 1 - kMatchMinLen
+        IF_BIT_0_NOUP probs, 0, 0, len_mid_0
+        UPDATE_1 probs, 0, 0
+        add     probs, (1 SHL (kLenNumLowBits + PSHIFT))
+        mov     len_temp, -1 - kMatchMinLen
+        IF_BIT_0_NOUP probs, 0, 0, len_mid_0
+        UPDATE_1 probs, 0, 0
+        add     probs, LenHigh * PMULT - (1 SHL (kLenNumLowBits + PSHIFT))
+        mov     sym, 1
+        PLOAD   x1, probs + 1 * PMULT
+
+MY_ALIGN_32
+len8_loop:
+        BIT_1   x1, x2
+        mov     x1, x2
+        cmp     sym, 64
+        jb      len8_loop
+
+        mov     len_temp, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen
+        jmp     short len_mid_2 ; we use short here for MASM that doesn't optimize that code as another assembler programs
+
+MY_ALIGN_32
+len_mid_0:
+        UPDATE_0 probs, 0, 0
+        add     probs, pbPos_R
+        BIT_0   x2, x1
+len_mid_2:
+        BIT_1   x1, x2
+        BIT_2   x2, len_temp
+        mov     probs, LOC probs_Spec
+        cmp     state, kNumStates * PMULT
+        jb      copy_match
+
+
+; ---------- DECODE DISTANCE ----------
+        ; probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);
+
+        mov     t0, 3 + kMatchMinLen
+        cmp     sym, 3 + kMatchMinLen
+        cmovb   t0, sym
+        add     probs, PosSlot * PMULT - (kMatchMinLen SHL (kNumPosSlotBits + PSHIFT))
+        shl     t0, (kNumPosSlotBits + PSHIFT)
+        add     probs, t0_R
+
+        ; sym = Len
+        ; mov LOC remainLen, sym
+        mov     len_temp, sym
+
+  ifdef _LZMA_SIZE_OPT
+
+        PLOAD   x1, probs + 1 * PMULT
+        mov     sym, 1
+MY_ALIGN_16
+slot_loop:
+        BIT_1   x1, x2
+        mov     x1, x2
+        cmp     sym, 32
+        jb      slot_loop
+
+  else
+
+        BIT_0   x1, x2
+        BIT_1   x2, x1
+        BIT_1   x1, x2
+        BIT_1   x2, x1
+        BIT_1   x1, x2
+
+  endif
+
+        mov     x1, sym
+        BIT_2   x2, 64-1
+
+        and     sym, 3
+        mov     probs, LOC probs_Spec
+        cmp     x1, 32 + kEndPosModelIndex / 2
+        jb      short_dist
+
+        ; unsigned numDirectBits = (unsigned)(((distance >> 1) - 1));
+        sub     x1, (32 + 1 + kNumAlignBits)
+        ; distance = (2 | (distance & 1));
+        or      sym, 2
+        ; preload the first probability and the next-probability pointer for REV_0
+        PLOAD   x2, probs + 1 * PMULT
+        shl     sym, kNumAlignBits + 1
+        lea     sym2_R, [probs + 2 * PMULT]
+
+        jmp     direct_norm
+        ; lea t1, [sym_R + (1 SHL kNumAlignBits)]
+        ; cmp range, kTopValue
+        ; jb direct_norm
+
+; ---------- DIRECT DISTANCE ----------
+; One bit per iteration with range halved each time; x1 counts the remaining
+; bits. The update is done with cmovs / cmovns on the sign of (cod - range).
+MY_ALIGN_32
+direct_loop:
+        shr     range, 1
+        mov     t0, cod
+        sub     cod, range
+        cmovs   cod, t0
+        cmovns  sym, t1
+
+  comment ~
+        sub     cod, range
+        mov     x2, cod
+        sar     x2, 31
+        lea     sym, dword ptr [r2 + sym_R * 2 + 1]
+        and     x2, range
+        add     cod, x2
+  ~
+        dec     x1
+        je      direct_end
+
+        add     sym, sym
+direct_norm:
+        lea     t1, [sym_R + (1 SHL kNumAlignBits)]
+        cmp     range, kTopValue
+        jae     near ptr direct_loop
+        ; we align for 32 here with "near ptr" command above
+        NORM_2
+        jmp     direct_loop
+
+MY_ALIGN_32
+direct_end:
+        ; prob = + kAlign;
+        ; distance <<= kNumAlignBits;
+        REV_0   x2, x1
+        REV_1   x1, x2, 2
+        REV_1   x2, x1, 4
+        REV_2   x1, 8
+
+decode_dist_end:
+
+        ; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize))
+
+        ; the old rep0..rep2 are loaded first: they are needed both on the
+        ; normal path and in fin_ERROR_MATCH_DIST
+        mov     t1, LOC rep0
+        mov     x1, LOC rep1
+        mov     x2, LOC rep2
+
+        mov     t0, LOC checkDicSize
+        test    t0, t0
+        cmove   t0, processedPos
+        cmp     sym, t0
+        jae     end_of_payload
+        ; jmp end_of_payload ; for debug
+
+        ; rep3 = rep2;
+        ; rep2 = rep1;
+        ; rep1 = rep0;
+        ; rep0 = distance + 1;
+
+        inc     sym
+        mov     LOC rep0, sym
+        ; mov sym, LOC remainLen
+        mov     sym, len_temp
+        mov     LOC rep1, t1
+        mov     LOC rep2, x1
+        mov     LOC rep3, x2
+
+        ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
+        cmp     state, (kNumStates + kNumLitStates) * PMULT
+        mov     state, kNumLitStates * PMULT
+        mov     t0, (kNumLitStates + 3) * PMULT
+        cmovae  state, t0
+
+
+; ---------- COPY MATCH ----------
+copy_match:
+
+        ; len += kMatchMinLen;
+        ; add sym, kMatchMinLen
+
+        ; if ((rem = limit - dicPos) == 0)
+        ; {
+        ;   p->dicPos = dicPos;
+        ;   return SZ_ERROR_DATA;
+        ; }
+        mov     cnt_R, LOC limit
+        sub     cnt_R, dicPos
+        jz      fin_dicPos_LIMIT
+
+        ; curLen = ((rem < len) ? (unsigned)rem : len);
+        cmp     cnt_R, sym_R
+        ; cmovae cnt_R, sym_R ; 64-bit
+        cmovae  cnt, sym ; 32-bit
+
+        mov     dic, LOC dic_Spec
+        mov     x1, LOC rep0
+
+        mov     t0_R, dicPos
+        add     dicPos, cnt_R
+        ; processedPos += curLen;
+        add     processedPos, cnt
+        ; len -= curLen;
+        sub     sym, cnt
+        mov     LOC remainLen, sym
+
+        sub     t0_R, dic
+
+        ; pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0);
+        sub     t0_R, r1
+        jae     @f
+
+        ; the source position wrapped: take the slow path when the copy
+        ; length exceeds the distance from pos to dicBufSize
+        mov     r1, LOC dicBufSize
+        add     t0_R, r1
+        sub     r1, t0_R
+        cmp     cnt_R, r1
+        ja      copy_match_cross
+@@:
+        ; if (curLen <= dicBufSize - pos)
+
+; ---------- COPY MATCH FAST ----------
+        ; Byte *dest = dic + dicPos;
+        ; mov r1, dic
+        ; ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos;
+        ; sub t0_R, dicPos
+        ; dicPos += curLen;
+
+        ; const Byte *lim = dest + curLen;
+        add     t0_R, dic
+        movzx   sym, byte ptr[t0_R]
+        add     t0_R, cnt_R
+        neg     cnt_R
+        ; lea r1, [dicPos - 1]
+copy_common:
+        dec     dicPos
+        ; cmp LOC rep0, 1
+        ; je rep0Label
+
+        ; t0_R - src_lim
+        ; r1 - dest_lim - 1
+        ; cnt_R - (-cnt)
+
+        IsMatchBranch_Pre
+        inc     cnt_R
+        jz      copy_end
+MY_ALIGN_16
+@@:
+        ; cnt_R is negative and counts up to zero; the byte loaded in the
+        ; previous step is stored while the next one is loaded
+        mov     byte ptr[cnt_R * 1 + dicPos], sym_L
+        movzx   sym, byte ptr[cnt_R * 1 + t0_R]
+        inc     cnt_R
+        jnz     @b
+
+copy_end:
+lz_end_match:
+        mov     byte ptr[dicPos], sym_L
+        inc     dicPos
+
+        ; IsMatchBranch_Pre
+        CheckLimits
+lz_end:
+        IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label
+
+
+
+; ---------- LITERAL MATCHED ----------
+
+        LIT_PROBS LOC lpMask
+
+        ; matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
+        mov     x1, LOC rep0
+        ; mov dic, LOC dic_Spec
+        ; dicPos is saved because its register is reused below
+        ; (bit and dicPos share x14 / r14)
+        mov     LOC dicPos_Spec, dicPos
+
+        ; state -= (state < 10) ? 3 : 6;
+        lea     t0, [state_R - 6 * PMULT]
+        sub     state, 3 * PMULT
+        cmp     state, 7 * PMULT
+        cmovae  state, t0
+
+        sub     dicPos, dic
+        sub     dicPos, r1
+        jae     @f
+        add     dicPos, LOC dicBufSize
+@@:
+  comment ~
+        xor     t0, t0
+        sub     dicPos, r1
+        cmovb   t0_R, LOC dicBufSize
+  ~
+
+        movzx   match, byte ptr[dic + dicPos * 1]
+
+  ifdef _LZMA_SIZE_OPT
+
+        mov     offs, 256 * PMULT
+        shl     match, (PSHIFT + 1)
+        mov     bit, match
+        mov     sym, 1
+MY_ALIGN_16
+litm_loop:
+        LITM
+        cmp     sym, 256
+        jb      litm_loop
+        sub     sym, 256
+
+  else
+
+        LITM_0
+        LITM
+        LITM
+        LITM
+        LITM
+        LITM
+        LITM
+        LITM_2
+
+  endif
+
+        mov     probs, LOC probs_Spec
+        IsMatchBranch_Pre
+        ; mov dic, LOC dic_Spec
+        mov     dicPos, LOC dicPos_Spec
+        mov     byte ptr[dicPos], sym_L
+        inc     dicPos
+
+        CheckLimits
+lit_matched_end:
+        IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label
+        ; IsMatchBranch
+        mov     lpMask_reg, LOC lpMask
+        sub     state, 3 * PMULT
+        jmp     lit_start_2
+
+
+
+; ---------- REP 0 LITERAL ----------
+MY_ALIGN_32
+IsRep0Short_label:
+        UPDATE_0 probs_state_R, pbPos_R, IsRep0Long
+
+        ; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
+        mov     dic, LOC dic_Spec
+        mov     t0_R, dicPos
+        mov     probBranch, LOC rep0
+        sub     t0_R, dic
+
+        sub     probs, RepLenCoder * PMULT
+
+        ; state = state < kNumLitStates ? 9 : 11;
+        or      state, 1 * PMULT
+
+        ; the caller doesn't allow (dicPos >= limit) case for REP_SHORT
+        ; so we don't need the following (dicPos == limit) check here:
+        ; cmp dicPos, LOC limit
+        ; jae fin_dicPos_LIMIT_REP_SHORT
+
+        inc     processedPos
+
+        IsMatchBranch_Pre
+
+; xor sym, sym
+; sub t0_R, probBranch_R
+; cmovb sym_R, LOC dicBufSize
+; add t0_R, sym_R
+        sub     t0_R, probBranch_R
+        jae     @f
+        add     t0_R, LOC dicBufSize
+@@:
+        movzx   sym, byte ptr[dic + t0_R * 1]
+        jmp     lz_end_match
+
+
+MY_ALIGN_32
+IsRep_label:
+        UPDATE_1 probs_state_R, 0, IsRep
+
+        ; The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode.
+        ; So we don't check it here.
+
+        ; mov t0, processedPos
+        ; or t0, LOC checkDicSize
+        ; jz fin_ERROR_2
+
+        ; state = state < kNumLitStates ? 8 : 11;
+        cmp     state, kNumLitStates * PMULT
+        mov     state, 8 * PMULT
+        mov     probBranch, 11 * PMULT
+        cmovae  state, probBranch
+
+        ; prob = probs + RepLenCoder;
+        add     probs, RepLenCoder * PMULT
+
+        IF_BIT_1 probs_state_R, 0, IsRepG0, IsRepG0_label
+        IF_BIT_0_NOUP probs_state_R, pbPos_R, IsRep0Long, IsRep0Short_label
+        UPDATE_1 probs_state_R, pbPos_R, IsRep0Long
+        jmp     len_decode
+
+; IsRepG0 / IsRepG1 / IsRepG2: rotate the selected rep distance into rep0
+; through dist and dist2, then decode the length.
+MY_ALIGN_32
+IsRepG0_label:
+        UPDATE_1 probs_state_R, 0, IsRepG0
+        mov     dist2, LOC rep0
+        mov     dist, LOC rep1
+        mov     LOC rep1, dist2
+
+        IF_BIT_1 probs_state_R, 0, IsRepG1, IsRepG1_label
+        mov     LOC rep0, dist
+        jmp     len_decode
+
+; MY_ALIGN_32
+IsRepG1_label:
+        UPDATE_1 probs_state_R, 0, IsRepG1
+        mov     dist2, LOC rep2
+        mov     LOC rep2, dist
+
+        IF_BIT_1 probs_state_R, 0, IsRepG2, IsRepG2_label
+        mov     LOC rep0, dist2
+        jmp     len_decode
+
+; MY_ALIGN_32
+IsRepG2_label:
+        UPDATE_1 probs_state_R, 0, IsRepG2
+        mov     dist, LOC rep3
+        mov     LOC rep3, dist2
+        mov     LOC rep0, dist
+        jmp     len_decode
+
+
+
+; ---------- SPEC SHORT DISTANCE ----------
+
+MY_ALIGN_32
+short_dist:
+        sub     x1, 32 + 1
+        jbe     decode_dist_end
+        or      sym, 2
+        shl     sym, x1_L
+        ; from here sym_R is a probability address, not an index;
+        ; it is converted back after spec_loop
+        lea     sym_R, [probs + sym_R * PMULT + SpecPos * PMULT + 1 * PMULT]
+        mov     sym2, PMULT ; step
+MY_ALIGN_32
+spec_loop:
+        REV_1_VAR x2
+        dec     x1
+        jnz     spec_loop
+
+        mov     probs, LOC probs_Spec
+        sub     sym, sym2
+        sub     sym, SpecPos * PMULT
+        sub     sym_R, probs
+        shr     sym, PSHIFT
+
+        jmp     decode_dist_end
+
+
+; ---------- COPY MATCH CROSS ----------
+; Copy path taken when the source run reaches the end of the dictionary
+; buffer: bytes are copied until the source position equals dicBufSize, then
+; the copy continues from dic[0] through copy_common.
+copy_match_cross:
+        ; t0_R - src pos
+        ; r1 - len to dicBufSize
+        ; cnt_R - total copy len
+
+        mov     t1_R, t0_R ; srcPos
+        mov     t0_R, dic
+        mov     r1, LOC dicBufSize ;
+        neg     cnt_R
+@@:
+        movzx   sym, byte ptr[t1_R * 1 + t0_R]
+        inc     t1_R
+        mov     byte ptr[cnt_R * 1 + dicPos], sym_L
+        inc     cnt_R
+        cmp     t1_R, r1
+        jne     @b
+
+        movzx   sym, byte ptr[t0_R]
+        sub     t0_R, cnt_R
+        jmp     copy_common
+
+
+
+
+; fin_dicPos_LIMIT_REP_SHORT:
+        ; mov sym, 1
+
+; Reached from copy_match when dicPos == limit: the pending length in sym is
+; stored in remainLen and the function leaves through fin_OK.
+fin_dicPos_LIMIT:
+        mov     LOC remainLen, sym
+        jmp     fin_OK
+        ; For more strict mode we can stop decoding with error
+        ; mov sym, 1
+        ; jmp fin
+
+
+; Reached from end_of_payload when the rejected distance is not the
+; all-ones marker. Stores the reps and len_temp + kMatchSpecLen_Error_Data
+; in remainLen, updates state and returns 1.
+fin_ERROR_MATCH_DIST:
+
+        ; rep3 = rep2;
+        ; rep2 = rep1;
+        ; rep1 = rep0;
+        ; rep0 = distance + 1;
+
+        add     len_temp, kMatchSpecLen_Error_Data
+        mov     LOC remainLen, len_temp
+
+        mov     LOC rep0, sym
+        mov     LOC rep1, t1
+        mov     LOC rep2, x1
+        mov     LOC rep3, x2
+
+        ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
+        cmp     state, (kNumStates + kNumLitStates) * PMULT
+        mov     state, kNumLitStates * PMULT
+        mov     t0, (kNumLitStates + 3) * PMULT
+        cmovae  state, t0
+
+        ; jmp fin_OK
+        mov     sym, 1
+        jmp     fin
+
+; Reached when the decoded distance fails the range check at decode_dist_end.
+; "inc sym" wraps to zero only for distance 0FFFFFFFFh; any other value goes
+; to fin_ERROR_MATCH_DIST.
+end_of_payload:
+        inc     sym
+        jnz     fin_ERROR_MATCH_DIST
+
+        mov     LOC remainLen, kMatchSpecLenStart
+        sub     state, kNumStates * PMULT
+
+fin_OK:
+        xor     sym, sym
+
+; Common exit: normalize once more, write the working registers and locals
+; back to the decoder object, return sym in x0 and restore the saved RSP.
+fin:
+        NORM
+
+        mov     r1, LOC lzmaPtr
+
+        ; dicPos goes back as an offset relative to dic
+        sub     dicPos, LOC dic_Spec
+        mov     GLOB dicPos_Spec, dicPos
+        mov     GLOB buf_Spec, buf
+        mov     GLOB range_Spec, range
+        mov     GLOB code_Spec, cod
+        shr     state, PSHIFT
+        mov     GLOB state_Spec, state
+        mov     GLOB processedPos_Spec, processedPos
+
+        RESTORE_VAR(remainLen)
+        RESTORE_VAR(rep0)
+        RESTORE_VAR(rep1)
+        RESTORE_VAR(rep2)
+        RESTORE_VAR(rep3)
+
+        mov     x0, sym
+
+        mov     RSP, LOC Old_RSP
+
+MY_POP_PRESERVED_ABI_REGS
+MY_ENDP
+
+_TEXT$LZMADECOPT ENDS
+
+end