X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;ds=sidebyside;f=deps%2Flibchdr%2Fdeps%2Flzma-22.01%2Fsrc%2FAsm%2Fx86%2FLzmaDecOpt.asm;fp=deps%2Flibchdr%2Fdeps%2Flzma-22.01%2Fsrc%2FAsm%2Fx86%2FLzmaDecOpt.asm;h=7e08acc6397435183750dcb39849c4c6037e8b64;hb=9e052883388b2b607a488e48b3cb6db52a0997dd;hp=0000000000000000000000000000000000000000;hpb=7e49a25d527a1d79569f39a6f2d8d28a0d6cebd5;p=pcsx_rearmed.git diff --git a/deps/libchdr/deps/lzma-22.01/src/Asm/x86/LzmaDecOpt.asm b/deps/libchdr/deps/lzma-22.01/src/Asm/x86/LzmaDecOpt.asm new file mode 100644 index 00000000..7e08acc6 --- /dev/null +++ b/deps/libchdr/deps/lzma-22.01/src/Asm/x86/LzmaDecOpt.asm @@ -0,0 +1,1303 @@ +; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function +; 2021-02-23: Igor Pavlov : Public domain +; +; 3 - is the code compatibility version of LzmaDec_DecodeReal_*() +; function for check at link time. +; That code is tightly coupled with LzmaDec_TryDummy() +; and with another functions in LzmaDec.c file. +; CLzmaDec structure, (probs) array layout, input and output of +; LzmaDec_DecodeReal_*() must be equal in both versions (C / ASM). + +ifndef x64 +; x64=1 +; .err +endif + +include 7zAsm.asm + +MY_ASM_START + +_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE' + +MY_ALIGN macro num:req + align num +endm + +MY_ALIGN_16 macro + MY_ALIGN 16 +endm + +MY_ALIGN_32 macro + MY_ALIGN 32 +endm + +MY_ALIGN_64 macro + MY_ALIGN 64 +endm + + +; _LZMA_SIZE_OPT equ 1 + +; _LZMA_PROB32 equ 1 + +ifdef _LZMA_PROB32 + PSHIFT equ 2 + PLOAD macro dest, mem + mov dest, dword ptr [mem] + endm + PSTORE macro src, mem + mov dword ptr [mem], src + endm +else + PSHIFT equ 1 + PLOAD macro dest, mem + movzx dest, word ptr [mem] + endm + PSTORE macro src, mem + mov word ptr [mem], @CatStr(src, _W) + endm +endif + +PMULT equ (1 SHL PSHIFT) +PMULT_HALF equ (1 SHL (PSHIFT - 1)) +PMULT_2 equ (1 SHL (PSHIFT + 1)) + +kMatchSpecLen_Error_Data equ (1 SHL 9) + +; x0 range +; x1 pbPos / (prob) TREE +; x2 probBranch / prm (MATCHED) / pbPos / cnt +; x3 sym +;====== r4 === RSP +; x5 cod +; x6 t1 NORM_CALC / probs_state / dist +; x7 t0 NORM_CALC / prob2 IF_BIT_1 +; x8 state +; x9 match (MATCHED) / sym2 / dist2 / lpMask_reg +; x10 kBitModelTotal_reg +; r11 probs +; x12 offs (MATCHED) / dic / len_temp +; x13 processedPos +; x14 bit (MATCHED) / dicPos +; r15 buf + + +cod equ x5 +cod_L equ x5_L +range equ x0 +state equ x8 +state_R equ r8 +buf equ r15 +processedPos equ x13 +kBitModelTotal_reg equ x10 + +probBranch equ x2 +probBranch_R equ r2 +probBranch_W equ x2_W + +pbPos equ x1 +pbPos_R equ r1 + +cnt equ x2 +cnt_R equ r2 + +lpMask_reg equ x9 +dicPos equ r14 + +sym equ x3 +sym_R equ r3 +sym_L equ x3_L + +probs equ r11 +dic equ r12 + +t0 equ x7 +t0_W equ x7_W +t0_R equ r7 + +prob2 equ t0 +prob2_W equ t0_W + +t1 equ x6 +t1_R equ r6 + +probs_state equ t1 +probs_state_R equ t1_R + +prm equ r2 +match equ x9 +match_R equ r9 +offs equ x12 +offs_R equ r12 +bit equ x14 +bit_R equ r14 + +sym2 equ x9 +sym2_R equ r9 + +len_temp equ x12 + +dist equ sym +dist2 equ x9 + + + +kNumBitModelTotalBits equ 11 +kBitModelTotal equ (1 SHL kNumBitModelTotalBits) +kNumMoveBits equ 5 +kBitModelOffset equ ((1 SHL kNumMoveBits) - 1) +kTopValue equ (1 SHL 24) + +NORM_2 macro + ; movzx t0, BYTE PTR [buf] + shl cod, 8 + mov cod_L, BYTE PTR [buf] + shl range, 8 + ; or cod, t0 + inc buf +endm + + +NORM macro + cmp range, kTopValue + jae SHORT @F + NORM_2 +@@: +endm + + +; ---------- Branch MACROS ---------- + +UPDATE_0 macro probsArray:req, probOffset:req, probDisp:req + mov prob2, kBitModelTotal_reg + sub prob2, probBranch + shr prob2, kNumMoveBits + add probBranch, prob2 + PSTORE probBranch, probOffset * 1 + probsArray + probDisp * PMULT +endm + + +UPDATE_1 macro probsArray:req, probOffset:req, probDisp:req + sub prob2, range + sub cod, range + mov range, prob2 + mov prob2, probBranch + shr probBranch, kNumMoveBits + sub prob2, probBranch + PSTORE prob2, probOffset * 1 + probsArray + probDisp * PMULT +endm + + +CMP_COD macro probsArray:req, probOffset:req, probDisp:req + PLOAD probBranch, probOffset * 1 + probsArray + probDisp * PMULT + NORM + mov prob2, range + shr range, kNumBitModelTotalBits + imul range, probBranch + cmp cod, range +endm + + +IF_BIT_1_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req + CMP_COD probsArray, probOffset, probDisp + jae toLabel +endm + + +IF_BIT_1 macro probsArray:req, probOffset:req, probDisp:req, toLabel:req + IF_BIT_1_NOUP probsArray, probOffset, probDisp, toLabel + UPDATE_0 probsArray, probOffset, probDisp +endm + + +IF_BIT_0_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req + CMP_COD probsArray, probOffset, probDisp + jb toLabel +endm + + +; ---------- CMOV MACROS ---------- + +NORM_CALC macro prob:req + NORM + mov t0, range + shr range, kNumBitModelTotalBits + imul range, prob + sub t0, range + mov t1, cod + sub cod, range +endm + + +PUP macro prob:req, probPtr:req + sub t0, prob + ; only sar works for both 16/32 bit prob modes + sar t0, kNumMoveBits + add t0, prob + PSTORE t0, probPtr +endm + + +PUP_SUB macro prob:req, probPtr:req, symSub:req + sbb sym, symSub + PUP prob, probPtr +endm + + +PUP_COD macro prob:req, probPtr:req, symSub:req + mov t0, kBitModelOffset + cmovb cod, t1 + mov t1, sym + cmovb t0, kBitModelTotal_reg + PUP_SUB prob, probPtr, symSub +endm + + +BIT_0 macro prob:req, probNext:req + PLOAD prob, probs + 1 * PMULT + PLOAD probNext, probs + 1 * PMULT_2 + + NORM_CALC prob + + cmovae range, t0 + PLOAD t0, probs + 1 * PMULT_2 + PMULT + cmovae probNext, t0 + mov t0, kBitModelOffset + cmovb cod, t1 + cmovb t0, kBitModelTotal_reg + mov sym, 2 + PUP_SUB prob, probs + 1 * PMULT, 0 - 1 +endm + + +BIT_1 macro prob:req, probNext:req + PLOAD probNext, probs + sym_R * PMULT_2 + add sym, sym + + NORM_CALC prob + + cmovae range, t0 + PLOAD t0, probs + sym_R * PMULT + PMULT + cmovae probNext, t0 + PUP_COD prob, probs + t1_R * PMULT_HALF, 0 - 1 +endm + + +BIT_2 macro prob:req, symSub:req + add sym, sym + + NORM_CALC prob + + cmovae range, t0 + PUP_COD prob, probs + t1_R * PMULT_HALF, symSub +endm + + +; ---------- MATCHED LITERAL ---------- + +LITM_0 macro + mov offs, 256 * PMULT + shl match, (PSHIFT + 1) + mov bit, offs + and bit, match + PLOAD x1, probs + 256 * PMULT + bit_R * 1 + 1 * PMULT + lea prm, [probs + 256 * PMULT + bit_R * 1 + 1 * PMULT] + ; lea prm, [probs + 256 * PMULT + 1 * PMULT] + ; add prm, bit_R + xor offs, bit + add match, match + + NORM_CALC x1 + + cmovae offs, bit + mov bit, match + cmovae range, t0 + mov t0, kBitModelOffset + cmovb cod, t1 + cmovb t0, kBitModelTotal_reg + mov sym, 0 + PUP_SUB x1, prm, -2-1 +endm + + +LITM macro + and bit, offs + lea prm, [probs + offs_R * 1] + add prm, bit_R + PLOAD x1, prm + sym_R * PMULT + xor offs, bit + add sym, sym + add match, match + + NORM_CALC x1 + + cmovae offs, bit + mov bit, match + cmovae range, t0 + PUP_COD x1, prm + t1_R * PMULT_HALF, - 1 +endm + + +LITM_2 macro + and bit, offs + lea prm, [probs + offs_R * 1] + add prm, bit_R + PLOAD x1, prm + sym_R * PMULT + add sym, sym + + NORM_CALC x1 + + cmovae range, t0 + PUP_COD x1, prm + t1_R * PMULT_HALF, 256 - 1 +endm + + +; ---------- REVERSE BITS ---------- + +REV_0 macro prob:req, probNext:req + ; PLOAD prob, probs + 1 * PMULT + ; lea sym2_R, [probs + 2 * PMULT] + ; PLOAD probNext, probs + 2 * PMULT + PLOAD probNext, sym2_R + + NORM_CALC prob + + cmovae range, t0 + PLOAD t0, probs + 3 * PMULT + cmovae probNext, t0 + cmovb cod, t1 + mov t0, kBitModelOffset + cmovb t0, kBitModelTotal_reg + lea t1_R, [probs + 3 * PMULT] + cmovae sym2_R, t1_R + PUP prob, probs + 1 * PMULT +endm + + +REV_1 macro prob:req, probNext:req, step:req + add sym2_R, step * PMULT + PLOAD probNext, sym2_R + + NORM_CALC prob + + cmovae range, t0 + PLOAD t0, sym2_R + step * PMULT + cmovae probNext, t0 + cmovb cod, t1 + mov t0, kBitModelOffset + cmovb t0, kBitModelTotal_reg + lea t1_R, [sym2_R + step * PMULT] + cmovae sym2_R, t1_R + PUP prob, t1_R - step * PMULT_2 +endm + + +REV_2 macro prob:req, step:req + sub sym2_R, probs + shr sym2, PSHIFT + or sym, sym2 + + NORM_CALC prob + + cmovae range, t0 + lea t0, [sym - step] + cmovb sym, t0 + cmovb cod, t1 + mov t0, kBitModelOffset + cmovb t0, kBitModelTotal_reg + PUP prob, probs + sym2_R * PMULT +endm + + +REV_1_VAR macro prob:req + PLOAD prob, sym_R + mov probs, sym_R + add sym_R, sym2_R + + NORM_CALC prob + + cmovae range, t0 + lea t0_R, [sym_R + 1 * sym2_R] + cmovae sym_R, t0_R + mov t0, kBitModelOffset + cmovb cod, t1 + ; mov t1, kBitModelTotal + ; cmovb t0, t1 + cmovb t0, kBitModelTotal_reg + add sym2, sym2 + PUP prob, probs +endm + + + + +LIT_PROBS macro lpMaskParam:req + ; prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc); + mov t0, processedPos + shl t0, 8 + add sym, t0 + and sym, lpMaskParam + add probs_state_R, pbPos_R + mov x1, LOC lc2 + lea sym, dword ptr[sym_R + 2 * sym_R] + add probs, Literal * PMULT + shl sym, x1_L + add probs, sym_R + UPDATE_0 probs_state_R, 0, IsMatch + inc processedPos +endm + + + +kNumPosBitsMax equ 4 +kNumPosStatesMax equ (1 SHL kNumPosBitsMax) + +kLenNumLowBits equ 3 +kLenNumLowSymbols equ (1 SHL kLenNumLowBits) +kLenNumHighBits equ 8 +kLenNumHighSymbols equ (1 SHL kLenNumHighBits) +kNumLenProbs equ (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols) + +LenLow equ 0 +LenChoice equ LenLow +LenChoice2 equ (LenLow + kLenNumLowSymbols) +LenHigh equ (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax) + +kNumStates equ 12 +kNumStates2 equ 16 +kNumLitStates equ 7 + +kStartPosModelIndex equ 4 +kEndPosModelIndex equ 14 +kNumFullDistances equ (1 SHL (kEndPosModelIndex SHR 1)) + +kNumPosSlotBits equ 6 +kNumLenToPosStates equ 4 + +kNumAlignBits equ 4 +kAlignTableSize equ (1 SHL kNumAlignBits) + +kMatchMinLen equ 2 +kMatchSpecLenStart equ (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols) + +kStartOffset equ 1664 +SpecPos equ (-kStartOffset) +IsRep0Long equ (SpecPos + kNumFullDistances) +RepLenCoder equ (IsRep0Long + (kNumStates2 SHL kNumPosBitsMax)) +LenCoder equ (RepLenCoder + kNumLenProbs) +IsMatch equ (LenCoder + kNumLenProbs) +kAlign equ (IsMatch + (kNumStates2 SHL kNumPosBitsMax)) +IsRep equ (kAlign + kAlignTableSize) +IsRepG0 equ (IsRep + kNumStates) +IsRepG1 equ (IsRepG0 + kNumStates) +IsRepG2 equ (IsRepG1 + kNumStates) +PosSlot equ (IsRepG2 + kNumStates) +Literal equ (PosSlot + (kNumLenToPosStates SHL kNumPosSlotBits)) +NUM_BASE_PROBS equ (Literal + kStartOffset) + +if kAlign ne 0 + .err +endif + +if NUM_BASE_PROBS ne 1984 + .err +endif + + +PTR_FIELD equ dq ? + +CLzmaDec_Asm struct + lc db ? + lp db ? + pb db ? + _pad_ db ? + dicSize dd ? + + probs_Spec PTR_FIELD + probs_1664 PTR_FIELD + dic_Spec PTR_FIELD + dicBufSize PTR_FIELD + dicPos_Spec PTR_FIELD + buf_Spec PTR_FIELD + + range_Spec dd ? + code_Spec dd ? + processedPos_Spec dd ? + checkDicSize dd ? + rep0 dd ? + rep1 dd ? + rep2 dd ? + rep3 dd ? + state_Spec dd ? + remainLen dd ? +CLzmaDec_Asm ends + + +CLzmaDec_Asm_Loc struct + OLD_RSP PTR_FIELD + lzmaPtr PTR_FIELD + _pad0_ PTR_FIELD + _pad1_ PTR_FIELD + _pad2_ PTR_FIELD + dicBufSize PTR_FIELD + probs_Spec PTR_FIELD + dic_Spec PTR_FIELD + + limit PTR_FIELD + bufLimit PTR_FIELD + lc2 dd ? + lpMask dd ? + pbMask dd ? + checkDicSize dd ? + + _pad_ dd ? + remainLen dd ? + dicPos_Spec PTR_FIELD + rep0 dd ? + rep1 dd ? + rep2 dd ? + rep3 dd ? +CLzmaDec_Asm_Loc ends + + +GLOB_2 equ [sym_R].CLzmaDec_Asm. +GLOB equ [r1].CLzmaDec_Asm. +LOC_0 equ [r0].CLzmaDec_Asm_Loc. +LOC equ [RSP].CLzmaDec_Asm_Loc. + + +COPY_VAR macro name + mov t0, GLOB_2 name + mov LOC_0 name, t0 +endm + + +RESTORE_VAR macro name + mov t0, LOC name + mov GLOB name, t0 +endm + + + +IsMatchBranch_Pre macro reg + ; prob = probs + IsMatch + (state << kNumPosBitsMax) + posState; + mov pbPos, LOC pbMask + and pbPos, processedPos + shl pbPos, (kLenNumLowBits + 1 + PSHIFT) + lea probs_state_R, [probs + 1 * state_R] +endm + + +IsMatchBranch macro reg + IsMatchBranch_Pre + IF_BIT_1 probs_state_R, pbPos_R, IsMatch, IsMatch_label +endm + + +CheckLimits macro reg + cmp buf, LOC bufLimit + jae fin_OK + cmp dicPos, LOC limit + jae fin_OK +endm + + + +; RSP is (16x + 8) bytes aligned in WIN64-x64 +; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8) + +PARAM_lzma equ REG_ABI_PARAM_0 +PARAM_limit equ REG_ABI_PARAM_1 +PARAM_bufLimit equ REG_ABI_PARAM_2 + +; MY_ALIGN_64 +MY_PROC LzmaDec_DecodeReal_3, 3 +MY_PUSH_PRESERVED_ABI_REGS + + lea r0, [RSP - (SIZEOF CLzmaDec_Asm_Loc)] + and r0, -128 + mov r5, RSP + mov RSP, r0 + mov LOC_0 Old_RSP, r5 + mov LOC_0 lzmaPtr, PARAM_lzma + + mov LOC_0 remainLen, 0 ; remainLen must be ZERO + + mov LOC_0 bufLimit, PARAM_bufLimit + mov sym_R, PARAM_lzma ; CLzmaDec_Asm_Loc pointer for GLOB_2 + mov dic, GLOB_2 dic_Spec + add PARAM_limit, dic + mov LOC_0 limit, PARAM_limit + + COPY_VAR(rep0) + COPY_VAR(rep1) + COPY_VAR(rep2) + COPY_VAR(rep3) + + mov dicPos, GLOB_2 dicPos_Spec + add dicPos, dic + mov LOC_0 dicPos_Spec, dicPos + mov LOC_0 dic_Spec, dic + + mov x1_L, GLOB_2 pb + mov t0, 1 + shl t0, x1_L + dec t0 + mov LOC_0 pbMask, t0 + + ; unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1; + ; unsigned lc = p->prop.lc; + ; unsigned lpMask = ((unsigned)0x100 << p->prop.lp) - ((unsigned)0x100 >> lc); + + mov x1_L, GLOB_2 lc + mov x2, 100h + mov t0, x2 + shr x2, x1_L + ; inc x1 + add x1_L, PSHIFT + mov LOC_0 lc2, x1 + mov x1_L, GLOB_2 lp + shl t0, x1_L + sub t0, x2 + mov LOC_0 lpMask, t0 + mov lpMask_reg, t0 + + ; mov probs, GLOB_2 probs_Spec + ; add probs, kStartOffset SHL PSHIFT + mov probs, GLOB_2 probs_1664 + mov LOC_0 probs_Spec, probs + + mov t0_R, GLOB_2 dicBufSize + mov LOC_0 dicBufSize, t0_R + + mov x1, GLOB_2 checkDicSize + mov LOC_0 checkDicSize, x1 + + mov processedPos, GLOB_2 processedPos_Spec + + mov state, GLOB_2 state_Spec + shl state, PSHIFT + + mov buf, GLOB_2 buf_Spec + mov range, GLOB_2 range_Spec + mov cod, GLOB_2 code_Spec + mov kBitModelTotal_reg, kBitModelTotal + xor sym, sym + + ; if (processedPos != 0 || checkDicSize != 0) + or x1, processedPos + jz @f + + add t0_R, dic + cmp dicPos, dic + cmovnz t0_R, dicPos + movzx sym, byte ptr[t0_R - 1] + +@@: + IsMatchBranch_Pre + cmp state, 4 * PMULT + jb lit_end + cmp state, kNumLitStates * PMULT + jb lit_matched_end + jmp lz_end + + + + +; ---------- LITERAL ---------- +MY_ALIGN_64 +lit_start: + xor state, state +lit_start_2: + LIT_PROBS lpMask_reg + + ifdef _LZMA_SIZE_OPT + + PLOAD x1, probs + 1 * PMULT + mov sym, 1 +MY_ALIGN_16 +lit_loop: + BIT_1 x1, x2 + mov x1, x2 + cmp sym, 127 + jbe lit_loop + + else + + BIT_0 x1, x2 + BIT_1 x2, x1 + BIT_1 x1, x2 + BIT_1 x2, x1 + BIT_1 x1, x2 + BIT_1 x2, x1 + BIT_1 x1, x2 + + endif + + BIT_2 x2, 256 - 1 + + ; mov dic, LOC dic_Spec + mov probs, LOC probs_Spec + IsMatchBranch_Pre + mov byte ptr[dicPos], sym_L + inc dicPos + + CheckLimits +lit_end: + IF_BIT_0_NOUP probs_state_R, pbPos_R, IsMatch, lit_start + + ; jmp IsMatch_label + +; ---------- MATCHES ---------- +; MY_ALIGN_32 +IsMatch_label: + UPDATE_1 probs_state_R, pbPos_R, IsMatch + IF_BIT_1 probs_state_R, 0, IsRep, IsRep_label + + add probs, LenCoder * PMULT + add state, kNumStates * PMULT + +; ---------- LEN DECODE ---------- +len_decode: + mov len_temp, 8 - 1 - kMatchMinLen + IF_BIT_0_NOUP probs, 0, 0, len_mid_0 + UPDATE_1 probs, 0, 0 + add probs, (1 SHL (kLenNumLowBits + PSHIFT)) + mov len_temp, -1 - kMatchMinLen + IF_BIT_0_NOUP probs, 0, 0, len_mid_0 + UPDATE_1 probs, 0, 0 + add probs, LenHigh * PMULT - (1 SHL (kLenNumLowBits + PSHIFT)) + mov sym, 1 + PLOAD x1, probs + 1 * PMULT + +MY_ALIGN_32 +len8_loop: + BIT_1 x1, x2 + mov x1, x2 + cmp sym, 64 + jb len8_loop + + mov len_temp, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen + jmp short len_mid_2 ; we use short here for MASM that doesn't optimize that code as another assembler programs + +MY_ALIGN_32 +len_mid_0: + UPDATE_0 probs, 0, 0 + add probs, pbPos_R + BIT_0 x2, x1 +len_mid_2: + BIT_1 x1, x2 + BIT_2 x2, len_temp + mov probs, LOC probs_Spec + cmp state, kNumStates * PMULT + jb copy_match + + +; ---------- DECODE DISTANCE ---------- + ; probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits); + + mov t0, 3 + kMatchMinLen + cmp sym, 3 + kMatchMinLen + cmovb t0, sym + add probs, PosSlot * PMULT - (kMatchMinLen SHL (kNumPosSlotBits + PSHIFT)) + shl t0, (kNumPosSlotBits + PSHIFT) + add probs, t0_R + + ; sym = Len + ; mov LOC remainLen, sym + mov len_temp, sym + + ifdef _LZMA_SIZE_OPT + + PLOAD x1, probs + 1 * PMULT + mov sym, 1 +MY_ALIGN_16 +slot_loop: + BIT_1 x1, x2 + mov x1, x2 + cmp sym, 32 + jb slot_loop + + else + + BIT_0 x1, x2 + BIT_1 x2, x1 + BIT_1 x1, x2 + BIT_1 x2, x1 + BIT_1 x1, x2 + + endif + + mov x1, sym + BIT_2 x2, 64-1 + + and sym, 3 + mov probs, LOC probs_Spec + cmp x1, 32 + kEndPosModelIndex / 2 + jb short_dist + + ; unsigned numDirectBits = (unsigned)(((distance >> 1) - 1)); + sub x1, (32 + 1 + kNumAlignBits) + ; distance = (2 | (distance & 1)); + or sym, 2 + PLOAD x2, probs + 1 * PMULT + shl sym, kNumAlignBits + 1 + lea sym2_R, [probs + 2 * PMULT] + + jmp direct_norm + ; lea t1, [sym_R + (1 SHL kNumAlignBits)] + ; cmp range, kTopValue + ; jb direct_norm + +; ---------- DIRECT DISTANCE ---------- +MY_ALIGN_32 +direct_loop: + shr range, 1 + mov t0, cod + sub cod, range + cmovs cod, t0 + cmovns sym, t1 + + comment ~ + sub cod, range + mov x2, cod + sar x2, 31 + lea sym, dword ptr [r2 + sym_R * 2 + 1] + and x2, range + add cod, x2 + ~ + dec x1 + je direct_end + + add sym, sym +direct_norm: + lea t1, [sym_R + (1 SHL kNumAlignBits)] + cmp range, kTopValue + jae near ptr direct_loop + ; we align for 32 here with "near ptr" command above + NORM_2 + jmp direct_loop + +MY_ALIGN_32 +direct_end: + ; prob = + kAlign; + ; distance <<= kNumAlignBits; + REV_0 x2, x1 + REV_1 x1, x2, 2 + REV_1 x2, x1, 4 + REV_2 x1, 8 + +decode_dist_end: + + ; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize)) + + mov t1, LOC rep0 + mov x1, LOC rep1 + mov x2, LOC rep2 + + mov t0, LOC checkDicSize + test t0, t0 + cmove t0, processedPos + cmp sym, t0 + jae end_of_payload + ; jmp end_of_payload ; for debug + + ; rep3 = rep2; + ; rep2 = rep1; + ; rep1 = rep0; + ; rep0 = distance + 1; + + inc sym + mov LOC rep0, sym + ; mov sym, LOC remainLen + mov sym, len_temp + mov LOC rep1, t1 + mov LOC rep2, x1 + mov LOC rep3, x2 + + ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3; + cmp state, (kNumStates + kNumLitStates) * PMULT + mov state, kNumLitStates * PMULT + mov t0, (kNumLitStates + 3) * PMULT + cmovae state, t0 + + +; ---------- COPY MATCH ---------- +copy_match: + + ; len += kMatchMinLen; + ; add sym, kMatchMinLen + + ; if ((rem = limit - dicPos) == 0) + ; { + ; p->dicPos = dicPos; + ; return SZ_ERROR_DATA; + ; } + mov cnt_R, LOC limit + sub cnt_R, dicPos + jz fin_dicPos_LIMIT + + ; curLen = ((rem < len) ? (unsigned)rem : len); + cmp cnt_R, sym_R + ; cmovae cnt_R, sym_R ; 64-bit + cmovae cnt, sym ; 32-bit + + mov dic, LOC dic_Spec + mov x1, LOC rep0 + + mov t0_R, dicPos + add dicPos, cnt_R + ; processedPos += curLen; + add processedPos, cnt + ; len -= curLen; + sub sym, cnt + mov LOC remainLen, sym + + sub t0_R, dic + + ; pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0); + sub t0_R, r1 + jae @f + + mov r1, LOC dicBufSize + add t0_R, r1 + sub r1, t0_R + cmp cnt_R, r1 + ja copy_match_cross +@@: + ; if (curLen <= dicBufSize - pos) + +; ---------- COPY MATCH FAST ---------- + ; Byte *dest = dic + dicPos; + ; mov r1, dic + ; ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos; + ; sub t0_R, dicPos + ; dicPos += curLen; + + ; const Byte *lim = dest + curLen; + add t0_R, dic + movzx sym, byte ptr[t0_R] + add t0_R, cnt_R + neg cnt_R + ; lea r1, [dicPos - 1] +copy_common: + dec dicPos + ; cmp LOC rep0, 1 + ; je rep0Label + + ; t0_R - src_lim + ; r1 - dest_lim - 1 + ; cnt_R - (-cnt) + + IsMatchBranch_Pre + inc cnt_R + jz copy_end +MY_ALIGN_16 +@@: + mov byte ptr[cnt_R * 1 + dicPos], sym_L + movzx sym, byte ptr[cnt_R * 1 + t0_R] + inc cnt_R + jnz @b + +copy_end: +lz_end_match: + mov byte ptr[dicPos], sym_L + inc dicPos + + ; IsMatchBranch_Pre + CheckLimits +lz_end: + IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label + + + +; ---------- LITERAL MATCHED ---------- + + LIT_PROBS LOC lpMask + + ; matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; + mov x1, LOC rep0 + ; mov dic, LOC dic_Spec + mov LOC dicPos_Spec, dicPos + + ; state -= (state < 10) ? 3 : 6; + lea t0, [state_R - 6 * PMULT] + sub state, 3 * PMULT + cmp state, 7 * PMULT + cmovae state, t0 + + sub dicPos, dic + sub dicPos, r1 + jae @f + add dicPos, LOC dicBufSize +@@: + comment ~ + xor t0, t0 + sub dicPos, r1 + cmovb t0_R, LOC dicBufSize + ~ + + movzx match, byte ptr[dic + dicPos * 1] + + ifdef _LZMA_SIZE_OPT + + mov offs, 256 * PMULT + shl match, (PSHIFT + 1) + mov bit, match + mov sym, 1 +MY_ALIGN_16 +litm_loop: + LITM + cmp sym, 256 + jb litm_loop + sub sym, 256 + + else + + LITM_0 + LITM + LITM + LITM + LITM + LITM + LITM + LITM_2 + + endif + + mov probs, LOC probs_Spec + IsMatchBranch_Pre + ; mov dic, LOC dic_Spec + mov dicPos, LOC dicPos_Spec + mov byte ptr[dicPos], sym_L + inc dicPos + + CheckLimits +lit_matched_end: + IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label + ; IsMatchBranch + mov lpMask_reg, LOC lpMask + sub state, 3 * PMULT + jmp lit_start_2 + + + +; ---------- REP 0 LITERAL ---------- +MY_ALIGN_32 +IsRep0Short_label: + UPDATE_0 probs_state_R, pbPos_R, IsRep0Long + + ; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; + mov dic, LOC dic_Spec + mov t0_R, dicPos + mov probBranch, LOC rep0 + sub t0_R, dic + + sub probs, RepLenCoder * PMULT + + ; state = state < kNumLitStates ? 9 : 11; + or state, 1 * PMULT + + ; the caller doesn't allow (dicPos >= limit) case for REP_SHORT + ; so we don't need the following (dicPos == limit) check here: + ; cmp dicPos, LOC limit + ; jae fin_dicPos_LIMIT_REP_SHORT + + inc processedPos + + IsMatchBranch_Pre + +; xor sym, sym +; sub t0_R, probBranch_R +; cmovb sym_R, LOC dicBufSize +; add t0_R, sym_R + sub t0_R, probBranch_R + jae @f + add t0_R, LOC dicBufSize +@@: + movzx sym, byte ptr[dic + t0_R * 1] + jmp lz_end_match + + +MY_ALIGN_32 +IsRep_label: + UPDATE_1 probs_state_R, 0, IsRep + + ; The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode. + ; So we don't check it here. + + ; mov t0, processedPos + ; or t0, LOC checkDicSize + ; jz fin_ERROR_2 + + ; state = state < kNumLitStates ? 8 : 11; + cmp state, kNumLitStates * PMULT + mov state, 8 * PMULT + mov probBranch, 11 * PMULT + cmovae state, probBranch + + ; prob = probs + RepLenCoder; + add probs, RepLenCoder * PMULT + + IF_BIT_1 probs_state_R, 0, IsRepG0, IsRepG0_label + IF_BIT_0_NOUP probs_state_R, pbPos_R, IsRep0Long, IsRep0Short_label + UPDATE_1 probs_state_R, pbPos_R, IsRep0Long + jmp len_decode + +MY_ALIGN_32 +IsRepG0_label: + UPDATE_1 probs_state_R, 0, IsRepG0 + mov dist2, LOC rep0 + mov dist, LOC rep1 + mov LOC rep1, dist2 + + IF_BIT_1 probs_state_R, 0, IsRepG1, IsRepG1_label + mov LOC rep0, dist + jmp len_decode + +; MY_ALIGN_32 +IsRepG1_label: + UPDATE_1 probs_state_R, 0, IsRepG1 + mov dist2, LOC rep2 + mov LOC rep2, dist + + IF_BIT_1 probs_state_R, 0, IsRepG2, IsRepG2_label + mov LOC rep0, dist2 + jmp len_decode + +; MY_ALIGN_32 +IsRepG2_label: + UPDATE_1 probs_state_R, 0, IsRepG2 + mov dist, LOC rep3 + mov LOC rep3, dist2 + mov LOC rep0, dist + jmp len_decode + + + +; ---------- SPEC SHORT DISTANCE ---------- + +MY_ALIGN_32 +short_dist: + sub x1, 32 + 1 + jbe decode_dist_end + or sym, 2 + shl sym, x1_L + lea sym_R, [probs + sym_R * PMULT + SpecPos * PMULT + 1 * PMULT] + mov sym2, PMULT ; step +MY_ALIGN_32 +spec_loop: + REV_1_VAR x2 + dec x1 + jnz spec_loop + + mov probs, LOC probs_Spec + sub sym, sym2 + sub sym, SpecPos * PMULT + sub sym_R, probs + shr sym, PSHIFT + + jmp decode_dist_end + + +; ---------- COPY MATCH CROSS ---------- +copy_match_cross: + ; t0_R - src pos + ; r1 - len to dicBufSize + ; cnt_R - total copy len + + mov t1_R, t0_R ; srcPos + mov t0_R, dic + mov r1, LOC dicBufSize ; + neg cnt_R +@@: + movzx sym, byte ptr[t1_R * 1 + t0_R] + inc t1_R + mov byte ptr[cnt_R * 1 + dicPos], sym_L + inc cnt_R + cmp t1_R, r1 + jne @b + + movzx sym, byte ptr[t0_R] + sub t0_R, cnt_R + jmp copy_common + + + + +; fin_dicPos_LIMIT_REP_SHORT: + ; mov sym, 1 + +fin_dicPos_LIMIT: + mov LOC remainLen, sym + jmp fin_OK + ; For more strict mode we can stop decoding with error + ; mov sym, 1 + ; jmp fin + + +fin_ERROR_MATCH_DIST: + + ; rep3 = rep2; + ; rep2 = rep1; + ; rep1 = rep0; + ; rep0 = distance + 1; + + add len_temp, kMatchSpecLen_Error_Data + mov LOC remainLen, len_temp + + mov LOC rep0, sym + mov LOC rep1, t1 + mov LOC rep2, x1 + mov LOC rep3, x2 + + ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3; + cmp state, (kNumStates + kNumLitStates) * PMULT + mov state, kNumLitStates * PMULT + mov t0, (kNumLitStates + 3) * PMULT + cmovae state, t0 + + ; jmp fin_OK + mov sym, 1 + jmp fin + +end_of_payload: + inc sym + jnz fin_ERROR_MATCH_DIST + + mov LOC remainLen, kMatchSpecLenStart + sub state, kNumStates * PMULT + +fin_OK: + xor sym, sym + +fin: + NORM + + mov r1, LOC lzmaPtr + + sub dicPos, LOC dic_Spec + mov GLOB dicPos_Spec, dicPos + mov GLOB buf_Spec, buf + mov GLOB range_Spec, range + mov GLOB code_Spec, cod + shr state, PSHIFT + mov GLOB state_Spec, state + mov GLOB processedPos_Spec, processedPos + + RESTORE_VAR(remainLen) + RESTORE_VAR(rep0) + RESTORE_VAR(rep1) + RESTORE_VAR(rep2) + RESTORE_VAR(rep3) + + mov x0, sym + + mov RSP, LOC Old_RSP + +MY_POP_PRESERVED_ABI_REGS +MY_ENDP + +_TEXT$LZMADECOPT ENDS + +end