X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=deps%2Flibchdr%2Fdeps%2Flzma-22.01%2Fsrc%2FAsm%2Farm64%2FLzmaDecOpt.S;fp=deps%2Flibchdr%2Fdeps%2Flzma-22.01%2Fsrc%2FAsm%2Farm64%2FLzmaDecOpt.S;h=10dc47354867b9bec953783e2db87151e48aeed1;hb=9e052883388b2b607a488e48b3cb6db52a0997dd;hp=0000000000000000000000000000000000000000;hpb=7e49a25d527a1d79569f39a6f2d8d28a0d6cebd5;p=pcsx_rearmed.git diff --git a/deps/libchdr/deps/lzma-22.01/src/Asm/arm64/LzmaDecOpt.S b/deps/libchdr/deps/lzma-22.01/src/Asm/arm64/LzmaDecOpt.S new file mode 100644 index 00000000..10dc4735 --- /dev/null +++ b/deps/libchdr/deps/lzma-22.01/src/Asm/arm64/LzmaDecOpt.S @@ -0,0 +1,1487 @@ +// LzmaDecOpt.S -- ARM64-ASM version of LzmaDec_DecodeReal_3() function +// 2021-04-25 : Igor Pavlov : Public domain + +/* +; 3 - is the code compatibility version of LzmaDec_DecodeReal_*() +; function for check at link time. +; That code is tightly coupled with LzmaDec_TryDummy() +; and with another functions in LzmaDec.c file. +; CLzmaDec structure, (probs) array layout, input and output of +; LzmaDec_DecodeReal_*() must be equal in both versions (C / ASM). +*/ + + +#include "7zAsm.S" + + // .arch armv8-a + // .file "LzmaDecOpt.c" + .text + .align 2 + .p2align 4,,15 +#ifdef __APPLE__ + .globl _LzmaDec_DecodeReal_3 +#else + .global LzmaDec_DecodeReal_3 +#endif + // .type LzmaDec_DecodeReal_3, %function + +// #define _LZMA_SIZE_OPT 1 + +#define LZMA_USE_4BYTES_FILL 1 +// #define LZMA_USE_2BYTES_COPY 1 +// #define LZMA_USE_CMOV_LZ_WRAP 1 +// #define _LZMA_PROB32 1 + +#define MY_ALIGN_FOR_ENTRY MY_ALIGN_32 +#define MY_ALIGN_FOR_LOOP MY_ALIGN_32 +#define MY_ALIGN_FOR_LOOP_16 MY_ALIGN_16 + +#ifdef _LZMA_PROB32 + .equ PSHIFT , 2 + .macro PLOAD dest:req, mem:req + ldr \dest, [\mem] + .endm + .macro PLOAD_PREINDEXED dest:req, mem:req, offset:req + ldr \dest, [\mem, \offset]! + .endm + .macro PLOAD_2 dest:req, mem1:req, mem2:req + ldr \dest, [\mem1, \mem2] + .endm + .macro PLOAD_LSL dest:req, mem1:req, mem2:req + ldr \dest, [\mem1, \mem2, lsl #PSHIFT] + .endm + .macro PSTORE src:req, mem:req + str \src, [\mem] + .endm + .macro PSTORE_2 src:req, mem1:req, mem2:req + str \src, [\mem1, \mem2] + .endm + .macro PSTORE_LSL src:req, mem1:req, mem2:req + str \src, [\mem1, \mem2, lsl #PSHIFT] + .endm + .macro PSTORE_LSL_M1 src:req, mem1:req, mem2:req, temp_reg:req + // you must check that temp_reg is free register when macro is used + add \temp_reg, \mem1, \mem2 + str \src, [\temp_reg, \mem2] + .endm +#else + // .equ PSHIFT , 1 + #define PSHIFT 1 + .macro PLOAD dest:req, mem:req + ldrh \dest, [\mem] + .endm + .macro PLOAD_PREINDEXED dest:req, mem:req, offset:req + ldrh \dest, [\mem, \offset]! 
+ .endm + .macro PLOAD_2 dest:req, mem1:req, mem2:req + ldrh \dest, [\mem1, \mem2] + .endm + .macro PLOAD_LSL dest:req, mem1:req, mem2:req + ldrh \dest, [\mem1, \mem2, lsl #PSHIFT] + .endm + .macro PSTORE src:req, mem:req + strh \src, [\mem] + .endm + .macro PSTORE_2 src:req, mem1:req, mem2:req + strh \src, [\mem1, \mem2] + .endm + .macro PSTORE_LSL src:req, mem1:req, mem2:req + strh \src, [\mem1, \mem2, lsl #PSHIFT] + .endm + .macro PSTORE_LSL_M1 src:req, mem1:req, mem2:req, temp_reg:req + strh \src, [\mem1, \mem2] + .endm +#endif + +.equ PMULT , (1 << PSHIFT) +.equ PMULT_2 , (2 << PSHIFT) + +.equ kMatchSpecLen_Error_Data , (1 << 9) + +# x7 t0 : NORM_CALC : prob2 (IF_BIT_1) +# x6 t1 : NORM_CALC : probs_state +# x8 t2 : (LITM) temp : (TREE) temp +# x4 t3 : (LITM) bit : (TREE) temp : UPDATE_0/UPDATE_0 temp +# x10 t4 : (LITM) offs : (TREE) probs_PMULT : numBits +# x9 t5 : (LITM) match : sym2 (ShortDist) +# x1 t6 : (LITM) litm_prob : (TREE) prob_reg : pbPos +# x2 t7 : (LITM) prm : probBranch : cnt +# x3 sym : dist +# x12 len +# x0 range +# x5 cod + + +#define range w0 + +// t6 +#define pbPos w1 +#define pbPos_R r1 +#define prob_reg w1 +#define litm_prob prob_reg + +// t7 +#define probBranch w2 +#define cnt w2 +#define cnt_R r2 +#define prm r2 + +#define sym w3 +#define sym_R r3 +#define dist sym + +#define t3 w4 +#define bit w4 +#define bit_R r4 +#define update_temp_reg r4 + +#define cod w5 + +#define t1 w6 +#define t1_R r6 +#define probs_state t1_R + +#define t0 w7 +#define t0_R r7 +#define prob2 t0 + +#define t2 w8 +#define t2_R r8 + +// t5 +#define match w9 +#define sym2 w9 +#define sym2_R r9 + +#define t4 w10 +#define t4_R r10 + +#define offs w10 +#define offs_R r10 + +#define probs r11 + +#define len w12 +#define len_R x12 + +#define state w13 +#define state_R r13 + +#define dicPos r14 +#define buf r15 +#define bufLimit r16 +#define dicBufSize r17 + +#define limit r19 +#define rep0 w20 +#define rep0_R r20 +#define rep1 w21 +#define rep2 w22 +#define rep3 w23 +#define dic r24 +#define probs_IsMatch r25 +#define probs_Spec r26 +#define checkDicSize w27 +#define processedPos w28 +#define pbMask w29 +#define lc2_lpMask w30 + + +.equ kNumBitModelTotalBits , 11 +.equ kBitModelTotal , (1 << kNumBitModelTotalBits) +.equ kNumMoveBits , 5 +.equ kBitModelOffset , (kBitModelTotal - (1 << kNumMoveBits) + 1) + +.macro NORM_2 macro + ldrb t0, [buf], 1 + shl range, 8 + orr cod, t0, cod, lsl 8 + /* + mov t0, cod + ldrb cod, [buf], 1 + shl range, 8 + bfi cod, t0, #8, #24 + */ +.endm + +.macro TEST_HIGH_BYTE_range macro + tst range, 0xFF000000 +.endm + +.macro NORM macro + TEST_HIGH_BYTE_range + jnz 1f + NORM_2 +1: +.endm + + +# ---------- Branch MACROS ---------- + +.macro UPDATE_0__0 + sub prob2, probBranch, kBitModelOffset +.endm + +.macro UPDATE_0__1 + sub probBranch, probBranch, prob2, asr #(kNumMoveBits) +.endm + +.macro UPDATE_0__2 probsArray:req, probOffset:req, probDisp:req + .if \probDisp == 0 + PSTORE_2 probBranch, \probsArray, \probOffset + .elseif \probOffset == 0 + PSTORE_2 probBranch, \probsArray, \probDisp * PMULT + .else + .error "unsupported" + // add update_temp_reg, \probsArray, \probOffset + PSTORE_2 probBranch, update_temp_reg, \probDisp * PMULT + .endif +.endm + +.macro UPDATE_0 probsArray:req, probOffset:req, probDisp:req + UPDATE_0__0 + UPDATE_0__1 + UPDATE_0__2 \probsArray, \probOffset, \probDisp +.endm + + +.macro UPDATE_1 probsArray:req, probOffset:req, probDisp:req + // sub cod, cod, prob2 + // sub range, range, prob2 + p2_sub cod, range + sub range, prob2, range + sub prob2, 
probBranch, probBranch, lsr #(kNumMoveBits) + .if \probDisp == 0 + PSTORE_2 prob2, \probsArray, \probOffset + .elseif \probOffset == 0 + PSTORE_2 prob2, \probsArray, \probDisp * PMULT + .else + .error "unsupported" + // add update_temp_reg, \probsArray, \probOffset + PSTORE_2 prob2, update_temp_reg, \probDisp * PMULT + .endif +.endm + + +.macro CMP_COD_BASE + NORM + // lsr prob2, range, kNumBitModelTotalBits + // imul prob2, probBranch + // cmp cod, prob2 + mov prob2, range + shr range, kNumBitModelTotalBits + imul range, probBranch + cmp cod, range +.endm + +.macro CMP_COD_1 probsArray:req + PLOAD probBranch, \probsArray + CMP_COD_BASE +.endm + +.macro CMP_COD_3 probsArray:req, probOffset:req, probDisp:req + .if \probDisp == 0 + PLOAD_2 probBranch, \probsArray, \probOffset + .elseif \probOffset == 0 + PLOAD_2 probBranch, \probsArray, \probDisp * PMULT + .else + .error "unsupported" + add update_temp_reg, \probsArray, \probOffset + PLOAD_2 probBranch, update_temp_reg, \probDisp * PMULT + .endif + CMP_COD_BASE +.endm + + +.macro IF_BIT_1_NOUP probsArray:req, probOffset:req, probDisp:req, toLabel:req + CMP_COD_3 \probsArray, \probOffset, \probDisp + jae \toLabel +.endm + + +.macro IF_BIT_1 probsArray:req, probOffset:req, probDisp:req, toLabel:req + IF_BIT_1_NOUP \probsArray, \probOffset, \probDisp, \toLabel + UPDATE_0 \probsArray, \probOffset, \probDisp +.endm + + +.macro IF_BIT_0_NOUP probsArray:req, probOffset:req, probDisp:req, toLabel:req + CMP_COD_3 \probsArray, \probOffset, \probDisp + jb \toLabel +.endm + +.macro IF_BIT_0_NOUP_1 probsArray:req, toLabel:req + CMP_COD_1 \probsArray + jb \toLabel +.endm + + +# ---------- CMOV MACROS ---------- + +.macro NORM_LSR + NORM + lsr t0, range, #kNumBitModelTotalBits +.endm + +.macro COD_RANGE_SUB + subs t1, cod, t0 + p2_sub range, t0 +.endm + +.macro RANGE_IMUL prob:req + imul t0, \prob +.endm + +.macro NORM_CALC prob:req + NORM_LSR + RANGE_IMUL \prob + COD_RANGE_SUB +.endm + +.macro CMOV_range + cmovb range, t0 +.endm + +.macro CMOV_code + cmovae cod, t1 +.endm + +.macro CMOV_code_Model_Pre prob:req + sub t0, \prob, kBitModelOffset + CMOV_code + cmovae t0, \prob +.endm + + +.macro PUP_BASE_2 prob:req, dest_reg:req + # only sar works for both 16/32 bit prob modes + sub \dest_reg, \prob, \dest_reg, asr #(kNumMoveBits) +.endm + +.macro PUP prob:req, probPtr:req, mem2:req + PUP_BASE_2 \prob, t0 + PSTORE_2 t0, \probPtr, \mem2 +.endm + + + +#define probs_PMULT t4_R + +.macro BIT_01 + add probs_PMULT, probs, PMULT +.endm + + +.macro BIT_0_R prob:req + PLOAD_2 \prob, probs, 1 * PMULT + NORM_LSR + sub t3, \prob, kBitModelOffset + RANGE_IMUL \prob + PLOAD_2 t2, probs, 1 * PMULT_2 + COD_RANGE_SUB + CMOV_range + cmovae t3, \prob + PLOAD_2 t0, probs, 1 * PMULT_2 + PMULT + PUP_BASE_2 \prob, t3 + csel \prob, t2, t0, lo + CMOV_code + mov sym, 2 + PSTORE_2 t3, probs, 1 * PMULT + adc sym, sym, wzr + BIT_01 +.endm + +.macro BIT_1_R prob:req + NORM_LSR + p2_add sym, sym + sub t3, \prob, kBitModelOffset + RANGE_IMUL \prob + PLOAD_LSL t2, probs, sym_R + COD_RANGE_SUB + CMOV_range + cmovae t3, \prob + PLOAD_LSL t0, probs_PMULT, sym_R + PUP_BASE_2 \prob, t3 + csel \prob, t2, t0, lo + CMOV_code + PSTORE_LSL_M1 t3, probs, sym_R, t2_R + adc sym, sym, wzr +.endm + + +.macro BIT_2_R prob:req + NORM_LSR + p2_add sym, sym + sub t3, \prob, kBitModelOffset + RANGE_IMUL \prob + COD_RANGE_SUB + CMOV_range + cmovae t3, \prob + CMOV_code + PUP_BASE_2 \prob, t3 + PSTORE_LSL_M1 t3, probs, sym_R, t2_R + adc sym, sym, wzr +.endm + + +# ---------- MATCHED LITERAL ---------- + +.macro 
LITM_0 macro + shl match, (PSHIFT + 1) + and bit, match, 256 * PMULT + add prm, probs, 256 * PMULT + 1 * PMULT + p2_add match, match + p2_add prm, bit_R + eor offs, bit, 256 * PMULT + PLOAD litm_prob, prm + + NORM_LSR + sub t2, litm_prob, kBitModelOffset + RANGE_IMUL litm_prob + COD_RANGE_SUB + cmovae offs, bit + CMOV_range + and bit, match, offs + cmovae t2, litm_prob + CMOV_code + mov sym, 2 + PUP_BASE_2 litm_prob, t2 + PSTORE t2, prm + add prm, probs, offs_R + adc sym, sym, wzr +.endm + +.macro LITM macro + p2_add prm, bit_R + xor offs, bit + PLOAD_LSL litm_prob, prm, sym_R + + NORM_LSR + p2_add match, match + sub t2, litm_prob, kBitModelOffset + RANGE_IMUL litm_prob + COD_RANGE_SUB + cmovae offs, bit + CMOV_range + and bit, match, offs + cmovae t2, litm_prob + CMOV_code + PUP_BASE_2 litm_prob, t2 + PSTORE_LSL t2, prm, sym_R + add prm, probs, offs_R + adc sym, sym, sym +.endm + + +.macro LITM_2 macro + p2_add prm, bit_R + PLOAD_LSL litm_prob, prm, sym_R + + NORM_LSR + sub t2, litm_prob, kBitModelOffset + RANGE_IMUL litm_prob + COD_RANGE_SUB + CMOV_range + cmovae t2, litm_prob + CMOV_code + PUP_BASE_2 litm_prob, t2 + PSTORE_LSL t2, prm, sym_R + adc sym, sym, sym +.endm + + +# ---------- REVERSE BITS ---------- + +.macro REV_0 prob:req + NORM_CALC \prob + CMOV_range + PLOAD t2, sym2_R + PLOAD_2 t3, probs, 3 * PMULT + CMOV_code_Model_Pre \prob + add t1_R, probs, 3 * PMULT + cmovae sym2_R, t1_R + PUP \prob, probs, 1 * PMULT + csel \prob, t2, t3, lo +.endm + + +.macro REV_1 prob:req, step:req + NORM_LSR + PLOAD_PREINDEXED t2, sym2_R, (\step * PMULT) + RANGE_IMUL \prob + COD_RANGE_SUB + CMOV_range + PLOAD_2 t3, sym2_R, (\step * PMULT) + sub t0, \prob, kBitModelOffset + CMOV_code + add t1_R, sym2_R, \step * PMULT + cmovae t0, \prob + cmovae sym2_R, t1_R + PUP_BASE_2 \prob, t0 + csel \prob, t2, t3, lo + PSTORE_2 t0, t1_R, 0 - \step * PMULT_2 +.endm + + +.macro REV_2 prob:req, step:req + sub t1_R, sym2_R, probs + NORM_LSR + orr sym, sym, t1, lsr #PSHIFT + RANGE_IMUL \prob + COD_RANGE_SUB + sub t2, sym, \step + CMOV_range + cmovb sym, t2 + CMOV_code_Model_Pre \prob + PUP \prob, sym2_R, 0 +.endm + + +.macro REV_1_VAR prob:req + PLOAD \prob, sym_R + mov probs, sym_R + p2_add sym_R, sym2_R + NORM_LSR + add t2_R, sym_R, sym2_R + RANGE_IMUL \prob + COD_RANGE_SUB + cmovae sym_R, t2_R + CMOV_range + CMOV_code_Model_Pre \prob + p2_add sym2, sym2 + PUP \prob, probs, 0 +.endm + + +.macro add_big dest:req, src:req, param:req + .if (\param) < (1 << 12) + add \dest, \src, \param + .else + #ifndef _LZMA_PROB32 + .error "unexpcted add_big expansion" + #endif + add \dest, \src, (\param) / 2 + add \dest, \dest, (\param) - (\param) / 2 + .endif +.endm + +.macro sub_big dest:req, src:req, param:req + .if (\param) < (1 << 12) + sub \dest, \src, \param + .else + #ifndef _LZMA_PROB32 + .error "unexpcted sub_big expansion" + #endif + sub \dest, \src, (\param) / 2 + sub \dest, \dest, (\param) - (\param) / 2 + .endif +.endm + + +.macro SET_probs offset:req + // add_big probs, probs_Spec, (\offset) * PMULT + add probs, probs_IsMatch, ((\offset) - IsMatch) * PMULT +.endm + + +.macro LIT_PROBS + add sym, sym, processedPos, lsl 8 + inc processedPos + UPDATE_0__0 + shl sym, lc2_lpMask + SET_probs Literal + p2_and sym, lc2_lpMask + // p2_add probs_state, pbPos_R + p2_add probs, sym_R + UPDATE_0__1 + add probs, probs, sym_R, lsl 1 + UPDATE_0__2 probs_state, pbPos_R, 0 +.endm + + + +.equ kNumPosBitsMax , 4 +.equ kNumPosStatesMax , (1 << kNumPosBitsMax) + +.equ kLenNumLowBits , 3 +.equ kLenNumLowSymbols , (1 << kLenNumLowBits) 
+.equ kLenNumHighBits , 8 +.equ kLenNumHighSymbols , (1 << kLenNumHighBits) +.equ kNumLenProbs , (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols) + +.equ LenLow , 0 +.equ LenChoice , LenLow +.equ LenChoice2 , (LenLow + kLenNumLowSymbols) +.equ LenHigh , (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax) + +.equ kNumStates , 12 +.equ kNumStates2 , 16 +.equ kNumLitStates , 7 + +.equ kStartPosModelIndex , 4 +.equ kEndPosModelIndex , 14 +.equ kNumFullDistances , (1 << (kEndPosModelIndex >> 1)) + +.equ kNumPosSlotBits , 6 +.equ kNumLenToPosStates , 4 + +.equ kNumAlignBits , 4 +.equ kAlignTableSize , (1 << kNumAlignBits) + +.equ kMatchMinLen , 2 +.equ kMatchSpecLenStart , (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols) + +// .equ kStartOffset , 1408 +.equ kStartOffset , 0 +.equ SpecPos , (-kStartOffset) +.equ IsRep0Long , (SpecPos + kNumFullDistances) +.equ RepLenCoder , (IsRep0Long + (kNumStates2 << kNumPosBitsMax)) +.equ LenCoder , (RepLenCoder + kNumLenProbs) +.equ IsMatch , (LenCoder + kNumLenProbs) +.equ kAlign , (IsMatch + (kNumStates2 << kNumPosBitsMax)) +.equ IsRep , (kAlign + kAlignTableSize) +.equ IsRepG0 , (IsRep + kNumStates) +.equ IsRepG1 , (IsRepG0 + kNumStates) +.equ IsRepG2 , (IsRepG1 + kNumStates) +.equ PosSlot , (IsRepG2 + kNumStates) +.equ Literal , (PosSlot + (kNumLenToPosStates << kNumPosSlotBits)) +.equ NUM_BASE_PROBS , (Literal + kStartOffset) + +.if kStartOffset != 0 // && IsMatch != 0 + .error "Stop_Compiling_Bad_StartOffset" +.endif + +.if NUM_BASE_PROBS != 1984 + .error "Stop_Compiling_Bad_LZMA_PROBS" +.endif + +.equ offset_lc , 0 +.equ offset_lp , 1 +.equ offset_pb , 2 +.equ offset_dicSize , 4 +.equ offset_probs , 4 + offset_dicSize +.equ offset_probs_1664 , 8 + offset_probs +.equ offset_dic , 8 + offset_probs_1664 +.equ offset_dicBufSize , 8 + offset_dic +.equ offset_dicPos , 8 + offset_dicBufSize +.equ offset_buf , 8 + offset_dicPos +.equ offset_range , 8 + offset_buf +.equ offset_code , 4 + offset_range +.equ offset_processedPos , 4 + offset_code +.equ offset_checkDicSize , 4 + offset_processedPos +.equ offset_rep0 , 4 + offset_checkDicSize +.equ offset_rep1 , 4 + offset_rep0 +.equ offset_rep2 , 4 + offset_rep1 +.equ offset_rep3 , 4 + offset_rep2 +.equ offset_state , 4 + offset_rep3 +.equ offset_remainLen , 4 + offset_state +.equ offset_TOTAL_SIZE , 4 + offset_remainLen + +.if offset_TOTAL_SIZE != 96 + .error "Incorrect offset_TOTAL_SIZE" +.endif + + +.macro IsMatchBranch_Pre + # prob = probs + IsMatch + (state << kNumPosBitsMax) + posState; + and pbPos, pbMask, processedPos, lsl #(kLenNumLowBits + 1 + PSHIFT) + add probs_state, probs_IsMatch, state_R +.endm + + +/* +.macro IsMatchBranch + IsMatchBranch_Pre + IF_BIT_1 probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label +.endm +*/ + +.macro CheckLimits + cmp buf, bufLimit + jae fin_OK + cmp dicPos, limit + jae fin_OK +.endm + +#define CheckLimits_lit CheckLimits +/* +.macro CheckLimits_lit + cmp buf, bufLimit + jae fin_OK_lit + cmp dicPos, limit + jae fin_OK_lit +.endm +*/ + + +#define PARAM_lzma REG_ABI_PARAM_0 +#define PARAM_limit REG_ABI_PARAM_1 +#define PARAM_bufLimit REG_ABI_PARAM_2 + + +.macro LOAD_LZMA_VAR reg:req, struct_offs:req + ldr \reg, [PARAM_lzma, \struct_offs] +.endm + +.macro LOAD_LZMA_BYTE reg:req, struct_offs:req + ldrb \reg, [PARAM_lzma, \struct_offs] +.endm + +.macro LOAD_LZMA_PAIR reg0:req, reg1:req, struct_offs:req + ldp \reg0, \reg1, [PARAM_lzma, \struct_offs] +.endm + + +LzmaDec_DecodeReal_3: +_LzmaDec_DecodeReal_3: +/* +.LFB0: + .cfi_startproc +*/ + + stp 
x19, x20, [sp, -128]! + stp x21, x22, [sp, 16] + stp x23, x24, [sp, 32] + stp x25, x26, [sp, 48] + stp x27, x28, [sp, 64] + stp x29, x30, [sp, 80] + + str PARAM_lzma, [sp, 120] + + mov bufLimit, PARAM_bufLimit + mov limit, PARAM_limit + + LOAD_LZMA_PAIR dic, dicBufSize, offset_dic + LOAD_LZMA_PAIR dicPos, buf, offset_dicPos + LOAD_LZMA_PAIR rep0, rep1, offset_rep0 + LOAD_LZMA_PAIR rep2, rep3, offset_rep2 + + mov t0, 1 << (kLenNumLowBits + 1 + PSHIFT) + LOAD_LZMA_BYTE pbMask, offset_pb + p2_add limit, dic + mov len, wzr // we can set it in all requiread branches instead + lsl pbMask, t0, pbMask + p2_add dicPos, dic + p2_sub pbMask, t0 + + LOAD_LZMA_BYTE lc2_lpMask, offset_lc + mov t0, 256 << PSHIFT + LOAD_LZMA_BYTE t1, offset_lp + p2_add t1, lc2_lpMask + p2_sub lc2_lpMask, (256 << PSHIFT) - PSHIFT + shl t0, t1 + p2_add lc2_lpMask, t0 + + LOAD_LZMA_VAR probs_Spec, offset_probs + LOAD_LZMA_VAR checkDicSize, offset_checkDicSize + LOAD_LZMA_VAR processedPos, offset_processedPos + LOAD_LZMA_VAR state, offset_state + // range is r0 : this load must be last don't move + LOAD_LZMA_PAIR range, cod, offset_range + mov sym, wzr + shl state, PSHIFT + + add_big probs_IsMatch, probs_Spec, ((IsMatch - SpecPos) << PSHIFT) + + // if (processedPos != 0 || checkDicSize != 0) + orr t0, checkDicSize, processedPos + cbz t0, 1f + add t0_R, dicBufSize, dic + cmp dicPos, dic + cmovne t0_R, dicPos + ldrb sym, [t0_R, -1] +1: + IsMatchBranch_Pre + cmp state, 4 * PMULT + jb lit_end + cmp state, kNumLitStates * PMULT + jb lit_matched_end + jmp lz_end + + + +#define BIT_0 BIT_0_R prob_reg +#define BIT_1 BIT_1_R prob_reg +#define BIT_2 BIT_2_R prob_reg + +# ---------- LITERAL ---------- +MY_ALIGN_64 +lit_start: + mov state, wzr +lit_start_2: + LIT_PROBS + + #ifdef _LZMA_SIZE_OPT + + PLOAD_2 prob_reg, probs, 1 * PMULT + mov sym, 1 + BIT_01 +MY_ALIGN_FOR_LOOP +lit_loop: + BIT_1 + tbz sym, 7, lit_loop + + #else + + BIT_0 + BIT_1 + BIT_1 + BIT_1 + BIT_1 + BIT_1 + BIT_1 + + #endif + + BIT_2 + IsMatchBranch_Pre + strb sym, [dicPos], 1 + p2_and sym, 255 + + CheckLimits_lit +lit_end: + IF_BIT_0_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), lit_start + + # jmp IsMatch_label + + +#define FLAG_STATE_BITS (4 + PSHIFT) + +# ---------- MATCHES ---------- +# MY_ALIGN_FOR_ENTRY +IsMatch_label: + UPDATE_1 probs_state, pbPos_R, (IsMatch - IsMatch) + IF_BIT_1 probs_state, 0, (IsRep - IsMatch), IsRep_label + + SET_probs LenCoder + or state, (1 << FLAG_STATE_BITS) + +# ---------- LEN DECODE ---------- +len_decode: + mov len, 8 - kMatchMinLen + IF_BIT_0_NOUP_1 probs, len_mid_0 + UPDATE_1 probs, 0, 0 + p2_add probs, (1 << (kLenNumLowBits + PSHIFT)) + mov len, 0 - kMatchMinLen + IF_BIT_0_NOUP_1 probs, len_mid_0 + UPDATE_1 probs, 0, 0 + p2_add probs, LenHigh * PMULT - (1 << (kLenNumLowBits + PSHIFT)) + + #if 0 == 1 + BIT_0 + BIT_1 + BIT_1 + BIT_1 + BIT_1 + BIT_1 + #else + PLOAD_2 prob_reg, probs, 1 * PMULT + mov sym, 1 + BIT_01 +MY_ALIGN_FOR_LOOP +len8_loop: + BIT_1 + tbz sym, 6, len8_loop + #endif + + mov len, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - kMatchMinLen + jmp len_mid_2 + +MY_ALIGN_FOR_ENTRY +len_mid_0: + UPDATE_0 probs, 0, 0 + p2_add probs, pbPos_R + BIT_0 +len_mid_2: + BIT_1 + BIT_2 + sub len, sym, len + tbz state, FLAG_STATE_BITS, copy_match + +# ---------- DECODE DISTANCE ---------- + // probs + PosSlot + ((len < kNumLenToPosStates ? 
len : kNumLenToPosStates - 1) << kNumPosSlotBits); + + mov t0, 3 + kMatchMinLen + cmp len, 3 + kMatchMinLen + cmovb t0, len + SET_probs PosSlot - (kMatchMinLen << (kNumPosSlotBits)) + add probs, probs, t0_R, lsl #(kNumPosSlotBits + PSHIFT) + + #ifdef _LZMA_SIZE_OPT + + PLOAD_2 prob_reg, probs, 1 * PMULT + mov sym, 1 + BIT_01 +MY_ALIGN_FOR_LOOP +slot_loop: + BIT_1 + tbz sym, 5, slot_loop + + #else + + BIT_0 + BIT_1 + BIT_1 + BIT_1 + BIT_1 + + #endif + + #define numBits t4 + mov numBits, sym + BIT_2 + // we need only low bits + p2_and sym, 3 + cmp numBits, 32 + kEndPosModelIndex / 2 + jb short_dist + + SET_probs kAlign + + # unsigned numDirectBits = (unsigned)(((distance >> 1) - 1)); + p2_sub numBits, (32 + 1 + kNumAlignBits) + # distance = (2 | (distance & 1)); + or sym, 2 + PLOAD_2 prob_reg, probs, 1 * PMULT + add sym2_R, probs, 2 * PMULT + +# ---------- DIRECT DISTANCE ---------- + +.macro DIRECT_1 + shr range, 1 + subs t0, cod, range + p2_add sym, sym + // add t1, sym, 1 + csel cod, cod, t0, mi + csinc sym, sym, sym, mi + // csel sym, t1, sym, pl + // adc sym, sym, sym // not 100% compatible for "corruptued-allowed" LZMA streams + dec_s numBits + je direct_end +.endm + + #ifdef _LZMA_SIZE_OPT + + jmp direct_norm +MY_ALIGN_FOR_ENTRY +direct_loop: + DIRECT_1 +direct_norm: + TEST_HIGH_BYTE_range + jnz direct_loop + NORM_2 + jmp direct_loop + + #else + +.macro DIRECT_2 + TEST_HIGH_BYTE_range + jz direct_unroll + DIRECT_1 +.endm + + DIRECT_2 + DIRECT_2 + DIRECT_2 + DIRECT_2 + DIRECT_2 + DIRECT_2 + DIRECT_2 + DIRECT_2 + +direct_unroll: + NORM_2 + DIRECT_1 + DIRECT_1 + DIRECT_1 + DIRECT_1 + DIRECT_1 + DIRECT_1 + DIRECT_1 + DIRECT_1 + jmp direct_unroll + + #endif + +MY_ALIGN_FOR_ENTRY +direct_end: + shl sym, kNumAlignBits + REV_0 prob_reg + REV_1 prob_reg, 2 + REV_1 prob_reg, 4 + REV_2 prob_reg, 8 + +decode_dist_end: + + // if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize)) + + tst checkDicSize, checkDicSize + csel t0, processedPos, checkDicSize, eq + cmp sym, t0 + jae end_of_payload + // jmp end_of_payload # for debug + + mov rep3, rep2 + mov rep2, rep1 + mov rep1, rep0 + add rep0, sym, 1 + +.macro STATE_UPDATE_FOR_MATCH + // state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3; + // cmp state, (kNumStates + kNumLitStates) * PMULT + cmp state, kNumLitStates * PMULT + (1 << FLAG_STATE_BITS) + mov state, kNumLitStates * PMULT + mov t0, (kNumLitStates + 3) * PMULT + cmovae state, t0 +.endm + STATE_UPDATE_FOR_MATCH + +# ---------- COPY MATCH ---------- +copy_match: + + // if ((rem = limit - dicPos) == 0) break // return SZ_ERROR_DATA; + subs cnt_R, limit, dicPos + // jz fin_dicPos_LIMIT + jz fin_OK + + // curLen = ((rem < len) ? (unsigned)rem : len); + cmp cnt_R, len_R + cmovae cnt, len + + sub t0_R, dicPos, dic + p2_add dicPos, cnt_R + p2_add processedPos, cnt + p2_sub len, cnt + + // pos = dicPos - rep0 + (dicPos < rep0 ? 
dicBufSize : 0); + p2_sub_s t0_R, rep0_R + jae 1f + + cmn t0_R, cnt_R + p2_add t0_R, dicBufSize + ja copy_match_cross +1: +# ---------- COPY MATCH FAST ---------- + # t0_R : src_pos + p2_add t0_R, dic + ldrb sym, [t0_R] + p2_add t0_R, cnt_R + p1_neg cnt_R + +copy_common: + dec dicPos + + # dicPos : (ptr_to_last_dest_BYTE) + # t0_R : (src_lim) + # cnt_R : (-curLen) + + IsMatchBranch_Pre + + inc_s cnt_R + jz copy_end + + cmp rep0, 1 + je copy_match_0 + + #ifdef LZMA_USE_2BYTES_COPY + strb sym, [dicPos, cnt_R] + dec dicPos + # dicPos : (ptr_to_last_dest_16bitWORD) + p2_and cnt_R, -2 + ldrh sym, [t0_R, cnt_R] + adds cnt_R, cnt_R, 2 + jz 2f +MY_ALIGN_FOR_LOOP +1: + /* + strh sym, [dicPos, cnt_R] + ldrh sym, [t0_R, cnt_R] + adds cnt_R, cnt_R, 2 + jz 2f + */ + + strh sym, [dicPos, cnt_R] + ldrh sym, [t0_R, cnt_R] + adds cnt_R, cnt_R, 2 + jnz 1b +2: + + /* + // for universal little/big endian code, but slow + strh sym, [dicPos] + inc dicPos + ldrb sym, [t0_R, -1] + */ + + #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + // we must improve big-endian detection for another compilers + // for big-endian we need to revert bytes + rev16 sym, sym + #endif + + // (sym) must represent as little-endian here: + strb sym, [dicPos], 1 + shr sym, 8 + + #else + +MY_ALIGN_FOR_LOOP +1: + strb sym, [dicPos, cnt_R] + ldrb sym, [t0_R, cnt_R] + inc_s cnt_R + jz copy_end + + strb sym, [dicPos, cnt_R] + ldrb sym, [t0_R, cnt_R] + inc_s cnt_R + jnz 1b + #endif + +copy_end: +lz_end_match: + strb sym, [dicPos], 1 + + # IsMatchBranch_Pre + CheckLimits +lz_end: + IF_BIT_1_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label + + + +# ---------- LITERAL MATCHED ---------- + + LIT_PROBS + + // matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; + + sub t0_R, dicPos, dic + p2_sub_s t0_R, rep0_R + + #ifdef LZMA_USE_CMOV_LZ_WRAP + add t1_R, t0_R, dicBufSize + cmovb t0_R, t1_R + #else + jae 1f + p2_add t0_R, dicBufSize +1: + #endif + + ldrb match, [dic, t0_R] + + // state -= (state < 10) ? 3 : 6; + sub sym, state, 6 * PMULT + cmp state, 10 * PMULT + p2_sub state, 3 * PMULT + cmovae state, sym + + #ifdef _LZMA_SIZE_OPT + + mov offs, 256 * PMULT + shl match, (PSHIFT + 1) + mov sym, 1 + and bit, match, offs + add prm, probs, offs_R + +MY_ALIGN_FOR_LOOP +litm_loop: + LITM + tbz sym, 8, litm_loop + + #else + + LITM_0 + LITM + LITM + LITM + LITM + LITM + LITM + LITM_2 + + #endif + + IsMatchBranch_Pre + strb sym, [dicPos], 1 + p2_and sym, 255 + + // mov len, wzr // LITM uses same regisetr (len / offs). So we clear it + CheckLimits_lit +lit_matched_end: + IF_BIT_1_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label + # IsMatchBranch + p2_sub state, 3 * PMULT + jmp lit_start_2 + + + +# ---------- REP 0 LITERAL ---------- +MY_ALIGN_FOR_ENTRY +IsRep0Short_label: + UPDATE_0 probs_state, pbPos_R, 0 + + // dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; + sub t0_R, dicPos, dic + + // state = state < kNumLitStates ? 
9 : 11; + or state, 1 * PMULT + + # the caller doesn't allow (dicPos >= limit) case for REP_SHORT + # so we don't need the following (dicPos == limit) check here: + # cmp dicPos, limit + # jae fin_dicPos_LIMIT_REP_SHORT + # // jmp fin_dicPos_LIMIT_REP_SHORT // for testing/debug puposes + + inc processedPos + + IsMatchBranch_Pre + + p2_sub_s t0_R, rep0_R + #ifdef LZMA_USE_CMOV_LZ_WRAP + add sym_R, t0_R, dicBufSize + cmovb t0_R, sym_R + #else + jae 1f + p2_add t0_R, dicBufSize +1: + #endif + + ldrb sym, [dic, t0_R] + // mov len, wzr + jmp lz_end_match + +MY_ALIGN_FOR_ENTRY +IsRep_label: + UPDATE_1 probs_state, 0, (IsRep - IsMatch) + + # The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode. + # So we don't check it here. + + # mov t0, processedPos + # or t0, checkDicSize + # jz fin_ERROR_2 + + // state = state < kNumLitStates ? 8 : 11; + cmp state, kNumLitStates * PMULT + mov state, 8 * PMULT + mov probBranch, 11 * PMULT + cmovae state, probBranch + + SET_probs RepLenCoder + + IF_BIT_1 probs_state, 0, (IsRepG0 - IsMatch), IsRepG0_label + sub_big probs_state, probs_state, (IsMatch - IsRep0Long) << PSHIFT + IF_BIT_0_NOUP probs_state, pbPos_R, 0, IsRep0Short_label + UPDATE_1 probs_state, pbPos_R, 0 + jmp len_decode + +MY_ALIGN_FOR_ENTRY +IsRepG0_label: + UPDATE_1 probs_state, 0, (IsRepG0 - IsMatch) + IF_BIT_1 probs_state, 0, (IsRepG1 - IsMatch), IsRepG1_label + mov dist, rep1 + mov rep1, rep0 + mov rep0, dist + jmp len_decode + +# MY_ALIGN_FOR_ENTRY +IsRepG1_label: + UPDATE_1 probs_state, 0, (IsRepG1 - IsMatch) + IF_BIT_1 probs_state, 0, (IsRepG2 - IsMatch), IsRepG2_label + mov dist, rep2 + mov rep2, rep1 + mov rep1, rep0 + mov rep0, dist + jmp len_decode + +# MY_ALIGN_FOR_ENTRY +IsRepG2_label: + UPDATE_1 probs_state, 0, (IsRepG2 - IsMatch) + mov dist, rep3 + mov rep3, rep2 + mov rep2, rep1 + mov rep1, rep0 + mov rep0, dist + jmp len_decode + + + +# ---------- SPEC SHORT DISTANCE ---------- + +MY_ALIGN_FOR_ENTRY +short_dist: + p2_sub_s numBits, 32 + 1 + jbe decode_dist_end + or sym, 2 + shl sym, numBits + add sym_R, probs_Spec, sym_R, lsl #PSHIFT + p2_add sym_R, SpecPos * PMULT + 1 * PMULT + mov sym2, PMULT // # step +MY_ALIGN_FOR_LOOP +spec_loop: + REV_1_VAR prob_reg + dec_s numBits + jnz spec_loop + + p2_add sym2_R, probs_Spec + .if SpecPos != 0 + p2_add sym2_R, SpecPos * PMULT + .endif + p2_sub sym_R, sym2_R + shr sym, PSHIFT + + jmp decode_dist_end + + + +# ---------- COPY MATCH 0 ---------- +MY_ALIGN_FOR_ENTRY +copy_match_0: + #ifdef LZMA_USE_4BYTES_FILL + strb sym, [dicPos, cnt_R] + inc_s cnt_R + jz copy_end + + strb sym, [dicPos, cnt_R] + inc_s cnt_R + jz copy_end + + strb sym, [dicPos, cnt_R] + inc_s cnt_R + jz copy_end + + orr t3, sym, sym, lsl 8 + p2_and cnt_R, -4 + orr t3, t3, t3, lsl 16 +MY_ALIGN_FOR_LOOP_16 +1: + /* + str t3, [dicPos, cnt_R] + adds cnt_R, cnt_R, 4 + jz 2f + */ + + str t3, [dicPos, cnt_R] + adds cnt_R, cnt_R, 4 + jnz 1b +2: + // p2_and sym, 255 + #else + +MY_ALIGN_FOR_LOOP +1: + strb sym, [dicPos, cnt_R] + inc_s cnt_R + jz copy_end + + strb sym, [dicPos, cnt_R] + inc_s cnt_R + jnz 1b + #endif + + jmp copy_end + + +# ---------- COPY MATCH CROSS ---------- +copy_match_cross: + # t0_R - src pos + # cnt_R - total copy len + + p1_neg cnt_R +1: + ldrb sym, [dic, t0_R] + inc t0_R + strb sym, [dicPos, cnt_R] + inc cnt_R + cmp t0_R, dicBufSize + jne 1b + + ldrb sym, [dic] + sub t0_R, dic, cnt_R + jmp copy_common + + + + +/* +fin_dicPos_LIMIT_REP_SHORT: + mov len, 1 + jmp fin_OK +*/ + +/* +fin_dicPos_LIMIT: + jmp fin_OK + # For 
more strict mode we can stop decoding with error + # mov sym, 1 + # jmp fin +*/ + +fin_ERROR_MATCH_DIST: + # rep0 = distance + 1; + p2_add len, kMatchSpecLen_Error_Data + mov rep3, rep2 + mov rep2, rep1 + mov rep1, rep0 + mov rep0, sym + STATE_UPDATE_FOR_MATCH + # jmp fin_OK + mov sym, 1 + jmp fin + +end_of_payload: + inc_s sym + jnz fin_ERROR_MATCH_DIST + + mov len, kMatchSpecLenStart + xor state, (1 << FLAG_STATE_BITS) + jmp fin_OK + +/* +fin_OK_lit: + mov len, wzr +*/ + +fin_OK: + mov sym, wzr + +fin: + NORM + + #define fin_lzma_reg t0_R + + .macro STORE_LZMA_VAR reg:req, struct_offs:req + str \reg, [fin_lzma_reg, \struct_offs] + .endm + + .macro STORE_LZMA_PAIR reg0:req, reg1:req, struct_offs:req + stp \reg0, \reg1, [fin_lzma_reg, \struct_offs] + .endm + + ldr fin_lzma_reg, [sp, 120] + p2_sub dicPos, dic + shr state, PSHIFT + + STORE_LZMA_PAIR dicPos, buf, offset_dicPos + STORE_LZMA_PAIR range, cod, offset_range + STORE_LZMA_VAR processedPos, offset_processedPos + STORE_LZMA_PAIR rep0, rep1, offset_rep0 + STORE_LZMA_PAIR rep2, rep3, offset_rep2 + STORE_LZMA_PAIR state, len, offset_state + + mov w0, sym + + ldp x29, x30, [sp, 80] + ldp x27, x28, [sp, 64] + ldp x25, x26, [sp, 48] + ldp x23, x24, [sp, 32] + ldp x21, x22, [sp, 16] + ldp x19, x20, [sp], 128 + + ret +/* + .cfi_endproc +.LFE0: + .size LzmaDec_DecodeReal_3, .-LzmaDec_DecodeReal_3 + .ident "TAG_LZMA" + .section .note.GNU-stack,"",@progbits +*/
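
Editor's note — for readers following the macro layer above (NORM / CMP_COD_BASE / IF_BIT_* / UPDATE_0 / UPDATE_1 and the CMOV variants), here is a minimal C sketch of the single adaptive-bit decode step those macros implement. It mirrors the reference logic in LzmaDec.c, but the names RangeDec, Normalize and DecodeBit below are illustrative, not the actual LzmaDec.c identifiers; 16-bit probability counters are assumed (no _LZMA_PROB32), and input bounds checking is omitted.

    #include <stdint.h>

    #define kNumBitModelTotalBits 11
    #define kBitModelTotal (1u << kNumBitModelTotalBits)
    #define kNumMoveBits 5

    typedef struct {
        uint32_t range;
        uint32_t code;
        const uint8_t *buf;      /* compressed input stream */
    } RangeDec;

    /* NORM: if the top byte of range is zero, shift in one more input byte. */
    static void Normalize(RangeDec *rc)
    {
        if ((rc->range & 0xFF000000u) == 0) {
            rc->range <<= 8;
            rc->code = (rc->code << 8) | *rc->buf++;
        }
    }

    /* One binary decision against the adaptive probability *prob (11-bit model). */
    static int DecodeBit(RangeDec *rc, uint16_t *prob)
    {
        uint32_t bound;
        Normalize(rc);
        bound = (rc->range >> kNumBitModelTotalBits) * *prob;   /* RANGE_IMUL */
        if (rc->code < bound) {                                  /* IF_BIT_0 path */
            rc->range = bound;
            *prob = (uint16_t)(*prob + ((kBitModelTotal - *prob) >> kNumMoveBits)); /* UPDATE_0 */
            return 0;
        }
        rc->range -= bound;                                      /* IF_BIT_1 path */
        rc->code -= bound;
        *prob = (uint16_t)(*prob - (*prob >> kNumMoveBits));     /* UPDATE_1 */
        return 1;
    }

The assembly expresses the UPDATE_0 step as prob -= (prob - kBitModelOffset) >> kNumMoveBits with an arithmetic shift, which is arithmetically identical to the plain C form above; the CMOV macro family then replaces the branch on code < bound with conditional selects (cmovb/cmovae/csel) so both outcomes stay on one straight-line path in the hot bit-tree loops.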
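
As a cross-check of the probs-array layout constants defined above (with kStartOffset = 0), the running total of offsets reproduces the value enforced by the `.error` guard, NUM_BASE_PROBS == 1984:

    SpecPos        = 0
    IsRep0Long     = SpecPos    + kNumFullDistances                  =    0 + 128 =  128
    RepLenCoder    = IsRep0Long + (kNumStates2 << kNumPosBitsMax)    =  128 + 256 =  384
    LenCoder       = RepLenCoder + kNumLenProbs                      =  384 + 512 =  896
    IsMatch        = LenCoder   + kNumLenProbs                       =  896 + 512 = 1408
    kAlign         = IsMatch    + (kNumStates2 << kNumPosBitsMax)    = 1408 + 256 = 1664
    IsRep          = kAlign     + kAlignTableSize                    = 1664 +  16 = 1680
    IsRepG0        = IsRep      + kNumStates                         = 1680 +  12 = 1692
    IsRepG1        = IsRepG0    + kNumStates                         = 1692 +  12 = 1704
    IsRepG2        = IsRepG1    + kNumStates                         = 1704 +  12 = 1716
    PosSlot        = IsRepG2    + kNumStates                         = 1716 +  12 = 1728
    Literal        = PosSlot    + (kNumLenToPosStates << kNumPosSlotBits) = 1728 + 256 = 1984
    NUM_BASE_PROBS = Literal    + kStartOffset                       = 1984

where kNumLenProbs = 2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols = 2*8*16 + 256 = 512 and kNumFullDistances = 1 << (kEndPosModelIndex >> 1) = 128. This is also why the prologue forms probs_IsMatch as probs_Spec + ((IsMatch - SpecPos) << PSHIFT), i.e. 1408 probability slots past the SpecPos base.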