--- /dev/null
+// LzmaDecOpt.S -- ARM64-ASM version of LzmaDec_DecodeReal_3() function
+// 2021-04-25 : Igor Pavlov : Public domain
+
+/*
+; 3 is the code-compatibility version of the LzmaDec_DecodeReal_*()
+; function, checked at link time.
+; This code is tightly coupled with LzmaDec_TryDummy()
+; and with other functions in the LzmaDec.c file.
+; The CLzmaDec structure, the (probs) array layout, and the input/output
+; of LzmaDec_DecodeReal_*() must be identical in both versions (C / ASM).
+*/
+
+
+#include "7zAsm.S"
+
+ // .arch armv8-a
+ // .file "LzmaDecOpt.c"
+ .text
+ .align 2
+ .p2align 4,,15
+#ifdef __APPLE__
+ .globl _LzmaDec_DecodeReal_3
+#else
+ .global LzmaDec_DecodeReal_3
+#endif
+ // .type LzmaDec_DecodeReal_3, %function
+
+// #define _LZMA_SIZE_OPT 1
+
+#define LZMA_USE_4BYTES_FILL 1
+// #define LZMA_USE_2BYTES_COPY 1
+// #define LZMA_USE_CMOV_LZ_WRAP 1
+// #define _LZMA_PROB32 1
+
+#define MY_ALIGN_FOR_ENTRY MY_ALIGN_32
+#define MY_ALIGN_FOR_LOOP MY_ALIGN_32
+#define MY_ALIGN_FOR_LOOP_16 MY_ALIGN_16
+
+#ifdef _LZMA_PROB32
+ .equ PSHIFT , 2
+ .macro PLOAD dest:req, mem:req
+ ldr \dest, [\mem]
+ .endm
+ .macro PLOAD_PREINDEXED dest:req, mem:req, offset:req
+ ldr \dest, [\mem, \offset]!
+ .endm
+ .macro PLOAD_2 dest:req, mem1:req, mem2:req
+ ldr \dest, [\mem1, \mem2]
+ .endm
+ .macro PLOAD_LSL dest:req, mem1:req, mem2:req
+ ldr \dest, [\mem1, \mem2, lsl #PSHIFT]
+ .endm
+ .macro PSTORE src:req, mem:req
+ str \src, [\mem]
+ .endm
+ .macro PSTORE_2 src:req, mem1:req, mem2:req
+ str \src, [\mem1, \mem2]
+ .endm
+ .macro PSTORE_LSL src:req, mem1:req, mem2:req
+ str \src, [\mem1, \mem2, lsl #PSHIFT]
+ .endm
+ .macro PSTORE_LSL_M1 src:req, mem1:req, mem2:req, temp_reg:req
+        // the caller must ensure that temp_reg is a free register when this macro is used
+ add \temp_reg, \mem1, \mem2
+ str \src, [\temp_reg, \mem2]
+ .endm
+#else
+ // .equ PSHIFT , 1
+ #define PSHIFT 1
+ .macro PLOAD dest:req, mem:req
+ ldrh \dest, [\mem]
+ .endm
+ .macro PLOAD_PREINDEXED dest:req, mem:req, offset:req
+ ldrh \dest, [\mem, \offset]!
+ .endm
+ .macro PLOAD_2 dest:req, mem1:req, mem2:req
+ ldrh \dest, [\mem1, \mem2]
+ .endm
+ .macro PLOAD_LSL dest:req, mem1:req, mem2:req
+ ldrh \dest, [\mem1, \mem2, lsl #PSHIFT]
+ .endm
+ .macro PSTORE src:req, mem:req
+ strh \src, [\mem]
+ .endm
+ .macro PSTORE_2 src:req, mem1:req, mem2:req
+ strh \src, [\mem1, \mem2]
+ .endm
+ .macro PSTORE_LSL src:req, mem1:req, mem2:req
+ strh \src, [\mem1, \mem2, lsl #PSHIFT]
+ .endm
+ .macro PSTORE_LSL_M1 src:req, mem1:req, mem2:req, temp_reg:req
+ strh \src, [\mem1, \mem2]
+ .endm
+#endif
+
+.equ PMULT , (1 << PSHIFT)
+.equ PMULT_2 , (2 << PSHIFT)
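+
+// PSHIFT is log2(sizeof(CLzmaProb)): probs are 16-bit by default, 32-bit
+// with _LZMA_PROB32, so PMULT converts a prob index into a byte offset.
+// For example, PLOAD_LSL reads the prob at (CLzmaProb *)probs + index via
+// the scaled load  ldr(h) dest, [probs, index, lsl #PSHIFT].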
+
+.equ kMatchSpecLen_Error_Data , (1 << 9)
+
+# x7 t0 : NORM_CALC : prob2 (IF_BIT_1)
+# x6 t1 : NORM_CALC : probs_state
+# x8 t2 : (LITM) temp : (TREE) temp
+# x4 t3 : (LITM) bit : (TREE) temp : UPDATE_0/UPDATE_1 temp
+# x10 t4 : (LITM) offs : (TREE) probs_PMULT : numBits
+# x9 t5 : (LITM) match : sym2 (ShortDist)
+# x1 t6 : (LITM) litm_prob : (TREE) prob_reg : pbPos
+# x2 t7 : (LITM) prm : probBranch : cnt
+# x3 sym : dist
+# x12 len
+# x0 range
+# x5 cod
+
+
+#define range w0
+
+// t6
+#define pbPos w1
+#define pbPos_R r1
+#define prob_reg w1
+#define litm_prob prob_reg
+
+// t7
+#define probBranch w2
+#define cnt w2
+#define cnt_R r2
+#define prm r2
+
+#define sym w3
+#define sym_R r3
+#define dist sym
+
+#define t3 w4
+#define bit w4
+#define bit_R r4
+#define update_temp_reg r4
+
+#define cod w5
+
+#define t1 w6
+#define t1_R r6
+#define probs_state t1_R
+
+#define t0 w7
+#define t0_R r7
+#define prob2 t0
+
+#define t2 w8
+#define t2_R r8
+
+// t5
+#define match w9
+#define sym2 w9
+#define sym2_R r9
+
+#define t4 w10
+#define t4_R r10
+
+#define offs w10
+#define offs_R r10
+
+#define probs r11
+
+#define len w12
+#define len_R x12
+
+#define state w13
+#define state_R r13
+
+#define dicPos r14
+#define buf r15
+#define bufLimit r16
+#define dicBufSize r17
+
+#define limit r19
+#define rep0 w20
+#define rep0_R r20
+#define rep1 w21
+#define rep2 w22
+#define rep3 w23
+#define dic r24
+#define probs_IsMatch r25
+#define probs_Spec r26
+#define checkDicSize w27
+#define processedPos w28
+#define pbMask w29
+#define lc2_lpMask w30
+
+
+.equ kNumBitModelTotalBits , 11
+.equ kBitModelTotal , (1 << kNumBitModelTotalBits)
+.equ kNumMoveBits , 5
+.equ kBitModelOffset , (kBitModelTotal - (1 << kNumMoveBits) + 1)
+
+.macro NORM_2 macro
+ ldrb t0, [buf], 1
+ shl range, 8
+ orr cod, t0, cod, lsl 8
+ /*
+ mov t0, cod
+ ldrb cod, [buf], 1
+ shl range, 8
+ bfi cod, t0, #8, #24
+ */
+.endm
+
+.macro TEST_HIGH_BYTE_range macro
+ tst range, 0xFF000000
+.endm
+
+.macro NORM macro
+ TEST_HIGH_BYTE_range
+ jnz 1f
+ NORM_2
+1:
+.endm
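+
+// NORM is the range-coder normalization step. C sketch of the equivalent
+// NORMALIZE logic from LzmaDec.c (kTopValue == (1 << 24)):
+/*
+    if (range < kTopValue)   // same test as ((range & 0xFF000000) == 0)
+    {
+        range <<= 8;
+        code = (code << 8) | (*buf++);
+    }
+*/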
+
+
+# ---------- Branch MACROS ----------
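+
+# One LZMA bit decode, as these macros implement it. C sketch
+# (names as in LzmaDec.c):
+/*
+    ttt = *prob;
+    NORMALIZE;
+    bound = (range >> kNumBitModelTotalBits) * ttt;
+    if (code < bound)
+    {
+        // bit 0
+        range = bound;
+        *prob = (CLzmaProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
+    }
+    else
+    {
+        // bit 1
+        code -= bound;
+        range -= bound;
+        *prob = (CLzmaProb)(ttt - (ttt >> kNumMoveBits));
+    }
+*/
+# UPDATE_0 computes the bit-0 update as ttt - ((ttt - kBitModelOffset) asr kNumMoveBits);
+# since kBitModelOffset == kBitModelTotal - (1 << kNumMoveBits) + 1, the arithmetic
+# shift of the negative difference yields exactly ttt + ((kBitModelTotal - ttt) >> kNumMoveBits).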
+
+.macro UPDATE_0__0
+ sub prob2, probBranch, kBitModelOffset
+.endm
+
+.macro UPDATE_0__1
+ sub probBranch, probBranch, prob2, asr #(kNumMoveBits)
+.endm
+
+.macro UPDATE_0__2 probsArray:req, probOffset:req, probDisp:req
+ .if \probDisp == 0
+ PSTORE_2 probBranch, \probsArray, \probOffset
+ .elseif \probOffset == 0
+ PSTORE_2 probBranch, \probsArray, \probDisp * PMULT
+ .else
+ .error "unsupported"
+ // add update_temp_reg, \probsArray, \probOffset
+ PSTORE_2 probBranch, update_temp_reg, \probDisp * PMULT
+ .endif
+.endm
+
+.macro UPDATE_0 probsArray:req, probOffset:req, probDisp:req
+ UPDATE_0__0
+ UPDATE_0__1
+ UPDATE_0__2 \probsArray, \probOffset, \probDisp
+.endm
+
+
+.macro UPDATE_1 probsArray:req, probOffset:req, probDisp:req
+ // sub cod, cod, prob2
+ // sub range, range, prob2
+ p2_sub cod, range
+ sub range, prob2, range
+ sub prob2, probBranch, probBranch, lsr #(kNumMoveBits)
+ .if \probDisp == 0
+ PSTORE_2 prob2, \probsArray, \probOffset
+ .elseif \probOffset == 0
+ PSTORE_2 prob2, \probsArray, \probDisp * PMULT
+ .else
+ .error "unsupported"
+ // add update_temp_reg, \probsArray, \probOffset
+ PSTORE_2 prob2, update_temp_reg, \probDisp * PMULT
+ .endif
+.endm
+
+
+.macro CMP_COD_BASE
+ NORM
+ // lsr prob2, range, kNumBitModelTotalBits
+ // imul prob2, probBranch
+ // cmp cod, prob2
+ mov prob2, range
+ shr range, kNumBitModelTotalBits
+ imul range, probBranch
+ cmp cod, range
+.endm
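+
+// After CMP_COD_BASE: (range) holds bound == (old_range >> kNumBitModelTotalBits) * prob,
+// (prob2) holds old_range, and the flags reflect (cod - bound). So on the
+// bit-0 path (range) is already correct, and UPDATE_1 recovers
+// (old_range - bound) from (prob2).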
+
+.macro CMP_COD_1 probsArray:req
+ PLOAD probBranch, \probsArray
+ CMP_COD_BASE
+.endm
+
+.macro CMP_COD_3 probsArray:req, probOffset:req, probDisp:req
+ .if \probDisp == 0
+ PLOAD_2 probBranch, \probsArray, \probOffset
+ .elseif \probOffset == 0
+ PLOAD_2 probBranch, \probsArray, \probDisp * PMULT
+ .else
+ .error "unsupported"
+ add update_temp_reg, \probsArray, \probOffset
+ PLOAD_2 probBranch, update_temp_reg, \probDisp * PMULT
+ .endif
+ CMP_COD_BASE
+.endm
+
+
+.macro IF_BIT_1_NOUP probsArray:req, probOffset:req, probDisp:req, toLabel:req
+ CMP_COD_3 \probsArray, \probOffset, \probDisp
+ jae \toLabel
+.endm
+
+
+.macro IF_BIT_1 probsArray:req, probOffset:req, probDisp:req, toLabel:req
+ IF_BIT_1_NOUP \probsArray, \probOffset, \probDisp, \toLabel
+ UPDATE_0 \probsArray, \probOffset, \probDisp
+.endm
+
+
+.macro IF_BIT_0_NOUP probsArray:req, probOffset:req, probDisp:req, toLabel:req
+ CMP_COD_3 \probsArray, \probOffset, \probDisp
+ jb \toLabel
+.endm
+
+.macro IF_BIT_0_NOUP_1 probsArray:req, toLabel:req
+ CMP_COD_1 \probsArray
+ jb \toLabel
+.endm
+
+
+# ---------- CMOV MACROS ----------
+
+.macro NORM_LSR
+ NORM
+ lsr t0, range, #kNumBitModelTotalBits
+.endm
+
+.macro COD_RANGE_SUB
+ subs t1, cod, t0
+ p2_sub range, t0
+.endm
+
+.macro RANGE_IMUL prob:req
+ imul t0, \prob
+.endm
+
+.macro NORM_CALC prob:req
+ NORM_LSR
+ RANGE_IMUL \prob
+ COD_RANGE_SUB
+.endm
+
+.macro CMOV_range
+ cmovb range, t0
+.endm
+
+.macro CMOV_code
+ cmovae cod, t1
+.endm
+
+.macro CMOV_code_Model_Pre prob:req
+ sub t0, \prob, kBitModelOffset
+ CMOV_code
+ cmovae t0, \prob
+.endm
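+
+// The CMOV macros form a branchless bit decode: COD_RANGE_SUB sets the flags
+// from (cod - bound), and the csel/cmov selects pick the bit-0 or bit-1
+// values of range, cod, and the updated prob without a conditional branch.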
+
+
+.macro PUP_BASE_2 prob:req, dest_reg:req
+        # only an arithmetic shift (asr) works for both 16/32-bit prob modes
+ sub \dest_reg, \prob, \dest_reg, asr #(kNumMoveBits)
+.endm
+
+.macro PUP prob:req, probPtr:req, mem2:req
+ PUP_BASE_2 \prob, t0
+ PSTORE_2 t0, \probPtr, \mem2
+.endm
+
+
+
+#define probs_PMULT t4_R
+
+.macro BIT_01
+ add probs_PMULT, probs, PMULT
+.endm
+
+
+.macro BIT_0_R prob:req
+ PLOAD_2 \prob, probs, 1 * PMULT
+ NORM_LSR
+ sub t3, \prob, kBitModelOffset
+ RANGE_IMUL \prob
+ PLOAD_2 t2, probs, 1 * PMULT_2
+ COD_RANGE_SUB
+ CMOV_range
+ cmovae t3, \prob
+ PLOAD_2 t0, probs, 1 * PMULT_2 + PMULT
+ PUP_BASE_2 \prob, t3
+ csel \prob, t2, t0, lo
+ CMOV_code
+ mov sym, 2
+ PSTORE_2 t3, probs, 1 * PMULT
+ adc sym, sym, wzr
+ BIT_01
+.endm
+
+.macro BIT_1_R prob:req
+ NORM_LSR
+ p2_add sym, sym
+ sub t3, \prob, kBitModelOffset
+ RANGE_IMUL \prob
+ PLOAD_LSL t2, probs, sym_R
+ COD_RANGE_SUB
+ CMOV_range
+ cmovae t3, \prob
+ PLOAD_LSL t0, probs_PMULT, sym_R
+ PUP_BASE_2 \prob, t3
+ csel \prob, t2, t0, lo
+ CMOV_code
+ PSTORE_LSL_M1 t3, probs, sym_R, t2_R
+ adc sym, sym, wzr
+.endm
+
+
+.macro BIT_2_R prob:req
+ NORM_LSR
+ p2_add sym, sym
+ sub t3, \prob, kBitModelOffset
+ RANGE_IMUL \prob
+ COD_RANGE_SUB
+ CMOV_range
+ cmovae t3, \prob
+ CMOV_code
+ PUP_BASE_2 \prob, t3
+ PSTORE_LSL_M1 t3, probs, sym_R, t2_R
+ adc sym, sym, wzr
+.endm
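+
+# BIT_0_R / BIT_1_R / BIT_2_R each decode one level of a bit tree, with the
+# loads of both candidate next probs hoisted above the compare. C sketch of
+# one level (update0/update1 stand for the prob updates sketched earlier):
+/*
+    bound = (range >> kNumBitModelTotalBits) * probs[sym];
+    if (code < bound) { range = bound; update0(&probs[sym]); sym = 2 * sym; }
+    else { code -= bound; range -= bound; update1(&probs[sym]); sym = 2 * sym + 1; }
+*/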
+
+
+# ---------- MATCHED LITERAL ----------
+
+.macro LITM_0 macro
+ shl match, (PSHIFT + 1)
+ and bit, match, 256 * PMULT
+ add prm, probs, 256 * PMULT + 1 * PMULT
+ p2_add match, match
+ p2_add prm, bit_R
+ eor offs, bit, 256 * PMULT
+ PLOAD litm_prob, prm
+
+ NORM_LSR
+ sub t2, litm_prob, kBitModelOffset
+ RANGE_IMUL litm_prob
+ COD_RANGE_SUB
+ cmovae offs, bit
+ CMOV_range
+ and bit, match, offs
+ cmovae t2, litm_prob
+ CMOV_code
+ mov sym, 2
+ PUP_BASE_2 litm_prob, t2
+ PSTORE t2, prm
+ add prm, probs, offs_R
+ adc sym, sym, wzr
+.endm
+
+.macro LITM macro
+ p2_add prm, bit_R
+ xor offs, bit
+ PLOAD_LSL litm_prob, prm, sym_R
+
+ NORM_LSR
+ p2_add match, match
+ sub t2, litm_prob, kBitModelOffset
+ RANGE_IMUL litm_prob
+ COD_RANGE_SUB
+ cmovae offs, bit
+ CMOV_range
+ and bit, match, offs
+ cmovae t2, litm_prob
+ CMOV_code
+ PUP_BASE_2 litm_prob, t2
+ PSTORE_LSL t2, prm, sym_R
+ add prm, probs, offs_R
+ adc sym, sym, sym
+.endm
+
+
+.macro LITM_2 macro
+ p2_add prm, bit_R
+ PLOAD_LSL litm_prob, prm, sym_R
+
+ NORM_LSR
+ sub t2, litm_prob, kBitModelOffset
+ RANGE_IMUL litm_prob
+ COD_RANGE_SUB
+ CMOV_range
+ cmovae t2, litm_prob
+ CMOV_code
+ PUP_BASE_2 litm_prob, t2
+ PSTORE_LSL t2, prm, sym_R
+ adc sym, sym, sym
+.endm
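+
+# LITM_0 / LITM / LITM_2 decode a "matched" literal: while the decoded bits
+# still agree with (match), probs come from the 0x100-offset sub-tables.
+# C sketch, with a hypothetical decode_bit() helper for the range-coder step:
+/*
+    unsigned offs = 0x100;   // stays 0x100 while decoded bits match matchByte
+    unsigned sym = 1;
+    do
+    {
+        unsigned matchBit, b;
+        matchByte <<= 1;
+        matchBit = offs & matchByte;              // 0 or 0x100
+        b = decode_bit(probs + offs + matchBit + sym);
+        sym = (sym << 1) | b;
+        if (matchBit != (b << 8))
+            offs = 0;       // mismatch: use the plain literal table from now on
+    }
+    while (sym < 0x100);
+*/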
+
+
+# ---------- REVERSE BITS ----------
+
+.macro REV_0 prob:req
+ NORM_CALC \prob
+ CMOV_range
+ PLOAD t2, sym2_R
+ PLOAD_2 t3, probs, 3 * PMULT
+ CMOV_code_Model_Pre \prob
+ add t1_R, probs, 3 * PMULT
+ cmovae sym2_R, t1_R
+ PUP \prob, probs, 1 * PMULT
+ csel \prob, t2, t3, lo
+.endm
+
+
+.macro REV_1 prob:req, step:req
+ NORM_LSR
+ PLOAD_PREINDEXED t2, sym2_R, (\step * PMULT)
+ RANGE_IMUL \prob
+ COD_RANGE_SUB
+ CMOV_range
+ PLOAD_2 t3, sym2_R, (\step * PMULT)
+ sub t0, \prob, kBitModelOffset
+ CMOV_code
+ add t1_R, sym2_R, \step * PMULT
+ cmovae t0, \prob
+ cmovae sym2_R, t1_R
+ PUP_BASE_2 \prob, t0
+ csel \prob, t2, t3, lo
+ PSTORE_2 t0, t1_R, 0 - \step * PMULT_2
+.endm
+
+
+.macro REV_2 prob:req, step:req
+ sub t1_R, sym2_R, probs
+ NORM_LSR
+ orr sym, sym, t1, lsr #PSHIFT
+ RANGE_IMUL \prob
+ COD_RANGE_SUB
+ sub t2, sym, \step
+ CMOV_range
+ cmovb sym, t2
+ CMOV_code_Model_Pre \prob
+ PUP \prob, sym2_R, 0
+.endm
+
+
+.macro REV_1_VAR prob:req
+ PLOAD \prob, sym_R
+ mov probs, sym_R
+ p2_add sym_R, sym2_R
+ NORM_LSR
+ add t2_R, sym_R, sym2_R
+ RANGE_IMUL \prob
+ COD_RANGE_SUB
+ cmovae sym_R, t2_R
+ CMOV_range
+ CMOV_code_Model_Pre \prob
+ p2_add sym2, sym2
+ PUP \prob, probs, 0
+.endm
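+
+# The REV_* macros decode a reverse bit tree: same traversal as above, but
+# decoded bits are appended from the low end. C sketch:
+/*
+    unsigned m = 1, sym = 0, i = 0;
+    do
+    {
+        unsigned b = decode_bit(probs + m);   // decode_bit() as sketched above
+        m = (m << 1) + b;
+        sym |= b << i;
+    }
+    while (++i < numBits);
+*/
+# Instead of tracking (m), REV_0 / REV_1 / REV_2 step a pointer (sym2_R)
+# through the table and recover the decoded bits from pointer differences.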
+
+
+.macro add_big dest:req, src:req, param:req
+ .if (\param) < (1 << 12)
+ add \dest, \src, \param
+ .else
+ #ifndef _LZMA_PROB32
+        .error "unexpected add_big expansion"
+ #endif
+ add \dest, \src, (\param) / 2
+ add \dest, \dest, (\param) - (\param) / 2
+ .endif
+.endm
+
+.macro sub_big dest:req, src:req, param:req
+ .if (\param) < (1 << 12)
+ sub \dest, \src, \param
+ .else
+ #ifndef _LZMA_PROB32
+        .error "unexpected sub_big expansion"
+ #endif
+ sub \dest, \src, (\param) / 2
+ sub \dest, \dest, (\param) - (\param) / 2
+ .endif
+.endm
+
+
+.macro SET_probs offset:req
+ // add_big probs, probs_Spec, (\offset) * PMULT
+ add probs, probs_IsMatch, ((\offset) - IsMatch) * PMULT
+.endm
+
+
+.macro LIT_PROBS
+ add sym, sym, processedPos, lsl 8
+ inc processedPos
+ UPDATE_0__0
+ shl sym, lc2_lpMask
+ SET_probs Literal
+ p2_and sym, lc2_lpMask
+ // p2_add probs_state, pbPos_R
+ p2_add probs, sym_R
+ UPDATE_0__1
+ add probs, probs, sym_R, lsl 1
+ UPDATE_0__2 probs_state, pbPos_R, 0
+.endm
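+
+# LIT_PROBS selects the literal probability sub-table. Per the LZMA format:
+/*
+    litState = ((processedPos & ((1 << lp) - 1)) << lc) + (prevByte >> (8 - lc));
+    prob = probs + Literal + 0x300 * litState;
+*/
+# (lc2_lpMask), built in the prologue, fuses the lc/lp scaling and masking
+# into one shift + mask, and the UPDATE_0__* steps of the preceding IsMatch
+# bit are interleaved here to hide latency.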
+
+
+
+.equ kNumPosBitsMax , 4
+.equ kNumPosStatesMax , (1 << kNumPosBitsMax)
+
+.equ kLenNumLowBits , 3
+.equ kLenNumLowSymbols , (1 << kLenNumLowBits)
+.equ kLenNumHighBits , 8
+.equ kLenNumHighSymbols , (1 << kLenNumHighBits)
+.equ kNumLenProbs , (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols)
+
+.equ LenLow , 0
+.equ LenChoice , LenLow
+.equ LenChoice2 , (LenLow + kLenNumLowSymbols)
+.equ LenHigh , (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax)
+
+.equ kNumStates , 12
+.equ kNumStates2 , 16
+.equ kNumLitStates , 7
+
+.equ kStartPosModelIndex , 4
+.equ kEndPosModelIndex , 14
+.equ kNumFullDistances , (1 << (kEndPosModelIndex >> 1))
+
+.equ kNumPosSlotBits , 6
+.equ kNumLenToPosStates , 4
+
+.equ kNumAlignBits , 4
+.equ kAlignTableSize , (1 << kNumAlignBits)
+
+.equ kMatchMinLen , 2
+.equ kMatchSpecLenStart , (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols)
+
+// .equ kStartOffset , 1408
+.equ kStartOffset , 0
+.equ SpecPos , (-kStartOffset)
+.equ IsRep0Long , (SpecPos + kNumFullDistances)
+.equ RepLenCoder , (IsRep0Long + (kNumStates2 << kNumPosBitsMax))
+.equ LenCoder , (RepLenCoder + kNumLenProbs)
+.equ IsMatch , (LenCoder + kNumLenProbs)
+.equ kAlign , (IsMatch + (kNumStates2 << kNumPosBitsMax))
+.equ IsRep , (kAlign + kAlignTableSize)
+.equ IsRepG0 , (IsRep + kNumStates)
+.equ IsRepG1 , (IsRepG0 + kNumStates)
+.equ IsRepG2 , (IsRepG1 + kNumStates)
+.equ PosSlot , (IsRepG2 + kNumStates)
+.equ Literal , (PosSlot + (kNumLenToPosStates << kNumPosSlotBits))
+.equ NUM_BASE_PROBS , (Literal + kStartOffset)
+
+.if kStartOffset != 0 // && IsMatch != 0
+ .error "Stop_Compiling_Bad_StartOffset"
+.endif
+
+.if NUM_BASE_PROBS != 1984
+ .error "Stop_Compiling_Bad_LZMA_PROBS"
+.endif
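+
+// This layout must match the probs array layout in LzmaDec.c (with
+// kStartOffset == 0); the two compile-time checks above catch any
+// divergence between the C and ASM versions.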
+
+.equ offset_lc , 0
+.equ offset_lp , 1
+.equ offset_pb , 2
+.equ offset_dicSize , 4
+.equ offset_probs , 4 + offset_dicSize
+.equ offset_probs_1664 , 8 + offset_probs
+.equ offset_dic , 8 + offset_probs_1664
+.equ offset_dicBufSize , 8 + offset_dic
+.equ offset_dicPos , 8 + offset_dicBufSize
+.equ offset_buf , 8 + offset_dicPos
+.equ offset_range , 8 + offset_buf
+.equ offset_code , 4 + offset_range
+.equ offset_processedPos , 4 + offset_code
+.equ offset_checkDicSize , 4 + offset_processedPos
+.equ offset_rep0 , 4 + offset_checkDicSize
+.equ offset_rep1 , 4 + offset_rep0
+.equ offset_rep2 , 4 + offset_rep1
+.equ offset_rep3 , 4 + offset_rep2
+.equ offset_state , 4 + offset_rep3
+.equ offset_remainLen , 4 + offset_state
+.equ offset_TOTAL_SIZE , 4 + offset_remainLen
+
+.if offset_TOTAL_SIZE != 96
+ .error "Incorrect offset_TOTAL_SIZE"
+.endif
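+
+// These are the byte offsets of the CLzmaDec fields accessed below; the
+// first four are the CLzmaProps members (lc, lp, pb, dicSize) embedded at
+// the start of the structure. The check above guards against layout drift.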
+
+
+.macro IsMatchBranch_Pre
+ # prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;
+ and pbPos, pbMask, processedPos, lsl #(kLenNumLowBits + 1 + PSHIFT)
+ add probs_state, probs_IsMatch, state_R
+.endm
+
+
+/*
+.macro IsMatchBranch
+ IsMatchBranch_Pre
+ IF_BIT_1 probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label
+.endm
+*/
+
+.macro CheckLimits
+ cmp buf, bufLimit
+ jae fin_OK
+ cmp dicPos, limit
+ jae fin_OK
+.endm
+
+#define CheckLimits_lit CheckLimits
+/*
+.macro CheckLimits_lit
+ cmp buf, bufLimit
+ jae fin_OK_lit
+ cmp dicPos, limit
+ jae fin_OK_lit
+.endm
+*/
+
+
+#define PARAM_lzma REG_ABI_PARAM_0
+#define PARAM_limit REG_ABI_PARAM_1
+#define PARAM_bufLimit REG_ABI_PARAM_2
+
+
+.macro LOAD_LZMA_VAR reg:req, struct_offs:req
+ ldr \reg, [PARAM_lzma, \struct_offs]
+.endm
+
+.macro LOAD_LZMA_BYTE reg:req, struct_offs:req
+ ldrb \reg, [PARAM_lzma, \struct_offs]
+.endm
+
+.macro LOAD_LZMA_PAIR reg0:req, reg1:req, struct_offs:req
+ ldp \reg0, \reg1, [PARAM_lzma, \struct_offs]
+.endm
+
+
+LzmaDec_DecodeReal_3:
+_LzmaDec_DecodeReal_3:
+/*
+.LFB0:
+ .cfi_startproc
+*/
+
+ stp x19, x20, [sp, -128]!
+ stp x21, x22, [sp, 16]
+ stp x23, x24, [sp, 32]
+ stp x25, x26, [sp, 48]
+ stp x27, x28, [sp, 64]
+ stp x29, x30, [sp, 80]
+
+ str PARAM_lzma, [sp, 120]
+
+ mov bufLimit, PARAM_bufLimit
+ mov limit, PARAM_limit
+
+ LOAD_LZMA_PAIR dic, dicBufSize, offset_dic
+ LOAD_LZMA_PAIR dicPos, buf, offset_dicPos
+ LOAD_LZMA_PAIR rep0, rep1, offset_rep0
+ LOAD_LZMA_PAIR rep2, rep3, offset_rep2
+
+ mov t0, 1 << (kLenNumLowBits + 1 + PSHIFT)
+ LOAD_LZMA_BYTE pbMask, offset_pb
+ p2_add limit, dic
+        mov len, wzr // we could instead set it in all required branches
+ lsl pbMask, t0, pbMask
+ p2_add dicPos, dic
+ p2_sub pbMask, t0
+
+ LOAD_LZMA_BYTE lc2_lpMask, offset_lc
+ mov t0, 256 << PSHIFT
+ LOAD_LZMA_BYTE t1, offset_lp
+ p2_add t1, lc2_lpMask
+ p2_sub lc2_lpMask, (256 << PSHIFT) - PSHIFT
+ shl t0, t1
+ p2_add lc2_lpMask, t0
+
+ LOAD_LZMA_VAR probs_Spec, offset_probs
+ LOAD_LZMA_VAR checkDicSize, offset_checkDicSize
+ LOAD_LZMA_VAR processedPos, offset_processedPos
+ LOAD_LZMA_VAR state, offset_state
+        // range is r0 : this load must be last; don't move it
+ LOAD_LZMA_PAIR range, cod, offset_range
+ mov sym, wzr
+ shl state, PSHIFT
+
+ add_big probs_IsMatch, probs_Spec, ((IsMatch - SpecPos) << PSHIFT)
+
+ // if (processedPos != 0 || checkDicSize != 0)
+ orr t0, checkDicSize, processedPos
+ cbz t0, 1f
+ add t0_R, dicBufSize, dic
+ cmp dicPos, dic
+ cmovne t0_R, dicPos
+ ldrb sym, [t0_R, -1]
+1:
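+    // if any data was processed, sym now holds
+    // prevByte == dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]; otherwise it is 0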
+ IsMatchBranch_Pre
+ cmp state, 4 * PMULT
+ jb lit_end
+ cmp state, kNumLitStates * PMULT
+ jb lit_matched_end
+ jmp lz_end
+
+
+
+#define BIT_0 BIT_0_R prob_reg
+#define BIT_1 BIT_1_R prob_reg
+#define BIT_2 BIT_2_R prob_reg
+
+# ---------- LITERAL ----------
+MY_ALIGN_64
+lit_start:
+ mov state, wzr
+lit_start_2:
+ LIT_PROBS
+
+ #ifdef _LZMA_SIZE_OPT
+
+ PLOAD_2 prob_reg, probs, 1 * PMULT
+ mov sym, 1
+ BIT_01
+MY_ALIGN_FOR_LOOP
+lit_loop:
+ BIT_1
+ tbz sym, 7, lit_loop
+
+ #else
+
+ BIT_0
+ BIT_1
+ BIT_1
+ BIT_1
+ BIT_1
+ BIT_1
+ BIT_1
+
+ #endif
+
+ BIT_2
+ IsMatchBranch_Pre
+ strb sym, [dicPos], 1
+ p2_and sym, 255
+
+ CheckLimits_lit
+lit_end:
+ IF_BIT_0_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), lit_start
+
+ # jmp IsMatch_label
+
+
+#define FLAG_STATE_BITS (4 + PSHIFT)
+
+# ---------- MATCHES ----------
+# MY_ALIGN_FOR_ENTRY
+IsMatch_label:
+ UPDATE_1 probs_state, pbPos_R, (IsMatch - IsMatch)
+ IF_BIT_1 probs_state, 0, (IsRep - IsMatch), IsRep_label
+
+ SET_probs LenCoder
+ or state, (1 << FLAG_STATE_BITS)
+
+# ---------- LEN DECODE ----------
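+# C sketch of LZMA match-length decoding (low/mid tables are per posState;
+# tree3/tree8 stand for 3- and 8-bit bit-tree decodes, decode_bit as earlier):
+/*
+    if      (decode_bit(&p[LenChoice])  == 0)  len = kMatchMinLen      + tree3(low);
+    else if (decode_bit(&p[LenChoice2]) == 0)  len = kMatchMinLen + 8  + tree3(mid);
+    else                                       len = kMatchMinLen + 16 + tree8(high);
+*/
+# The asm folds these constants into the initial value of (len) and computes
+# (len = sym - len) once at len_mid_2.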
+len_decode:
+ mov len, 8 - kMatchMinLen
+ IF_BIT_0_NOUP_1 probs, len_mid_0
+ UPDATE_1 probs, 0, 0
+ p2_add probs, (1 << (kLenNumLowBits + PSHIFT))
+ mov len, 0 - kMatchMinLen
+ IF_BIT_0_NOUP_1 probs, len_mid_0
+ UPDATE_1 probs, 0, 0
+ p2_add probs, LenHigh * PMULT - (1 << (kLenNumLowBits + PSHIFT))
+
+ #if 0 == 1
+ BIT_0
+ BIT_1
+ BIT_1
+ BIT_1
+ BIT_1
+ BIT_1
+ #else
+ PLOAD_2 prob_reg, probs, 1 * PMULT
+ mov sym, 1
+ BIT_01
+MY_ALIGN_FOR_LOOP
+len8_loop:
+ BIT_1
+ tbz sym, 6, len8_loop
+ #endif
+
+ mov len, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - kMatchMinLen
+ jmp len_mid_2
+
+MY_ALIGN_FOR_ENTRY
+len_mid_0:
+ UPDATE_0 probs, 0, 0
+ p2_add probs, pbPos_R
+ BIT_0
+len_mid_2:
+ BIT_1
+ BIT_2
+ sub len, sym, len
+ tbz state, FLAG_STATE_BITS, copy_match
+
+# ---------- DECODE DISTANCE ----------
+ // probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);
+
+ mov t0, 3 + kMatchMinLen
+ cmp len, 3 + kMatchMinLen
+ cmovb t0, len
+ SET_probs PosSlot - (kMatchMinLen << (kNumPosSlotBits))
+ add probs, probs, t0_R, lsl #(kNumPosSlotBits + PSHIFT)
+
+ #ifdef _LZMA_SIZE_OPT
+
+ PLOAD_2 prob_reg, probs, 1 * PMULT
+ mov sym, 1
+ BIT_01
+MY_ALIGN_FOR_LOOP
+slot_loop:
+ BIT_1
+ tbz sym, 5, slot_loop
+
+ #else
+
+ BIT_0
+ BIT_1
+ BIT_1
+ BIT_1
+ BIT_1
+
+ #endif
+
+ #define numBits t4
+ mov numBits, sym
+ BIT_2
+ // we need only low bits
+ p2_and sym, 3
+ cmp numBits, 32 + kEndPosModelIndex / 2
+ jb short_dist
+
+ SET_probs kAlign
+
+ # unsigned numDirectBits = (unsigned)(((distance >> 1) - 1));
+ p2_sub numBits, (32 + 1 + kNumAlignBits)
+ # distance = (2 | (distance & 1));
+ or sym, 2
+ PLOAD_2 prob_reg, probs, 1 * PMULT
+ add sym2_R, probs, 2 * PMULT
+
+# ---------- DIRECT DISTANCE ----------
+
+.macro DIRECT_1
+ shr range, 1
+ subs t0, cod, range
+ p2_add sym, sym
+ // add t1, sym, 1
+ csel cod, cod, t0, mi
+ csinc sym, sym, sym, mi
+ // csel sym, t1, sym, pl
+        // adc sym, sym, sym // not 100% compatible for "corruption-allowed" LZMA streams
+ dec_s numBits
+ je direct_end
+.endm
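+
+# DIRECT_1 decodes one direct (equal-probability) distance bit. C sketch
+# (as in LzmaDec.c):
+/*
+    range >>= 1;
+    code -= range;
+    t = 0 - (code >> 31);          // t is 0xFFFFFFFF (bit 0) or 0 (bit 1)
+    code += range & t;
+    distance = (distance << 1) + (t + 1);
+*/
+# The subs/csel/csinc sequence above uses the sign flag directly instead of
+# materializing the mask (t).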
+
+ #ifdef _LZMA_SIZE_OPT
+
+ jmp direct_norm
+MY_ALIGN_FOR_ENTRY
+direct_loop:
+ DIRECT_1
+direct_norm:
+ TEST_HIGH_BYTE_range
+ jnz direct_loop
+ NORM_2
+ jmp direct_loop
+
+ #else
+
+.macro DIRECT_2
+ TEST_HIGH_BYTE_range
+ jz direct_unroll
+ DIRECT_1
+.endm
+
+ DIRECT_2
+ DIRECT_2
+ DIRECT_2
+ DIRECT_2
+ DIRECT_2
+ DIRECT_2
+ DIRECT_2
+ DIRECT_2
+
+direct_unroll:
+ NORM_2
+ DIRECT_1
+ DIRECT_1
+ DIRECT_1
+ DIRECT_1
+ DIRECT_1
+ DIRECT_1
+ DIRECT_1
+ DIRECT_1
+ jmp direct_unroll
+
+ #endif
+
+MY_ALIGN_FOR_ENTRY
+direct_end:
+ shl sym, kNumAlignBits
+ REV_0 prob_reg
+ REV_1 prob_reg, 2
+ REV_1 prob_reg, 4
+ REV_2 prob_reg, 8
+
+decode_dist_end:
+
+ // if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize))
+
+ tst checkDicSize, checkDicSize
+ csel t0, processedPos, checkDicSize, eq
+ cmp sym, t0
+ jae end_of_payload
+ // jmp end_of_payload # for debug
+
+ mov rep3, rep2
+ mov rep2, rep1
+ mov rep1, rep0
+ add rep0, sym, 1
+
+.macro STATE_UPDATE_FOR_MATCH
+ // state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
+ // cmp state, (kNumStates + kNumLitStates) * PMULT
+ cmp state, kNumLitStates * PMULT + (1 << FLAG_STATE_BITS)
+ mov state, kNumLitStates * PMULT
+ mov t0, (kNumLitStates + 3) * PMULT
+ cmovae state, t0
+.endm
+ STATE_UPDATE_FOR_MATCH
+
+# ---------- COPY MATCH ----------
+copy_match:
+
+ // if ((rem = limit - dicPos) == 0) break // return SZ_ERROR_DATA;
+ subs cnt_R, limit, dicPos
+ // jz fin_dicPos_LIMIT
+ jz fin_OK
+
+ // curLen = ((rem < len) ? (unsigned)rem : len);
+ cmp cnt_R, len_R
+ cmovae cnt, len
+
+ sub t0_R, dicPos, dic
+ p2_add dicPos, cnt_R
+ p2_add processedPos, cnt
+ p2_sub len, cnt
+
+ // pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0);
+ p2_sub_s t0_R, rep0_R
+ jae 1f
+
+ cmn t0_R, cnt_R
+ p2_add t0_R, dicBufSize
+ ja copy_match_cross
+1:
+# ---------- COPY MATCH FAST ----------
+ # t0_R : src_pos
+ p2_add t0_R, dic
+ ldrb sym, [t0_R]
+ p2_add t0_R, cnt_R
+ p1_neg cnt_R
+
+copy_common:
+ dec dicPos
+
+ # dicPos : (ptr_to_last_dest_BYTE)
+ # t0_R : (src_lim)
+ # cnt_R : (-curLen)
+
+ IsMatchBranch_Pre
+
+ inc_s cnt_R
+ jz copy_end
+
+ cmp rep0, 1
+ je copy_match_0
+
+ #ifdef LZMA_USE_2BYTES_COPY
+ strb sym, [dicPos, cnt_R]
+ dec dicPos
+ # dicPos : (ptr_to_last_dest_16bitWORD)
+ p2_and cnt_R, -2
+ ldrh sym, [t0_R, cnt_R]
+ adds cnt_R, cnt_R, 2
+ jz 2f
+MY_ALIGN_FOR_LOOP
+1:
+ /*
+ strh sym, [dicPos, cnt_R]
+ ldrh sym, [t0_R, cnt_R]
+ adds cnt_R, cnt_R, 2
+ jz 2f
+ */
+
+ strh sym, [dicPos, cnt_R]
+ ldrh sym, [t0_R, cnt_R]
+ adds cnt_R, cnt_R, 2
+ jnz 1b
+2:
+
+ /*
+ // for universal little/big endian code, but slow
+ strh sym, [dicPos]
+ inc dicPos
+ ldrb sym, [t0_R, -1]
+ */
+
+ #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    // big-endian detection should be improved for other compilers
+ // for big-endian we need to revert bytes
+ rev16 sym, sym
+ #endif
+
+    // (sym) must be represented as little-endian here:
+ strb sym, [dicPos], 1
+ shr sym, 8
+
+ #else
+
+MY_ALIGN_FOR_LOOP
+1:
+ strb sym, [dicPos, cnt_R]
+ ldrb sym, [t0_R, cnt_R]
+ inc_s cnt_R
+ jz copy_end
+
+ strb sym, [dicPos, cnt_R]
+ ldrb sym, [t0_R, cnt_R]
+ inc_s cnt_R
+ jnz 1b
+ #endif
+
+copy_end:
+lz_end_match:
+ strb sym, [dicPos], 1
+
+ # IsMatchBranch_Pre
+ CheckLimits
+lz_end:
+ IF_BIT_1_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label
+
+
+
+# ---------- LITERAL MATCHED ----------
+
+ LIT_PROBS
+
+ // matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
+
+ sub t0_R, dicPos, dic
+ p2_sub_s t0_R, rep0_R
+
+ #ifdef LZMA_USE_CMOV_LZ_WRAP
+ add t1_R, t0_R, dicBufSize
+ cmovb t0_R, t1_R
+ #else
+ jae 1f
+ p2_add t0_R, dicBufSize
+1:
+ #endif
+
+ ldrb match, [dic, t0_R]
+
+ // state -= (state < 10) ? 3 : 6;
+ sub sym, state, 6 * PMULT
+ cmp state, 10 * PMULT
+ p2_sub state, 3 * PMULT
+ cmovae state, sym
+
+ #ifdef _LZMA_SIZE_OPT
+
+ mov offs, 256 * PMULT
+ shl match, (PSHIFT + 1)
+ mov sym, 1
+ and bit, match, offs
+ add prm, probs, offs_R
+
+MY_ALIGN_FOR_LOOP
+litm_loop:
+ LITM
+ tbz sym, 8, litm_loop
+
+ #else
+
+ LITM_0
+ LITM
+ LITM
+ LITM
+ LITM
+ LITM
+ LITM
+ LITM_2
+
+ #endif
+
+ IsMatchBranch_Pre
+ strb sym, [dicPos], 1
+ p2_and sym, 255
+
+    // mov len, wzr // LITM uses the same register (len / offs), so we clear it
+ CheckLimits_lit
+lit_matched_end:
+ IF_BIT_1_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label
+ # IsMatchBranch
+ p2_sub state, 3 * PMULT
+ jmp lit_start_2
+
+
+
+# ---------- REP 0 LITERAL ----------
+MY_ALIGN_FOR_ENTRY
+IsRep0Short_label:
+ UPDATE_0 probs_state, pbPos_R, 0
+
+ // dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
+ sub t0_R, dicPos, dic
+
+ // state = state < kNumLitStates ? 9 : 11;
+ or state, 1 * PMULT
+
+    # the caller doesn't allow the (dicPos >= limit) case for REP_SHORT,
+    # so we don't need the following (dicPos == limit) check here:
+    # cmp dicPos, limit
+    # jae fin_dicPos_LIMIT_REP_SHORT
+    # // jmp fin_dicPos_LIMIT_REP_SHORT // for testing/debug purposes
+
+ inc processedPos
+
+ IsMatchBranch_Pre
+
+ p2_sub_s t0_R, rep0_R
+ #ifdef LZMA_USE_CMOV_LZ_WRAP
+ add sym_R, t0_R, dicBufSize
+ cmovb t0_R, sym_R
+ #else
+ jae 1f
+ p2_add t0_R, dicBufSize
+1:
+ #endif
+
+ ldrb sym, [dic, t0_R]
+ // mov len, wzr
+ jmp lz_end_match
+
+MY_ALIGN_FOR_ENTRY
+IsRep_label:
+ UPDATE_1 probs_state, 0, (IsRep - IsMatch)
+
+ # The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode.
+ # So we don't check it here.
+
+ # mov t0, processedPos
+ # or t0, checkDicSize
+ # jz fin_ERROR_2
+
+ // state = state < kNumLitStates ? 8 : 11;
+ cmp state, kNumLitStates * PMULT
+ mov state, 8 * PMULT
+ mov probBranch, 11 * PMULT
+ cmovae state, probBranch
+
+ SET_probs RepLenCoder
+
+ IF_BIT_1 probs_state, 0, (IsRepG0 - IsMatch), IsRepG0_label
+ sub_big probs_state, probs_state, (IsMatch - IsRep0Long) << PSHIFT
+ IF_BIT_0_NOUP probs_state, pbPos_R, 0, IsRep0Short_label
+ UPDATE_1 probs_state, pbPos_R, 0
+ jmp len_decode
+
+MY_ALIGN_FOR_ENTRY
+IsRepG0_label:
+ UPDATE_1 probs_state, 0, (IsRepG0 - IsMatch)
+ IF_BIT_1 probs_state, 0, (IsRepG1 - IsMatch), IsRepG1_label
+ mov dist, rep1
+ mov rep1, rep0
+ mov rep0, dist
+ jmp len_decode
+
+# MY_ALIGN_FOR_ENTRY
+IsRepG1_label:
+ UPDATE_1 probs_state, 0, (IsRepG1 - IsMatch)
+ IF_BIT_1 probs_state, 0, (IsRepG2 - IsMatch), IsRepG2_label
+ mov dist, rep2
+ mov rep2, rep1
+ mov rep1, rep0
+ mov rep0, dist
+ jmp len_decode
+
+# MY_ALIGN_FOR_ENTRY
+IsRepG2_label:
+ UPDATE_1 probs_state, 0, (IsRepG2 - IsMatch)
+ mov dist, rep3
+ mov rep3, rep2
+ mov rep2, rep1
+ mov rep1, rep0
+ mov rep0, dist
+ jmp len_decode
+
+
+
+# ---------- SPEC SHORT DISTANCE ----------
+
+MY_ALIGN_FOR_ENTRY
+short_dist:
+ p2_sub_s numBits, 32 + 1
+ jbe decode_dist_end
+ or sym, 2
+ shl sym, numBits
+ add sym_R, probs_Spec, sym_R, lsl #PSHIFT
+ p2_add sym_R, SpecPos * PMULT + 1 * PMULT
+ mov sym2, PMULT // # step
+MY_ALIGN_FOR_LOOP
+spec_loop:
+ REV_1_VAR prob_reg
+ dec_s numBits
+ jnz spec_loop
+
+ p2_add sym2_R, probs_Spec
+ .if SpecPos != 0
+ p2_add sym2_R, SpecPos * PMULT
+ .endif
+ p2_sub sym_R, sym2_R
+ shr sym, PSHIFT
+
+ jmp decode_dist_end
+
+
+
+# ---------- COPY MATCH 0 ----------
+MY_ALIGN_FOR_ENTRY
+copy_match_0:
+ #ifdef LZMA_USE_4BYTES_FILL
+ strb sym, [dicPos, cnt_R]
+ inc_s cnt_R
+ jz copy_end
+
+ strb sym, [dicPos, cnt_R]
+ inc_s cnt_R
+ jz copy_end
+
+ strb sym, [dicPos, cnt_R]
+ inc_s cnt_R
+ jz copy_end
+
+ orr t3, sym, sym, lsl 8
+ p2_and cnt_R, -4
+ orr t3, t3, t3, lsl 16
+MY_ALIGN_FOR_LOOP_16
+1:
+ /*
+ str t3, [dicPos, cnt_R]
+ adds cnt_R, cnt_R, 4
+ jz 2f
+ */
+
+ str t3, [dicPos, cnt_R]
+ adds cnt_R, cnt_R, 4
+ jnz 1b
+2:
+ // p2_and sym, 255
+ #else
+
+MY_ALIGN_FOR_LOOP
+1:
+ strb sym, [dicPos, cnt_R]
+ inc_s cnt_R
+ jz copy_end
+
+ strb sym, [dicPos, cnt_R]
+ inc_s cnt_R
+ jnz 1b
+ #endif
+
+ jmp copy_end
+
+
+# ---------- COPY MATCH CROSS ----------
+copy_match_cross:
+ # t0_R - src pos
+ # cnt_R - total copy len
+
+ p1_neg cnt_R
+1:
+ ldrb sym, [dic, t0_R]
+ inc t0_R
+ strb sym, [dicPos, cnt_R]
+ inc cnt_R
+ cmp t0_R, dicBufSize
+ jne 1b
+
+ ldrb sym, [dic]
+ sub t0_R, dic, cnt_R
+ jmp copy_common
+
+
+
+
+/*
+fin_dicPos_LIMIT_REP_SHORT:
+ mov len, 1
+ jmp fin_OK
+*/
+
+/*
+fin_dicPos_LIMIT:
+ jmp fin_OK
+ # For more strict mode we can stop decoding with error
+ # mov sym, 1
+ # jmp fin
+*/
+
+fin_ERROR_MATCH_DIST:
+ # rep0 = distance + 1;
+ p2_add len, kMatchSpecLen_Error_Data
+ mov rep3, rep2
+ mov rep2, rep1
+ mov rep1, rep0
+ mov rep0, sym
+ STATE_UPDATE_FOR_MATCH
+ # jmp fin_OK
+ mov sym, 1
+ jmp fin
+
+end_of_payload:
+ inc_s sym
+ jnz fin_ERROR_MATCH_DIST
+
+ mov len, kMatchSpecLenStart
+ xor state, (1 << FLAG_STATE_BITS)
+ jmp fin_OK
+
+/*
+fin_OK_lit:
+ mov len, wzr
+*/
+
+fin_OK:
+ mov sym, wzr
+
+fin:
+ NORM
+
+ #define fin_lzma_reg t0_R
+
+ .macro STORE_LZMA_VAR reg:req, struct_offs:req
+ str \reg, [fin_lzma_reg, \struct_offs]
+ .endm
+
+ .macro STORE_LZMA_PAIR reg0:req, reg1:req, struct_offs:req
+ stp \reg0, \reg1, [fin_lzma_reg, \struct_offs]
+ .endm
+
+ ldr fin_lzma_reg, [sp, 120]
+ p2_sub dicPos, dic
+ shr state, PSHIFT
+
+ STORE_LZMA_PAIR dicPos, buf, offset_dicPos
+ STORE_LZMA_PAIR range, cod, offset_range
+ STORE_LZMA_VAR processedPos, offset_processedPos
+ STORE_LZMA_PAIR rep0, rep1, offset_rep0
+ STORE_LZMA_PAIR rep2, rep3, offset_rep2
+ STORE_LZMA_PAIR state, len, offset_state
+
+ mov w0, sym
+
+ ldp x29, x30, [sp, 80]
+ ldp x27, x28, [sp, 64]
+ ldp x25, x26, [sp, 48]
+ ldp x23, x24, [sp, 32]
+ ldp x21, x22, [sp, 16]
+ ldp x19, x20, [sp], 128
+
+ ret
+/*
+ .cfi_endproc
+.LFE0:
+ .size LzmaDec_DecodeReal_3, .-LzmaDec_DecodeReal_3
+ .ident "TAG_LZMA"
+ .section .note.GNU-stack,"",@progbits
+*/