update libchdr
diff --git a/deps/libchdr/deps/lzma-22.01/src/Asm/arm64/LzmaDecOpt.S b/deps/libchdr/deps/lzma-22.01/src/Asm/arm64/LzmaDecOpt.S
new file mode 100644 (file)
index 0000000..10dc473
--- /dev/null
+++ b/deps/libchdr/deps/lzma-22.01/src/Asm/arm64/LzmaDecOpt.S
@@ -0,0 +1,1487 @@
+// LzmaDecOpt.S -- ARM64-ASM version of LzmaDec_DecodeReal_3() function
+// 2021-04-25 : Igor Pavlov : Public domain
+
+/*
+; 3 is the code-compatibility version of the LzmaDec_DecodeReal_*()
+; function, checked at link time.
+; That code is tightly coupled with LzmaDec_TryDummy()
+; and with other functions in the LzmaDec.c file.
+; The CLzmaDec structure, the (probs) array layout, and the input and
+; output of LzmaDec_DecodeReal_*() must be equal in both versions (C / ASM).
+*/
+
+
+#include "7zAsm.S"
+
+       // .arch armv8-a
+       // .file        "LzmaDecOpt.c"
+       .text
+       .align  2
+       .p2align 4,,15
+#ifdef __APPLE__
+        .globl _LzmaDec_DecodeReal_3
+#else        
+       .global LzmaDec_DecodeReal_3
+#endif        
+       // .type LzmaDec_DecodeReal_3, %function
+
+// #define _LZMA_SIZE_OPT 1
+
+#define LZMA_USE_4BYTES_FILL 1
+// #define LZMA_USE_2BYTES_COPY 1
+// #define LZMA_USE_CMOV_LZ_WRAP 1
+// #define _LZMA_PROB32 1
+
+#define MY_ALIGN_FOR_ENTRY   MY_ALIGN_32
+#define MY_ALIGN_FOR_LOOP    MY_ALIGN_32
+#define MY_ALIGN_FOR_LOOP_16 MY_ALIGN_16
+
+#ifdef _LZMA_PROB32
+        .equ PSHIFT , 2
+        .macro PLOAD dest:req, mem:req
+                ldr     \dest, [\mem]
+        .endm
+        .macro PLOAD_PREINDEXED dest:req, mem:req, offset:req
+                ldr     \dest, [\mem, \offset]!
+        .endm
+        .macro PLOAD_2 dest:req, mem1:req, mem2:req
+                ldr     \dest, [\mem1, \mem2]
+        .endm
+        .macro PLOAD_LSL dest:req, mem1:req, mem2:req
+                ldr     \dest, [\mem1, \mem2, lsl #PSHIFT]
+        .endm
+        .macro PSTORE src:req, mem:req
+                str     \src, [\mem]
+        .endm
+        .macro PSTORE_2 src:req, mem1:req, mem2:req
+                str     \src, [\mem1, \mem2]
+        .endm
+        .macro PSTORE_LSL src:req, mem1:req, mem2:req
+                str     \src, [\mem1, \mem2, lsl #PSHIFT]
+        .endm
+        .macro PSTORE_LSL_M1 src:req, mem1:req, mem2:req, temp_reg:req
+                // the caller must ensure that temp_reg is a free register when this macro is used
+                add     \temp_reg, \mem1, \mem2
+                str     \src, [\temp_reg, \mem2]
+        .endm
+#else
+        // .equ PSHIFT  , 1
+        #define PSHIFT  1
+        .macro PLOAD dest:req, mem:req
+                ldrh    \dest, [\mem]
+        .endm
+        .macro PLOAD_PREINDEXED dest:req, mem:req, offset:req
+                ldrh    \dest, [\mem, \offset]!
+        .endm
+        .macro PLOAD_2 dest:req, mem1:req, mem2:req
+                ldrh    \dest, [\mem1, \mem2]
+        .endm
+        .macro PLOAD_LSL dest:req, mem1:req, mem2:req
+                ldrh    \dest, [\mem1, \mem2, lsl #PSHIFT]
+        .endm
+        .macro PSTORE src:req, mem:req
+                strh    \src, [\mem]
+        .endm
+        .macro PSTORE_2 src:req, mem1:req, mem2:req
+                strh    \src, [\mem1, \mem2]
+        .endm
+        .macro PSTORE_LSL src:req, mem1:req, mem2:req
+                strh    \src, [\mem1, \mem2, lsl #PSHIFT]
+        .endm
+        .macro PSTORE_LSL_M1 src:req, mem1:req, mem2:req, temp_reg:req
+                strh    \src, [\mem1, \mem2]
+        .endm
+#endif
+
+.equ PMULT    , (1 << PSHIFT)
+.equ PMULT_2  , (2 << PSHIFT)
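+
+// probs[] entries are CLzmaProb values: 16-bit by default, 32-bit when
+// _LZMA_PROB32 is defined. PMULT = sizeof(CLzmaProb) = (1 << PSHIFT)
+// converts a prob index into a byte offset, so (i * PMULT) addresses probs[i].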
+
+.equ kMatchSpecLen_Error_Data , (1 << 9)
+
+#       x7      t0 : NORM_CALC    : prob2 (IF_BIT_1)
+#       x6      t1 : NORM_CALC    : probs_state
+#       x8      t2 : (LITM) temp  : (TREE) temp
+#       x4      t3 : (LITM) bit   : (TREE) temp : UPDATE_0/UPDATE_1 temp
+#       x10     t4 : (LITM) offs  : (TREE) probs_PMULT : numBits
+#       x9      t5 : (LITM) match : sym2 (ShortDist)
+#       x1      t6 : (LITM) litm_prob : (TREE) prob_reg : pbPos
+#       x2      t7 : (LITM) prm   : probBranch  : cnt
+#       x3      sym : dist
+#       x12     len
+#       x0      range
+#       x5      cod
+
+
+#define range   w0
+
+// t6
+#define pbPos     w1
+#define pbPos_R   r1
+#define prob_reg  w1
+#define litm_prob    prob_reg
+
+// t7
+#define probBranch    w2
+#define cnt     w2
+#define cnt_R   r2
+#define prm     r2
+
+#define sym     w3
+#define sym_R   r3
+#define dist       sym
+
+#define t3      w4
+#define bit     w4
+#define bit_R   r4
+#define update_temp_reg  r4
+
+#define cod     w5
+
+#define t1      w6
+#define t1_R    r6
+#define probs_state  t1_R
+
+#define t0      w7
+#define t0_R    r7
+#define prob2      t0
+
+#define t2      w8
+#define t2_R    r8 
+
+// t5
+#define match   w9
+#define sym2    w9
+#define sym2_R  r9
+
+#define t4      w10
+#define t4_R    r10
+
+#define offs    w10
+#define offs_R  r10
+
+#define probs   r11
+
+#define len     w12
+#define len_R   x12
+
+#define state   w13
+#define state_R r13
+
+#define dicPos          r14
+#define buf             r15
+#define bufLimit        r16
+#define dicBufSize      r17
+
+#define limit           r19
+#define rep0            w20
+#define rep0_R          r20
+#define rep1            w21
+#define rep2            w22
+#define rep3            w23
+#define dic             r24
+#define probs_IsMatch   r25
+#define probs_Spec      r26
+#define checkDicSize    w27
+#define processedPos    w28
+#define pbMask          w29
+#define lc2_lpMask      w30
+
+
+.equ kNumBitModelTotalBits   , 11
+.equ kBitModelTotal          , (1 << kNumBitModelTotalBits)
+.equ kNumMoveBits            , 5
+.equ kBitModelOffset         , (kBitModelTotal - (1 << kNumMoveBits) + 1)
+
+.macro NORM_2 macro
+        ldrb    t0, [buf], 1
+        shl     range, 8
+        orr     cod, t0, cod, lsl 8
+        /*
+        mov     t0, cod
+        ldrb    cod, [buf], 1
+        shl     range, 8
+        bfi    cod, t0, #8, #24
+        */
+.endm
+
+.macro TEST_HIGH_BYTE_range macro
+        tst     range, 0xFF000000
+.endm   
+
+.macro NORM macro
+        TEST_HIGH_BYTE_range
+        jnz     1f
+        NORM_2
+1:
+.endm
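+
+// range-coder normalization (as in LzmaDec.c): if the top byte of (range)
+// is zero, i.e. range < (1 << 24), shift one more input byte into (cod):
+//   range <<= 8;  cod = (cod << 8) | *buf++;
+// TEST_HIGH_BYTE_range tests (range & 0xFF000000) so the common case
+// falls through without a refill.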
+
+
+# ---------- Branch MACROS ----------
+
+.macro UPDATE_0__0
+        sub     prob2, probBranch, kBitModelOffset
+.endm
+
+.macro UPDATE_0__1
+        sub     probBranch, probBranch, prob2, asr #(kNumMoveBits)
+.endm
+
+.macro UPDATE_0__2 probsArray:req, probOffset:req, probDisp:req
+     .if \probDisp == 0
+        PSTORE_2  probBranch, \probsArray, \probOffset
+    .elseif \probOffset == 0
+        PSTORE_2  probBranch, \probsArray, \probDisp * PMULT
+    .else
+        .error "unsupported"
+        // add     update_temp_reg, \probsArray, \probOffset
+        PSTORE_2  probBranch, update_temp_reg, \probDisp * PMULT
+    .endif
+.endm
+
+.macro UPDATE_0 probsArray:req, probOffset:req, probDisp:req
+        UPDATE_0__0
+        UPDATE_0__1
+        UPDATE_0__2 \probsArray, \probOffset, \probDisp
+.endm
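+
+// UPDATE_0 implements the C update  prob += (kBitModelTotal - prob) >> kNumMoveBits
+// as  prob -= (prob - kBitModelOffset) asr kNumMoveBits;
+// kBitModelOffset = kBitModelTotal - (1 << kNumMoveBits) + 1 makes the two
+// roundings agree. Worked example, prob = 1024:
+//   C   : 1024 + ((2048 - 1024) >> 5)    = 1024 + 32    = 1056
+//   here: 1024 - ((1024 - 2017) asr 5)   = 1024 - (-32) = 1056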
+
+
+.macro UPDATE_1 probsArray:req, probOffset:req, probDisp:req
+        // sub     cod, cod, prob2
+        // sub     range, range, prob2
+        p2_sub  cod, range
+        sub     range, prob2, range
+        sub     prob2, probBranch, probBranch, lsr #(kNumMoveBits)
+    .if \probDisp == 0
+        PSTORE_2  prob2, \probsArray, \probOffset
+    .elseif \probOffset == 0
+        PSTORE_2  prob2, \probsArray, \probDisp * PMULT
+    .else
+        .error "unsupported"
+        // add     update_temp_reg, \probsArray, \probOffset
+        PSTORE_2  prob2, update_temp_reg, \probDisp * PMULT
+    .endif
+.endm
+
+
+.macro CMP_COD_BASE
+        NORM
+        // lsr     prob2, range, kNumBitModelTotalBits
+        // imul    prob2, probBranch
+        // cmp     cod, prob2
+        mov     prob2, range
+        shr     range, kNumBitModelTotalBits
+        imul    range, probBranch
+        cmp     cod, range
+.endm
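+
+// CMP_COD_BASE computes the range-coder bound directly into (range):
+//   prob2 = range;  range = (range >> kNumBitModelTotalBits) * prob;
+// and compares (cod) against it. On the bit-0 path (cod < bound) the new
+// range is already in place; the bit-1 path (UPDATE_1) rebuilds
+// range = old_range - bound from prob2 and subtracts bound from (cod).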
+
+.macro CMP_COD_1 probsArray:req
+        PLOAD   probBranch, \probsArray
+        CMP_COD_BASE
+.endm
+
+.macro CMP_COD_3 probsArray:req, probOffset:req, probDisp:req
+    .if \probDisp == 0
+        PLOAD_2 probBranch, \probsArray, \probOffset
+    .elseif \probOffset == 0
+        PLOAD_2 probBranch, \probsArray, \probDisp * PMULT
+    .else
+        .error "unsupported"
+        add     update_temp_reg, \probsArray, \probOffset
+        PLOAD_2 probBranch, update_temp_reg, \probDisp * PMULT
+    .endif
+        CMP_COD_BASE
+.endm
+
+
+.macro IF_BIT_1_NOUP probsArray:req, probOffset:req, probDisp:req, toLabel:req
+        CMP_COD_3 \probsArray, \probOffset, \probDisp
+        jae     \toLabel
+.endm
+
+
+.macro IF_BIT_1 probsArray:req, probOffset:req, probDisp:req, toLabel:req
+        IF_BIT_1_NOUP \probsArray, \probOffset, \probDisp, \toLabel
+        UPDATE_0 \probsArray, \probOffset, \probDisp
+.endm
+
+
+.macro IF_BIT_0_NOUP probsArray:req, probOffset:req, probDisp:req, toLabel:req
+        CMP_COD_3 \probsArray, \probOffset, \probDisp
+        jb      \toLabel
+.endm
+
+.macro IF_BIT_0_NOUP_1 probsArray:req, toLabel:req
+        CMP_COD_1 \probsArray
+        jb      \toLabel
+.endm
+
+
+# ---------- CMOV MACROS ----------
+
+.macro NORM_LSR
+        NORM
+        lsr     t0, range, #kNumBitModelTotalBits
+.endm
+
+.macro COD_RANGE_SUB
+        subs    t1, cod, t0
+        p2_sub  range, t0
+.endm
+
+.macro RANGE_IMUL prob:req
+        imul    t0, \prob
+.endm
+
+.macro NORM_CALC prob:req
+        NORM_LSR
+        RANGE_IMUL \prob
+        COD_RANGE_SUB
+.endm
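+
+// branchless (CMOV) decode: after NORM_CALC, t0 = bound, t1 = cod - bound,
+// range -= bound, and the flags from (subs t1, cod, t0) are still live:
+//   lo (cod <  bound): bit 0 -> CMOV_range picks range = bound (t0)
+//   hs (cod >= bound): bit 1 -> CMOV_code  picks cod = cod - bound (t1)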
+
+.macro CMOV_range
+        cmovb   range, t0
+.endm
+
+.macro CMOV_code
+        cmovae  cod, t1
+.endm
+
+.macro CMOV_code_Model_Pre prob:req
+        sub     t0, \prob, kBitModelOffset
+        CMOV_code
+        cmovae  t0, \prob
+.endm
+        
+
+.macro PUP_BASE_2 prob:req, dest_reg:req
+        # only an arithmetic shift (asr) works for both 16/32-bit prob modes
+        sub     \dest_reg, \prob, \dest_reg, asr #(kNumMoveBits)
+.endm
+
+.macro PUP prob:req, probPtr:req, mem2:req
+        PUP_BASE_2 \prob, t0
+        PSTORE_2   t0, \probPtr, \mem2
+.endm
+
+
+
+#define probs_PMULT t4_R
+
+.macro BIT_01
+        add     probs_PMULT, probs, PMULT
+.endm
+
+
+.macro BIT_0_R prob:req
+        PLOAD_2 \prob, probs, 1 * PMULT
+        NORM_LSR
+            sub     t3, \prob, kBitModelOffset
+        RANGE_IMUL  \prob
+            PLOAD_2 t2, probs, 1 * PMULT_2
+        COD_RANGE_SUB
+        CMOV_range
+            cmovae  t3, \prob
+        PLOAD_2 t0, probs, 1 * PMULT_2 + PMULT
+            PUP_BASE_2 \prob, t3
+        csel   \prob, t2, t0, lo
+            CMOV_code
+        mov     sym, 2
+        PSTORE_2  t3, probs, 1 * PMULT
+            adc     sym, sym, wzr
+        BIT_01
+.endm
+
+.macro BIT_1_R prob:req
+        NORM_LSR
+            p2_add  sym, sym
+            sub     t3, \prob, kBitModelOffset
+        RANGE_IMUL  \prob
+            PLOAD_LSL t2, probs, sym_R
+        COD_RANGE_SUB
+        CMOV_range
+            cmovae  t3, \prob
+        PLOAD_LSL t0, probs_PMULT, sym_R
+            PUP_BASE_2 \prob, t3
+        csel   \prob, t2, t0, lo
+            CMOV_code
+        PSTORE_LSL_M1  t3, probs, sym_R, t2_R
+            adc     sym, sym, wzr
+.endm
+
+
+.macro BIT_2_R prob:req
+        NORM_LSR
+            p2_add  sym, sym
+            sub     t3, \prob, kBitModelOffset
+        RANGE_IMUL  \prob
+        COD_RANGE_SUB
+        CMOV_range
+            cmovae  t3, \prob
+            CMOV_code
+            PUP_BASE_2 \prob, t3
+        PSTORE_LSL_M1  t3, probs, sym_R, t2_R
+            adc     sym, sym, wzr
+.endm
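+
+// BIT_0_R / BIT_1_R / BIT_2_R each decode one level of a bit tree without
+// branching: (sym) is the tree index, updated as sym = 2 * sym + bit, the
+// bit taken from the carry left by COD_RANGE_SUB (adc). Both child probs,
+// probs[sym] and probs_PMULT[sym] = probs[sym + 1], are loaded ahead of
+// time and csel keeps the one matching the decoded bit; PSTORE_LSL_M1
+// writes the updated prob back through the pre-doubled index.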
+
+
+# ---------- MATCHED LITERAL ----------
+
+.macro LITM_0 macro
+        shl     match, (PSHIFT + 1)
+        and     bit, match, 256 * PMULT
+        add     prm, probs, 256 * PMULT + 1 * PMULT
+        p2_add  match, match
+        p2_add  prm, bit_R
+        eor     offs, bit, 256 * PMULT
+        PLOAD   litm_prob, prm
+        
+        NORM_LSR
+            sub     t2, litm_prob, kBitModelOffset
+        RANGE_IMUL  litm_prob
+        COD_RANGE_SUB
+        cmovae  offs, bit
+            CMOV_range
+        and     bit, match, offs
+            cmovae  t2, litm_prob
+            CMOV_code
+            mov     sym, 2
+        PUP_BASE_2 litm_prob, t2
+        PSTORE  t2, prm
+        add     prm, probs, offs_R
+        adc     sym, sym, wzr
+.endm
+
+.macro LITM macro
+        p2_add  prm, bit_R
+            xor     offs, bit
+        PLOAD_LSL litm_prob, prm, sym_R
+        
+        NORM_LSR
+            p2_add  match, match
+            sub     t2, litm_prob, kBitModelOffset
+        RANGE_IMUL  litm_prob
+        COD_RANGE_SUB
+        cmovae  offs, bit
+            CMOV_range
+        and     bit, match, offs
+            cmovae  t2, litm_prob
+            CMOV_code
+        PUP_BASE_2 litm_prob, t2
+        PSTORE_LSL t2, prm, sym_R
+        add     prm, probs, offs_R
+        adc     sym, sym, sym
+.endm
+
+
+.macro LITM_2 macro
+        p2_add  prm, bit_R
+        PLOAD_LSL litm_prob, prm, sym_R
+        
+        NORM_LSR
+            sub     t2, litm_prob, kBitModelOffset
+        RANGE_IMUL  litm_prob
+        COD_RANGE_SUB
+            CMOV_range
+            cmovae  t2, litm_prob
+            CMOV_code
+        PUP_BASE_2 litm_prob, t2
+        PSTORE_LSL t2, prm, sym_R
+        adc     sym, sym, sym
+.endm
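+
+// Matched-literal decode; the corresponding loop in LzmaDec.c is, in outline:
+//   offs = 0x100; symbol = 1;
+//   do { matchByte <<= 1; bit = offs; offs &= matchByte;
+//        probLit = prob + offs + bit + symbol;
+//        GET_BIT2(probLit, symbol, offs ^= bit, ;);
+//   } while (symbol < 0x100);
+// While each decoded bit equals the corresponding bit of the byte at
+// distance rep0, (offs) stays 0x100 and selects the "matched" half of the
+// table; after the first mismatch it drops to 0 and decoding continues as
+// a plain literal (all offsets here are scaled by PMULT).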
+
+
+# ---------- REVERSE BITS ----------
+
+.macro REV_0 prob:req
+        NORM_CALC \prob
+        CMOV_range
+        PLOAD   t2, sym2_R
+        PLOAD_2 t3, probs, 3 * PMULT
+        CMOV_code_Model_Pre \prob
+        add     t1_R, probs, 3 * PMULT
+        cmovae  sym2_R, t1_R
+        PUP     \prob, probs, 1 * PMULT
+        csel    \prob, t2, t3, lo
+.endm
+
+
+.macro REV_1 prob:req, step:req
+        NORM_LSR
+            PLOAD_PREINDEXED  t2, sym2_R, (\step * PMULT)
+        RANGE_IMUL  \prob
+        COD_RANGE_SUB
+        CMOV_range
+        PLOAD_2 t3, sym2_R, (\step * PMULT)
+        sub     t0, \prob, kBitModelOffset
+        CMOV_code
+        add     t1_R, sym2_R, \step * PMULT
+        cmovae  t0, \prob
+        cmovae  sym2_R, t1_R
+        PUP_BASE_2 \prob, t0
+        csel    \prob, t2, t3, lo
+        PSTORE_2   t0, t1_R, 0 - \step * PMULT_2
+.endm
+
+
+.macro REV_2 prob:req, step:req
+        sub     t1_R, sym2_R, probs
+        NORM_LSR
+            orr     sym, sym, t1, lsr #PSHIFT
+        RANGE_IMUL  \prob
+        COD_RANGE_SUB
+        sub     t2, sym, \step
+        CMOV_range
+        cmovb   sym, t2
+        CMOV_code_Model_Pre \prob
+        PUP     \prob, sym2_R, 0
+.endm
+
+
+.macro REV_1_VAR prob:req
+        PLOAD   \prob, sym_R
+        mov     probs, sym_R
+        p2_add  sym_R, sym2_R
+        NORM_LSR
+            add     t2_R, sym_R, sym2_R
+        RANGE_IMUL  \prob
+        COD_RANGE_SUB
+        cmovae  sym_R, t2_R
+        CMOV_range
+        CMOV_code_Model_Pre \prob
+        p2_add  sym2, sym2
+        PUP     \prob, probs, 0
+.endm
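+
+// Reverse bit trees (used for the align bits and the SpecPos distance
+// bits) emit bits LSB-first: the tree index advances as m = (m << 1) + bit
+// while each bit joins the result at increasing weight. Here the index is
+// kept as a pointer stepped in units of (step * PMULT), and REV_2 / the
+// code after spec_loop recover the numeric value by pointer arithmetic
+// against (probs) / (probs_Spec).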
+
+
+.macro add_big dest:req, src:req, param:req
+    .if (\param) < (1 << 12)
+        add     \dest, \src, \param
+    .else
+        #ifndef _LZMA_PROB32    
+          .error "unexpected add_big expansion"
+        #endif
+        add     \dest, \src, (\param) / 2
+        add     \dest, \dest, (\param) - (\param) / 2
+    .endif
+.endm
+
+.macro sub_big dest:req, src:req, param:req
+    .if (\param) < (1 << 12)
+        sub     \dest, \src, \param
+    .else
+        #ifndef _LZMA_PROB32    
+          .error "unexpected sub_big expansion"
+        #endif
+        sub     \dest, \src, (\param) / 2
+        sub     \dest, \dest, (\param) - (\param) / 2
+    .endif
+.endm
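+
+// add_big / sub_big: the AArch64 add/sub (immediate) encoding holds only
+// 12 bits, so a byte offset that can exceed 4095 (possible only with the
+// _LZMA_PROB32 layout) is applied in two halves.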
+
+
+.macro SET_probs offset:req
+        // add_big probs, probs_Spec, (\offset) * PMULT
+        add     probs, probs_IsMatch, ((\offset) - IsMatch) * PMULT
+.endm        
+
+
+.macro LIT_PROBS
+        add     sym, sym, processedPos, lsl 8
+        inc     processedPos
+        UPDATE_0__0
+        shl     sym, lc2_lpMask
+        SET_probs Literal
+        p2_and  sym, lc2_lpMask
+        // p2_add  probs_state, pbPos_R
+        p2_add  probs, sym_R
+        UPDATE_0__1
+        add     probs, probs, sym_R, lsl 1
+        UPDATE_0__2 probs_state, pbPos_R, 0
+.endm
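+
+// LIT_PROBS selects the literal probability sub-table from processedPos
+// (the lp position bits) and the previous byte in (sym) (the lc context
+// bits); lc and lp were packed into lc2_lpMask at function entry. The
+// pending UPDATE_0 store for the IsMatch prob is interleaved here to help
+// instruction scheduling.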
+
+
+
+.equ kNumPosBitsMax       , 4
+.equ kNumPosStatesMax     , (1 << kNumPosBitsMax)
+                         
+.equ kLenNumLowBits       , 3
+.equ kLenNumLowSymbols    , (1 << kLenNumLowBits)
+.equ kLenNumHighBits      , 8
+.equ kLenNumHighSymbols   , (1 << kLenNumHighBits)
+.equ kNumLenProbs         , (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols)
+                         
+.equ LenLow               , 0
+.equ LenChoice            , LenLow
+.equ LenChoice2           , (LenLow + kLenNumLowSymbols)
+.equ LenHigh              , (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax)
+                         
+.equ kNumStates           , 12
+.equ kNumStates2          , 16
+.equ kNumLitStates        , 7
+                         
+.equ kStartPosModelIndex  , 4
+.equ kEndPosModelIndex    , 14
+.equ kNumFullDistances    , (1 << (kEndPosModelIndex >> 1))
+                         
+.equ kNumPosSlotBits      , 6
+.equ kNumLenToPosStates   , 4
+                         
+.equ kNumAlignBits        , 4
+.equ kAlignTableSize      , (1 << kNumAlignBits)
+                         
+.equ kMatchMinLen         , 2
+.equ kMatchSpecLenStart   , (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols)
+
+// .equ kStartOffset    , 1408
+.equ kStartOffset    , 0
+.equ SpecPos         , (-kStartOffset)
+.equ IsRep0Long      , (SpecPos + kNumFullDistances)
+.equ RepLenCoder     , (IsRep0Long + (kNumStates2 << kNumPosBitsMax))
+.equ LenCoder        , (RepLenCoder + kNumLenProbs)
+.equ IsMatch         , (LenCoder + kNumLenProbs)
+.equ kAlign          , (IsMatch + (kNumStates2 << kNumPosBitsMax))
+.equ IsRep           , (kAlign + kAlignTableSize)
+.equ IsRepG0         , (IsRep + kNumStates)
+.equ IsRepG1         , (IsRepG0 + kNumStates)
+.equ IsRepG2         , (IsRepG1 + kNumStates)
+.equ PosSlot         , (IsRepG2 + kNumStates)
+.equ Literal         , (PosSlot + (kNumLenToPosStates << kNumPosSlotBits))
+.equ NUM_BASE_PROBS  , (Literal + kStartOffset)
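+
+// With kStartOffset = 0 the layout above evaluates to:
+//   SpecPos = 0, IsRep0Long = 128, RepLenCoder = 384, LenCoder = 896,
+//   IsMatch = 1408, kAlign = 1664 (the CLzmaDec probs_1664 pointer),
+//   IsRep = 1680, IsRepG0 = 1692, IsRepG1 = 1704, IsRepG2 = 1716,
+//   PosSlot = 1728, Literal = 1984 = NUM_BASE_PROBS,
+// which the checks below verify.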
+
+.if kStartOffset != 0   // && IsMatch != 0
+  .error "Stop_Compiling_Bad_StartOffset"
+.endif
+
+.if NUM_BASE_PROBS != 1984
+  .error "Stop_Compiling_Bad_LZMA_PROBS"
+.endif
+
+.equ offset_lc    , 0
+.equ offset_lp    , 1
+.equ offset_pb    , 2
+.equ offset_dicSize       , 4
+.equ offset_probs         , 4 + offset_dicSize
+.equ offset_probs_1664    , 8 + offset_probs
+.equ offset_dic           , 8 + offset_probs_1664
+.equ offset_dicBufSize    , 8 + offset_dic
+.equ offset_dicPos        , 8 + offset_dicBufSize
+.equ offset_buf           , 8 + offset_dicPos
+.equ offset_range         , 8 + offset_buf
+.equ offset_code          , 4 + offset_range
+.equ offset_processedPos  , 4 + offset_code
+.equ offset_checkDicSize  , 4 + offset_processedPos
+.equ offset_rep0          , 4 + offset_checkDicSize
+.equ offset_rep1          , 4 + offset_rep0
+.equ offset_rep2          , 4 + offset_rep1
+.equ offset_rep3          , 4 + offset_rep2
+.equ offset_state         , 4 + offset_rep3
+.equ offset_remainLen     , 4 + offset_state
+.equ offset_TOTAL_SIZE    , 4 + offset_remainLen
+
+.if offset_TOTAL_SIZE != 96
+  .error "Incorrect offset_TOTAL_SIZE"
+.endif
+
+
+.macro IsMatchBranch_Pre
+        # prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;
+        and     pbPos, pbMask, processedPos, lsl #(kLenNumLowBits + 1 + PSHIFT)
+        add     probs_state, probs_IsMatch, state_R
+.endm
+
+
+/*
+.macro IsMatchBranch
+        IsMatchBranch_Pre
+        IF_BIT_1 probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label
+.endm
+*/        
+
+.macro CheckLimits
+        cmp     buf, bufLimit
+        jae     fin_OK
+        cmp     dicPos, limit
+        jae     fin_OK
+.endm
+
+#define  CheckLimits_lit  CheckLimits
+/*
+.macro CheckLimits_lit
+        cmp     buf, bufLimit
+        jae     fin_OK_lit
+        cmp     dicPos, limit
+        jae     fin_OK_lit
+.endm
+*/
+
+
+#define PARAM_lzma      REG_ABI_PARAM_0
+#define PARAM_limit     REG_ABI_PARAM_1
+#define PARAM_bufLimit  REG_ABI_PARAM_2
+
+
+.macro LOAD_LZMA_VAR reg:req, struct_offs:req
+        ldr     \reg, [PARAM_lzma, \struct_offs]
+.endm
+
+.macro LOAD_LZMA_BYTE reg:req, struct_offs:req
+        ldrb    \reg, [PARAM_lzma, \struct_offs]
+.endm
+
+.macro LOAD_LZMA_PAIR reg0:req, reg1:req, struct_offs:req
+        ldp     \reg0, \reg1, [PARAM_lzma, \struct_offs]
+.endm
+
+
+LzmaDec_DecodeReal_3:
+_LzmaDec_DecodeReal_3:
+/*
+.LFB0:
+       .cfi_startproc  
+*/
+
+       stp     x19, x20, [sp, -128]!
+       stp     x21, x22, [sp, 16]
+       stp     x23, x24, [sp, 32]
+       stp     x25, x26, [sp, 48]
+       stp     x27, x28, [sp, 64]
+       stp     x29, x30, [sp, 80]
+        
+        str     PARAM_lzma, [sp, 120]
+        
+        mov     bufLimit, PARAM_bufLimit
+        mov     limit, PARAM_limit
+        
+        LOAD_LZMA_PAIR  dic, dicBufSize, offset_dic
+        LOAD_LZMA_PAIR  dicPos, buf, offset_dicPos
+        LOAD_LZMA_PAIR  rep0, rep1, offset_rep0
+        LOAD_LZMA_PAIR  rep2, rep3, offset_rep2
+        
+        mov     t0, 1 << (kLenNumLowBits + 1 + PSHIFT)
+        LOAD_LZMA_BYTE  pbMask, offset_pb
+        p2_add  limit, dic
+        mov     len, wzr    // we could set it in all required branches instead
+        lsl     pbMask, t0, pbMask
+        p2_add  dicPos, dic
+        p2_sub  pbMask, t0
+
+        LOAD_LZMA_BYTE  lc2_lpMask, offset_lc
+        mov     t0, 256 << PSHIFT
+        LOAD_LZMA_BYTE  t1, offset_lp
+        p2_add  t1, lc2_lpMask
+        p2_sub  lc2_lpMask, (256 << PSHIFT) - PSHIFT
+        shl     t0, t1
+        p2_add  lc2_lpMask, t0
+        
+        LOAD_LZMA_VAR   probs_Spec, offset_probs
+        LOAD_LZMA_VAR   checkDicSize, offset_checkDicSize
+        LOAD_LZMA_VAR   processedPos, offset_processedPos
+        LOAD_LZMA_VAR   state, offset_state
+        // range is r0 : this load overwrites PARAM_lzma (r0), so it must be last; don't move it
+        LOAD_LZMA_PAIR  range, cod, offset_range    
+        mov     sym, wzr
+        shl     state, PSHIFT
+
+        add_big probs_IsMatch, probs_Spec, ((IsMatch - SpecPos) << PSHIFT)
+
+        // if (processedPos != 0 || checkDicSize != 0)
+        orr     t0, checkDicSize, processedPos
+        cbz     t0, 1f
+        add     t0_R, dicBufSize, dic
+        cmp     dicPos, dic
+        cmovne  t0_R, dicPos
+        ldrb    sym, [t0_R, -1]
+1:
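+        // resume the main loop at the IsMatch check that matches the saved
+        // state: states 0-3 follow a plain literal, states 4-6 a literal
+        // decoded in matched mode, states 7+ a match (state is pre-scaled
+        // by PMULT)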
+        IsMatchBranch_Pre
+        cmp     state, 4 * PMULT
+        jb      lit_end
+        cmp     state, kNumLitStates * PMULT
+        jb      lit_matched_end
+        jmp     lz_end
+        
+
+        
+#define BIT_0  BIT_0_R prob_reg
+#define BIT_1  BIT_1_R prob_reg
+#define BIT_2  BIT_2_R prob_reg
+
+# ---------- LITERAL ----------
+MY_ALIGN_64
+lit_start:
+        mov     state, wzr
+lit_start_2:
+        LIT_PROBS
+
+    #ifdef _LZMA_SIZE_OPT
+
+        PLOAD_2 prob_reg, probs, 1 * PMULT
+        mov     sym, 1
+        BIT_01        
+MY_ALIGN_FOR_LOOP
+lit_loop:
+        BIT_1
+        tbz     sym, 7, lit_loop
+        
+    #else
+        
+        BIT_0
+        BIT_1
+        BIT_1
+        BIT_1
+        BIT_1
+        BIT_1
+        BIT_1
+        
+    #endif
+
+        BIT_2
+        IsMatchBranch_Pre
+        strb    sym, [dicPos], 1
+        p2_and  sym, 255
+                
+        CheckLimits_lit
+lit_end:
+        IF_BIT_0_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), lit_start
+
+        # jmp     IsMatch_label
+        
+
+#define FLAG_STATE_BITS (4 + PSHIFT)          
+
+# ---------- MATCHES ----------
+# MY_ALIGN_FOR_ENTRY
+IsMatch_label:
+        UPDATE_1 probs_state, pbPos_R, (IsMatch - IsMatch)
+        IF_BIT_1 probs_state, 0, (IsRep - IsMatch), IsRep_label
+
+        SET_probs LenCoder
+        or      state, (1 << FLAG_STATE_BITS)
+
+# ---------- LEN DECODE ----------
+len_decode:
+        mov     len, 8 - kMatchMinLen
+        IF_BIT_0_NOUP_1 probs, len_mid_0
+        UPDATE_1 probs, 0, 0
+        p2_add  probs, (1 << (kLenNumLowBits + PSHIFT))
+        mov     len, 0 - kMatchMinLen
+        IF_BIT_0_NOUP_1 probs, len_mid_0
+        UPDATE_1 probs, 0, 0
+        p2_add  probs, LenHigh * PMULT - (1 << (kLenNumLowBits + PSHIFT))
+        
+    #if 0 == 1
+        BIT_0
+        BIT_1
+        BIT_1
+        BIT_1
+        BIT_1
+        BIT_1
+   #else
+        PLOAD_2 prob_reg, probs, 1 * PMULT
+        mov     sym, 1
+        BIT_01
+MY_ALIGN_FOR_LOOP
+len8_loop:
+        BIT_1
+        tbz     sym, 6, len8_loop
+   #endif        
+        
+        mov     len, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - kMatchMinLen
+        jmp     len_mid_2 
+        
+MY_ALIGN_FOR_ENTRY
+len_mid_0:
+        UPDATE_0 probs, 0, 0
+        p2_add  probs, pbPos_R
+        BIT_0
+len_mid_2:
+        BIT_1
+        BIT_2
+        sub     len, sym, len
+        tbz     state, FLAG_STATE_BITS, copy_match
+        
+# ---------- DECODE DISTANCE ----------
+        // probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);
+
+        mov     t0, 3 + kMatchMinLen
+        cmp     len, 3 + kMatchMinLen
+        cmovb   t0, len
+        SET_probs PosSlot - (kMatchMinLen << (kNumPosSlotBits))
+        add     probs, probs, t0_R, lsl #(kNumPosSlotBits + PSHIFT)
+        
+    #ifdef _LZMA_SIZE_OPT
+
+        PLOAD_2 prob_reg, probs, 1 * PMULT
+        mov     sym, 1
+        BIT_01
+MY_ALIGN_FOR_LOOP
+slot_loop:
+        BIT_1
+        tbz     sym, 5, slot_loop
+        
+    #else
+        
+        BIT_0
+        BIT_1
+        BIT_1
+        BIT_1
+        BIT_1
+        
+    #endif
+        
+    #define numBits t4
+        mov     numBits, sym
+        BIT_2
+        // we need only low bits
+        p2_and  sym, 3
+        cmp     numBits, 32 + kEndPosModelIndex / 2
+        jb      short_dist
+
+        SET_probs kAlign
+
+        #  unsigned numDirectBits = (unsigned)(((distance >> 1) - 1));
+        p2_sub  numBits, (32 + 1 + kNumAlignBits)
+        #  distance = (2 | (distance & 1));
+        or      sym, 2
+        PLOAD_2 prob_reg, probs, 1 * PMULT
+        add     sym2_R, probs, 2 * PMULT
+        
+# ---------- DIRECT DISTANCE ----------
+
+.macro DIRECT_1
+        shr     range, 1
+        subs    t0, cod, range
+        p2_add  sym, sym
+        // add     t1, sym, 1
+        csel    cod, cod, t0, mi
+        csinc   sym, sym, sym, mi
+        // csel    sym, t1, sym, pl
+        // adc     sym, sym, sym // not 100% compatible for "corruption-allowed" LZMA streams
+        dec_s   numBits
+        je      direct_end
+.endm
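+
+// DIRECT_1 decodes one equiprobable ("direct") distance bit:
+//   range >>= 1;  bit = (cod >= range);  if (bit) cod -= range;
+// the sign (mi) of (cod - range) drives csel/csinc, so no branch is
+// needed; the commented adc variant would use the carry flag instead,
+// which is not equivalent for some corrupted streams (see above).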
+
+    #ifdef _LZMA_SIZE_OPT
+
+        jmp     direct_norm
+MY_ALIGN_FOR_ENTRY
+direct_loop:
+        DIRECT_1
+direct_norm:
+        TEST_HIGH_BYTE_range
+        jnz     direct_loop
+        NORM_2
+        jmp     direct_loop
+
+    #else        
+
+.macro DIRECT_2
+        TEST_HIGH_BYTE_range
+        jz      direct_unroll
+        DIRECT_1
+.endm
+
+        DIRECT_2
+        DIRECT_2
+        DIRECT_2
+        DIRECT_2
+        DIRECT_2
+        DIRECT_2
+        DIRECT_2
+        DIRECT_2
+        
+direct_unroll:
+        NORM_2
+        DIRECT_1
+        DIRECT_1
+        DIRECT_1
+        DIRECT_1
+        DIRECT_1
+        DIRECT_1
+        DIRECT_1
+        DIRECT_1
+        jmp     direct_unroll
+    
+    #endif
+
+MY_ALIGN_FOR_ENTRY
+direct_end:
+        shl     sym, kNumAlignBits
+        REV_0   prob_reg
+        REV_1   prob_reg, 2
+        REV_1   prob_reg, 4
+        REV_2   prob_reg, 8
+
+decode_dist_end:
+
+    // if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize))
+
+        tst     checkDicSize, checkDicSize
+        csel    t0, processedPos, checkDicSize, eq
+        cmp     sym, t0
+        jae     end_of_payload
+        // jmp     end_of_payload # for debug
+        
+        mov     rep3, rep2
+        mov     rep2, rep1
+        mov     rep1, rep0
+        add     rep0, sym, 1
+
+.macro  STATE_UPDATE_FOR_MATCH
+        // state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
+        // cmp     state, (kNumStates + kNumLitStates) * PMULT
+        cmp     state, kNumLitStates * PMULT + (1 << FLAG_STATE_BITS)
+        mov     state, kNumLitStates * PMULT
+        mov     t0, (kNumLitStates + 3) * PMULT
+        cmovae  state, t0
+.endm
+        STATE_UPDATE_FOR_MATCH
+        
+# ---------- COPY MATCH ----------
+copy_match:
+
+    // if ((rem = limit - dicPos) == 0) break // return SZ_ERROR_DATA;
+        subs    cnt_R, limit, dicPos
+        // jz      fin_dicPos_LIMIT
+        jz      fin_OK
+
+    // curLen = ((rem < len) ? (unsigned)rem : len);
+        cmp     cnt_R, len_R
+        cmovae  cnt, len
+
+        sub     t0_R, dicPos, dic
+        p2_add  dicPos, cnt_R
+        p2_add  processedPos, cnt
+        p2_sub  len, cnt
+        
+    // pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0);
+        p2_sub_s  t0_R, rep0_R
+        jae     1f
+
+        cmn     t0_R, cnt_R
+        p2_add  t0_R, dicBufSize
+        ja      copy_match_cross
+1:
+# ---------- COPY MATCH FAST ----------
+    # t0_R : src_pos
+        p2_add  t0_R, dic
+        ldrb    sym, [t0_R]
+        p2_add  t0_R, cnt_R
+        p1_neg  cnt_R
+
+copy_common:
+        dec     dicPos
+
+    # dicPos  : (ptr_to_last_dest_BYTE)    
+    # t0_R    : (src_lim)
+    # cnt_R   : (-curLen)
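+
+    # The copy loops index from the end with a negative count: each
+    # inc_s / adds on cnt_R both advances the position and tests for
+    # termination, and [dicPos, cnt_R] / [t0_R, cnt_R] address dest and
+    # src with no separate pointer updates; the final byte is stored at
+    # copy_end / lz_end_match.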
+
+        IsMatchBranch_Pre
+        
+        inc_s   cnt_R
+        jz      copy_end
+        
+        cmp     rep0, 1
+        je      copy_match_0
+   
+    #ifdef LZMA_USE_2BYTES_COPY
+        strb    sym, [dicPos, cnt_R]
+        dec     dicPos
+    # dicPos  : (ptr_to_last_dest_16bitWORD)    
+        p2_and  cnt_R, -2
+        ldrh    sym, [t0_R, cnt_R]
+        adds    cnt_R, cnt_R, 2
+        jz      2f
+MY_ALIGN_FOR_LOOP
+1:
+        /*
+        strh    sym, [dicPos, cnt_R]
+        ldrh    sym, [t0_R, cnt_R]
+        adds    cnt_R, cnt_R, 2
+        jz      2f
+        */
+
+        strh    sym, [dicPos, cnt_R]
+        ldrh    sym, [t0_R, cnt_R]
+        adds    cnt_R, cnt_R, 2
+        jnz     1b
+2:
+        
+        /*
+        // for universal little/big endian code, but slow
+        strh    sym, [dicPos]
+        inc     dicPos 
+        ldrb    sym, [t0_R, -1]
+        */
+
+        #if  __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+        // big-endian detection should be improved for other compilers
+        // for big-endian targets we need to reverse the bytes
+        rev16   sym, sym         
+        #endif
+        
+        // (sym) must be represented as little-endian here:
+        strb    sym, [dicPos], 1
+        shr     sym, 8             
+
+    #else
+
+MY_ALIGN_FOR_LOOP
+1:
+        strb    sym, [dicPos, cnt_R]
+        ldrb    sym, [t0_R, cnt_R]
+        inc_s   cnt_R
+        jz      copy_end
+
+        strb    sym, [dicPos, cnt_R]
+        ldrb    sym, [t0_R, cnt_R]
+        inc_s   cnt_R
+        jnz     1b
+    #endif
+
+copy_end:
+lz_end_match:
+        strb    sym, [dicPos], 1
+  
+        # IsMatchBranch_Pre
+        CheckLimits
+lz_end:
+        IF_BIT_1_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label
+
+
+
+# ---------- LITERAL MATCHED ----------
+                
+        LIT_PROBS
+        
+    // matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
+
+        sub     t0_R, dicPos, dic
+        p2_sub_s t0_R, rep0_R
+    
+    #ifdef LZMA_USE_CMOV_LZ_WRAP
+        add     t1_R, t0_R, dicBufSize
+        cmovb   t0_R, t1_R
+    #else                
+        jae     1f
+        p2_add  t0_R, dicBufSize
+1:
+    #endif                        
+
+        ldrb    match, [dic, t0_R]
+
+    // state -= (state < 10) ? 3 : 6;
+        sub     sym, state, 6 * PMULT
+        cmp     state, 10 * PMULT
+        p2_sub  state, 3 * PMULT
+        cmovae  state, sym
+
+    #ifdef _LZMA_SIZE_OPT
+
+        mov     offs, 256 * PMULT
+        shl     match, (PSHIFT + 1)
+        mov     sym, 1
+        and     bit, match, offs
+        add     prm, probs, offs_R
+
+MY_ALIGN_FOR_LOOP
+litm_loop:
+        LITM
+        tbz     sym, 8, litm_loop
+        
+    #else
+        
+        LITM_0
+        LITM
+        LITM
+        LITM
+        LITM
+        LITM
+        LITM
+        LITM_2
+        
+    #endif
+    
+        IsMatchBranch_Pre
+        strb    sym, [dicPos], 1
+        p2_and  sym, 255
+        
+        // mov     len, wzr // LITM uses the same register (len / offs), so we clear it
+        CheckLimits_lit
+lit_matched_end:
+        IF_BIT_1_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label
+        # IsMatchBranch
+        p2_sub  state, 3 * PMULT
+        jmp     lit_start_2
+        
+
+
+# ---------- REP 0 LITERAL ----------
+MY_ALIGN_FOR_ENTRY
+IsRep0Short_label:
+        UPDATE_0 probs_state, pbPos_R, 0
+
+    // dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
+        sub     t0_R, dicPos, dic
+        
+        // state = state < kNumLitStates ? 9 : 11;
+        or      state, 1 * PMULT
+        
+        # the caller doesn't allow the (dicPos >= limit) case for REP_SHORT
+        # so we don't need the following (dicPos == limit) check here:
+        # cmp     dicPos, limit
+        # jae     fin_dicPos_LIMIT_REP_SHORT
+        # // jmp fin_dicPos_LIMIT_REP_SHORT // for testing/debug purposes
+
+        inc     processedPos
+
+        IsMatchBranch_Pre
+       
+        p2_sub_s t0_R, rep0_R
+    #ifdef LZMA_USE_CMOV_LZ_WRAP
+        add     sym_R, t0_R, dicBufSize
+        cmovb   t0_R, sym_R
+    #else       
+        jae     1f
+        p2_add  t0_R, dicBufSize
+1:
+    #endif
+        
+        ldrb    sym, [dic, t0_R]
+        // mov     len, wzr
+        jmp     lz_end_match
+        
+MY_ALIGN_FOR_ENTRY
+IsRep_label:
+        UPDATE_1 probs_state, 0, (IsRep - IsMatch)
+
+        # The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode.
+        # So we don't check it here.
+        
+        # mov     t0, processedPos
+        # or      t0, checkDicSize
+        # jz      fin_ERROR_2
+
+        // state = state < kNumLitStates ? 8 : 11;
+        cmp     state, kNumLitStates * PMULT
+        mov     state, 8 * PMULT
+        mov     probBranch, 11 * PMULT
+        cmovae  state, probBranch
+
+        SET_probs RepLenCoder
+        
+        IF_BIT_1 probs_state, 0, (IsRepG0 - IsMatch), IsRepG0_label
+        sub_big  probs_state, probs_state, (IsMatch - IsRep0Long) << PSHIFT
+        IF_BIT_0_NOUP probs_state, pbPos_R, 0, IsRep0Short_label
+        UPDATE_1 probs_state, pbPos_R, 0
+        jmp     len_decode
+
+MY_ALIGN_FOR_ENTRY
+IsRepG0_label:
+        UPDATE_1 probs_state, 0, (IsRepG0 - IsMatch)
+        IF_BIT_1 probs_state, 0, (IsRepG1 - IsMatch), IsRepG1_label
+        mov     dist, rep1
+        mov     rep1, rep0
+        mov     rep0, dist
+        jmp     len_decode
+        
+# MY_ALIGN_FOR_ENTRY
+IsRepG1_label:
+        UPDATE_1 probs_state, 0, (IsRepG1 - IsMatch)
+        IF_BIT_1 probs_state, 0, (IsRepG2 - IsMatch), IsRepG2_label
+        mov     dist, rep2
+        mov     rep2, rep1
+        mov     rep1, rep0
+        mov     rep0, dist
+        jmp     len_decode
+
+# MY_ALIGN_FOR_ENTRY
+IsRepG2_label:
+        UPDATE_1 probs_state, 0, (IsRepG2 - IsMatch)
+        mov     dist, rep3
+        mov     rep3, rep2
+        mov     rep2, rep1
+        mov     rep1, rep0
+        mov     rep0, dist
+        jmp     len_decode
+
+        
+
+# ---------- SPEC SHORT DISTANCE ----------
+
+MY_ALIGN_FOR_ENTRY
+short_dist:
+        p2_sub_s numBits, 32 + 1
+        jbe     decode_dist_end
+        or      sym, 2
+        shl     sym, numBits
+        add     sym_R, probs_Spec, sym_R, lsl #PSHIFT
+        p2_add  sym_R, SpecPos * PMULT + 1 * PMULT
+        mov     sym2, PMULT // # step
+MY_ALIGN_FOR_LOOP
+spec_loop:
+        REV_1_VAR prob_reg
+        dec_s   numBits
+        jnz     spec_loop
+        
+        p2_add  sym2_R, probs_Spec
+    .if SpecPos != 0
+        p2_add  sym2_R, SpecPos * PMULT
+    .endif
+        p2_sub  sym_R, sym2_R
+        shr     sym, PSHIFT
+        
+        jmp     decode_dist_end
+
+
+
+# ---------- COPY MATCH 0 ----------
+MY_ALIGN_FOR_ENTRY
+copy_match_0:
+    #ifdef LZMA_USE_4BYTES_FILL
+        strb    sym, [dicPos, cnt_R]
+        inc_s   cnt_R
+        jz      copy_end
+        
+        strb    sym, [dicPos, cnt_R]
+        inc_s   cnt_R
+        jz      copy_end
+        
+        strb    sym, [dicPos, cnt_R]
+        inc_s   cnt_R
+        jz      copy_end
+        
+        orr     t3, sym, sym, lsl 8
+        p2_and  cnt_R, -4
+        orr     t3, t3, t3, lsl 16
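+        // rep0 == 1 fills a run of a single byte: splat (sym) across a
+        // 32-bit word and store word-wise; rounding the negative count to
+        // a multiple of 4 may re-store bytes already written above, which
+        // is harmless because the fill value is constant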
+MY_ALIGN_FOR_LOOP_16
+1:
+        /*
+        str     t3, [dicPos, cnt_R]
+        adds    cnt_R, cnt_R, 4
+        jz      2f
+        */
+
+        str     t3, [dicPos, cnt_R]
+        adds    cnt_R, cnt_R, 4
+        jnz     1b
+2:
+        // p2_and  sym, 255
+    #else
+
+MY_ALIGN_FOR_LOOP
+1:
+        strb    sym, [dicPos, cnt_R]
+        inc_s   cnt_R
+        jz      copy_end
+
+        strb    sym, [dicPos, cnt_R]
+        inc_s   cnt_R
+        jnz     1b
+    #endif        
+
+    jmp     copy_end
+
+
+# ---------- COPY MATCH CROSS ----------
+copy_match_cross:
+        # t0_R  - src pos
+        # cnt_R - total copy len
+
+        p1_neg  cnt_R
+1:
+        ldrb    sym, [dic, t0_R]
+        inc     t0_R
+        strb    sym, [dicPos, cnt_R]
+        inc     cnt_R
+        cmp     t0_R, dicBufSize
+        jne     1b
+        
+        ldrb    sym, [dic]
+        sub     t0_R, dic, cnt_R
+        jmp     copy_common
+
+
+
+
+/*
+fin_dicPos_LIMIT_REP_SHORT:
+        mov     len, 1
+        jmp     fin_OK
+*/
+
+/*
+fin_dicPos_LIMIT:
+        jmp     fin_OK
+        # For more strict mode we can stop decoding with error
+        # mov     sym, 1
+        # jmp     fin
+*/
+
+fin_ERROR_MATCH_DIST:
+        # rep0 = distance + 1;
+        p2_add  len, kMatchSpecLen_Error_Data
+        mov     rep3, rep2
+        mov     rep2, rep1
+        mov     rep1, rep0
+        mov     rep0, sym
+        STATE_UPDATE_FOR_MATCH
+        # jmp     fin_OK
+        mov     sym, 1
+        jmp     fin
+
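+// a decoded distance of 0xFFFFFFFF is the end-of-stream marker:
+// inc_s turns it into 0; any other too-large distance is a match-distance
+// error, flagged in (len) via kMatchSpecLen_Error_Data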
+end_of_payload:
+        inc_s   sym
+        jnz     fin_ERROR_MATCH_DIST
+
+        mov     len, kMatchSpecLenStart
+        xor     state, (1 << FLAG_STATE_BITS)
+        jmp     fin_OK
+
+/*
+fin_OK_lit:
+        mov     len, wzr
+*/
+
+fin_OK:
+        mov     sym, wzr
+
+fin:
+        NORM
+
+    #define fin_lzma_reg  t0_R
+
+   .macro STORE_LZMA_VAR reg:req, struct_offs:req
+        str     \reg, [fin_lzma_reg, \struct_offs]
+   .endm
+
+   .macro STORE_LZMA_PAIR reg0:req, reg1:req, struct_offs:req
+        stp     \reg0, \reg1, [fin_lzma_reg, \struct_offs]
+   .endm
+
+        ldr     fin_lzma_reg, [sp, 120]
+        p2_sub  dicPos, dic
+        shr     state, PSHIFT
+
+        STORE_LZMA_PAIR   dicPos, buf,  offset_dicPos
+        STORE_LZMA_PAIR   range, cod,   offset_range
+        STORE_LZMA_VAR    processedPos, offset_processedPos
+        STORE_LZMA_PAIR   rep0, rep1,   offset_rep0
+        STORE_LZMA_PAIR   rep2, rep3,   offset_rep2
+        STORE_LZMA_PAIR   state, len,   offset_state
+
+        mov     w0, sym
+        
+       ldp     x29, x30, [sp, 80]
+       ldp     x27, x28, [sp, 64]
+       ldp     x25, x26, [sp, 48]
+        ldp    x23, x24, [sp, 32]
+       ldp     x21, x22, [sp, 16]
+       ldp     x19, x20, [sp], 128
+
+        ret
+/*
+       .cfi_endproc
+.LFE0:
+       .size   LzmaDec_DecodeReal_3, .-LzmaDec_DecodeReal_3
+       .ident  "TAG_LZMA"
+       .section        .note.GNU-stack,"",@progbits
+*/