// LzmaDecOpt.S -- ARM64-ASM version of LzmaDec_DecodeReal_3() function
// 2021-04-25 : Igor Pavlov : Public domain

/*
; 3 is the code compatibility version of the LzmaDec_DecodeReal_*()
; function, for checking at link time.
; That code is tightly coupled with LzmaDec_TryDummy()
; and with other functions in the LzmaDec.c file.
; The CLzmaDec structure, the (probs) array layout, and the input and output
; of LzmaDec_DecodeReal_*() must be equal in both versions (C / ASM).
*/

#include "7zAsm.S"

// .arch armv8-a
// .file "LzmaDecOpt.c"

        .text
        .align 2
        .p2align 4,,15

#ifdef __APPLE__
        .globl _LzmaDec_DecodeReal_3
#else
        .global LzmaDec_DecodeReal_3
#endif
// .type LzmaDec_DecodeReal_3, %function

// #define _LZMA_SIZE_OPT 1

#define LZMA_USE_4BYTES_FILL 1
// #define LZMA_USE_2BYTES_COPY 1
// #define LZMA_USE_CMOV_LZ_WRAP 1
// #define _LZMA_PROB32 1

#define MY_ALIGN_FOR_ENTRY    MY_ALIGN_32
#define MY_ALIGN_FOR_LOOP     MY_ALIGN_32
#define MY_ALIGN_FOR_LOOP_16  MY_ALIGN_16
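
/*
  Editor's note: the PLOAD/PSTORE macros below abstract the element type of
  the probability array. A rough C view, assuming the CLzmaProb typedef from
  LzmaDec.h:

      #ifdef _LZMA_PROB32
        typedef UInt32 CLzmaProb;   // PSHIFT = 2, stride PMULT = 4 bytes
      #else
        typedef UInt16 CLzmaProb;   // PSHIFT = 1, stride PMULT = 2 bytes
      #endif
      // PLOAD  dst, mem   ~   dst = *(const CLzmaProb *)(mem);
      // PSTORE src, mem   ~   *(CLzmaProb *)(mem) = src;

  All prob offsets in this file are pre-scaled by PMULT, so the same code
  serves both element sizes.
*/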
#ifdef _LZMA_PROB32

.equ PSHIFT , 2

.macro PLOAD dest:req, mem:req
        ldr     \dest, [\mem]
.endm

.macro PLOAD_PREINDEXED dest:req, mem:req, offset:req
        ldr     \dest, [\mem, \offset]!
.endm

.macro PLOAD_2 dest:req, mem1:req, mem2:req
        ldr     \dest, [\mem1, \mem2]
.endm

.macro PLOAD_LSL dest:req, mem1:req, mem2:req
        ldr     \dest, [\mem1, \mem2, lsl #PSHIFT]
.endm

.macro PSTORE src:req, mem:req
        str     \src, [\mem]
.endm

.macro PSTORE_2 src:req, mem1:req, mem2:req
        str     \src, [\mem1, \mem2]
.endm

.macro PSTORE_LSL src:req, mem1:req, mem2:req
        str     \src, [\mem1, \mem2, lsl #PSHIFT]
.endm

.macro PSTORE_LSL_M1 src:req, mem1:req, mem2:req, temp_reg:req
        // you must check that temp_reg is a free register when this macro is used
        add     \temp_reg, \mem1, \mem2
        str     \src, [\temp_reg, \mem2]
.endm

#else

// .equ PSHIFT , 1
#define PSHIFT 1

.macro PLOAD dest:req, mem:req
        ldrh    \dest, [\mem]
.endm

.macro PLOAD_PREINDEXED dest:req, mem:req, offset:req
        ldrh    \dest, [\mem, \offset]!
.endm

.macro PLOAD_2 dest:req, mem1:req, mem2:req
        ldrh    \dest, [\mem1, \mem2]
.endm

.macro PLOAD_LSL dest:req, mem1:req, mem2:req
        ldrh    \dest, [\mem1, \mem2, lsl #PSHIFT]
.endm

.macro PSTORE src:req, mem:req
        strh    \src, [\mem]
.endm

.macro PSTORE_2 src:req, mem1:req, mem2:req
        strh    \src, [\mem1, \mem2]
.endm

.macro PSTORE_LSL src:req, mem1:req, mem2:req
        strh    \src, [\mem1, \mem2, lsl #PSHIFT]
.endm

.macro PSTORE_LSL_M1 src:req, mem1:req, mem2:req, temp_reg:req
        strh    \src, [\mem1, \mem2]
.endm

#endif

.equ PMULT , (1 << PSHIFT)
.equ PMULT_2 , (2 << PSHIFT)

.equ kMatchSpecLen_Error_Data , (1 << 9)

#  x7   t0 : NORM_CALC : prob2 (IF_BIT_1)
#  x6   t1 : NORM_CALC : probs_state
#  x8   t2 : (LITM) temp : (TREE) temp
#  x4   t3 : (LITM) bit : (TREE) temp : UPDATE_0/UPDATE_1 temp
#  x10  t4 : (LITM) offs : (TREE) probs_PMULT : numBits
#  x9   t5 : (LITM) match : sym2 (ShortDist)
#  x1   t6 : (LITM) litm_prob : (TREE) prob_reg : pbPos
#  x2   t7 : (LITM) prm : probBranch : cnt
#  x3   sym : dist
#  x12  len
#  x0   range
#  x5   cod

#define range     w0

// t6
#define pbPos     w1
#define pbPos_R   r1
#define prob_reg  w1
#define litm_prob prob_reg

// t7
#define probBranch w2
#define cnt        w2
#define cnt_R      r2
#define prm        r2

#define sym    w3
#define sym_R  r3
#define dist   sym

#define t3     w4
#define bit    w4
#define bit_R  r4
#define update_temp_reg r4

#define cod    w5

#define t1     w6
#define t1_R   r6
#define probs_state t1_R

#define t0     w7
#define t0_R   r7
#define prob2  t0

#define t2     w8
#define t2_R   r8

// t5
#define match   w9
#define sym2    w9
#define sym2_R  r9

#define t4     w10
#define t4_R   r10
#define offs   w10
#define offs_R r10

#define probs  r11

#define len    w12
#define len_R  x12

#define state   w13
#define state_R r13

#define dicPos     r14
#define buf        r15
#define bufLimit   r16
#define dicBufSize r17
#define limit      r19

#define rep0    w20
#define rep0_R  r20
#define rep1    w21
#define rep2    w22
#define rep3    w23

#define dic           r24
#define probs_IsMatch r25
#define probs_Spec    r26
#define checkDicSize  w27
#define processedPos  w28
#define pbMask        w29
#define lc2_lpMask    w30

.equ kNumBitModelTotalBits , 11
.equ kBitModelTotal , (1 << kNumBitModelTotalBits)
.equ kNumMoveBits , 5
.equ kBitModelOffset , (kBitModelTotal - (1 << kNumMoveBits) + 1)

.macro NORM_2 macro
        ldrb    t0, [buf], 1
        shl     range, 8
        orr     cod, t0, cod, lsl 8
        /*
        mov     t0, cod
        ldrb    cod, [buf], 1
        shl     range, 8
        bfi     cod, t0, #8, #24
        */
.endm

.macro TEST_HIGH_BYTE_range macro
        tst     range, 0xFF000000
.endm

.macro NORM macro
        TEST_HIGH_BYTE_range
        jnz     1f
        NORM_2
1:
.endm
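
/*
  Editor's sketch: the branch macros below implement the usual LZMA
  range-decoder bit step, approximately as in LzmaDec.c (not verbatim):

      bound = (range >> kNumBitModelTotalBits) * prob;
      if (code < bound)
      {
        // bit == 0 : UPDATE_0 path
        range = bound;
        prob += (kBitModelTotal - prob) >> kNumMoveBits;
      }
      else
      {
        // bit == 1 : UPDATE_1 path
        code  -= bound;
        range -= bound;
        prob  -= prob >> kNumMoveBits;
      }
      // NORM: if the top byte of range is empty, shift in one more input byte
      if (range < (1 << 24)) { range <<= 8; code = (code << 8) | *buf++; }

  UPDATE_0__0/__1 compute the same "prob +=" update with a single subtract
  of the pre-biased kBitModelOffset and an arithmetic shift.
*/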

# ---------- Branch MACROS ----------

.macro UPDATE_0__0
        sub     prob2, probBranch, kBitModelOffset
.endm

.macro UPDATE_0__1
        sub     probBranch, probBranch, prob2, asr #(kNumMoveBits)
.endm

.macro UPDATE_0__2 probsArray:req, probOffset:req, probDisp:req
        .if \probDisp == 0
                PSTORE_2 probBranch, \probsArray, \probOffset
        .elseif \probOffset == 0
                PSTORE_2 probBranch, \probsArray, \probDisp * PMULT
        .else
                .error "unsupported"
                // add     update_temp_reg, \probsArray, \probOffset
                PSTORE_2 probBranch, update_temp_reg, \probDisp * PMULT
        .endif
.endm

.macro UPDATE_0 probsArray:req, probOffset:req, probDisp:req
        UPDATE_0__0
        UPDATE_0__1
        UPDATE_0__2 \probsArray, \probOffset, \probDisp
.endm

.macro UPDATE_1 probsArray:req, probOffset:req, probDisp:req
        // sub     cod, cod, prob2
        // sub     range, range, prob2
        p2_sub  cod, range
        sub     range, prob2, range
        sub     prob2, probBranch, probBranch, lsr #(kNumMoveBits)
        .if \probDisp == 0
                PSTORE_2 prob2, \probsArray, \probOffset
        .elseif \probOffset == 0
                PSTORE_2 prob2, \probsArray, \probDisp * PMULT
        .else
                .error "unsupported"
                // add     update_temp_reg, \probsArray, \probOffset
                PSTORE_2 prob2, update_temp_reg, \probDisp * PMULT
        .endif
.endm

.macro CMP_COD_BASE
        NORM
        // lsr     prob2, range, kNumBitModelTotalBits
        // imul    prob2, probBranch
        // cmp     cod, prob2
        mov     prob2, range
        shr     range, kNumBitModelTotalBits
        imul    range, probBranch
        cmp     cod, range
.endm

.macro CMP_COD_1 probsArray:req
        PLOAD   probBranch, \probsArray
        CMP_COD_BASE
.endm

.macro CMP_COD_3 probsArray:req, probOffset:req, probDisp:req
        .if \probDisp == 0
                PLOAD_2 probBranch, \probsArray, \probOffset
        .elseif \probOffset == 0
                PLOAD_2 probBranch, \probsArray, \probDisp * PMULT
        .else
                .error "unsupported"
                add     update_temp_reg, \probsArray, \probOffset
                PLOAD_2 probBranch, update_temp_reg, \probDisp * PMULT
        .endif
        CMP_COD_BASE
.endm

.macro IF_BIT_1_NOUP probsArray:req, probOffset:req, probDisp:req, toLabel:req
        CMP_COD_3 \probsArray, \probOffset, \probDisp
        jae     \toLabel
.endm

.macro IF_BIT_1 probsArray:req, probOffset:req, probDisp:req, toLabel:req
        IF_BIT_1_NOUP \probsArray, \probOffset, \probDisp, \toLabel
        UPDATE_0 \probsArray, \probOffset, \probDisp
.endm

.macro IF_BIT_0_NOUP probsArray:req, probOffset:req, probDisp:req, toLabel:req
        CMP_COD_3 \probsArray, \probOffset, \probDisp
        jb      \toLabel
.endm

.macro IF_BIT_0_NOUP_1 probsArray:req, toLabel:req
        CMP_COD_1 \probsArray
        jb      \toLabel
.endm


# ---------- CMOV MACROS ----------

.macro NORM_LSR
        NORM
        lsr     t0, range, #kNumBitModelTotalBits
.endm

.macro COD_RANGE_SUB
        subs    t1, cod, t0
        p2_sub  range, t0
.endm

.macro RANGE_IMUL prob:req
        imul    t0, \prob
.endm

.macro NORM_CALC prob:req
        NORM_LSR
        RANGE_IMUL \prob
        COD_RANGE_SUB
.endm

.macro CMOV_range
        cmovb   range, t0
.endm

.macro CMOV_code
        cmovae  cod, t1
.endm

.macro CMOV_code_Model_Pre prob:req
        sub     t0, \prob, kBitModelOffset
        CMOV_code
        cmovae  t0, \prob
.endm

.macro PUP_BASE_2 prob:req, dest_reg:req
        # only sar works for both 16/32 bit prob modes
        sub     \dest_reg, \prob, \dest_reg, asr #(kNumMoveBits)
.endm

.macro PUP prob:req, probPtr:req, mem2:req
        PUP_BASE_2 \prob, t0
        PSTORE_2 t0, \probPtr, \mem2
.endm

#define probs_PMULT t4_R

.macro BIT_01
        add     probs_PMULT, probs, PMULT
.endm

.macro BIT_0_R prob:req
        PLOAD_2 \prob, probs, 1 * PMULT
        NORM_LSR
        sub     t3, \prob, kBitModelOffset
        RANGE_IMUL \prob
        PLOAD_2 t2, probs, 1 * PMULT_2
        COD_RANGE_SUB
        CMOV_range
        cmovae  t3, \prob
        PLOAD_2 t0, probs, 1 * PMULT_2 + PMULT
        PUP_BASE_2 \prob, t3
        csel    \prob, t2, t0, lo
        CMOV_code
        mov     sym, 2
        PSTORE_2 t3, probs, 1 * PMULT
        adc     sym, sym, wzr
        BIT_01
.endm

.macro BIT_1_R prob:req
        NORM_LSR
        p2_add  sym, sym
        sub     t3, \prob, kBitModelOffset
        RANGE_IMUL \prob
        PLOAD_LSL t2, probs, sym_R
        COD_RANGE_SUB
        CMOV_range
        cmovae  t3, \prob
        PLOAD_LSL t0, probs_PMULT, sym_R
        PUP_BASE_2 \prob, t3
        csel    \prob, t2, t0, lo
        CMOV_code
        PSTORE_LSL_M1 t3, probs, sym_R, t2_R
        adc     sym, sym, wzr
.endm

.macro BIT_2_R prob:req
        NORM_LSR
        p2_add  sym, sym
        sub     t3, \prob, kBitModelOffset
        RANGE_IMUL \prob
        COD_RANGE_SUB
        CMOV_range
        cmovae  t3, \prob
        CMOV_code
        PUP_BASE_2 \prob, t3
        PSTORE_LSL_M1 t3, probs, sym_R, t2_R
        adc     sym, sym, wzr
.endm
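
/*
  Editor's sketch: BIT_0/BIT_1/BIT_2 above walk a bit tree one bit at a
  time, roughly as LzmaDec.c's TREE_GET_BIT does:

      symbol = 1;
      do
      {
        bit = RangeDecodeBit(&probs[symbol]);   // hypothetical helper name
        symbol = (symbol << 1) + bit;
      }
      while (symbol < (1 << numBits));
      // BIT_0 starts the chain, BIT_2 finishes it

  The LITM_* macros below perform the same step for "matched" literals,
  where a mask derived from the match byte selects between two halves of
  the probability table.
*/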

# ---------- MATCHED LITERAL ----------

.macro LITM_0 macro
        shl     match, (PSHIFT + 1)
        and     bit, match, 256 * PMULT
        add     prm, probs, 256 * PMULT + 1 * PMULT
        p2_add  match, match
        p2_add  prm, bit_R
        eor     offs, bit, 256 * PMULT
        PLOAD   litm_prob, prm
        NORM_LSR
        sub     t2, litm_prob, kBitModelOffset
        RANGE_IMUL litm_prob
        COD_RANGE_SUB
        cmovae  offs, bit
        CMOV_range
        and     bit, match, offs
        cmovae  t2, litm_prob
        CMOV_code
        mov     sym, 2
        PUP_BASE_2 litm_prob, t2
        PSTORE  t2, prm
        add     prm, probs, offs_R
        adc     sym, sym, wzr
.endm

.macro LITM macro
        p2_add  prm, bit_R
        xor     offs, bit
        PLOAD_LSL litm_prob, prm, sym_R
        NORM_LSR
        p2_add  match, match
        sub     t2, litm_prob, kBitModelOffset
        RANGE_IMUL litm_prob
        COD_RANGE_SUB
        cmovae  offs, bit
        CMOV_range
        and     bit, match, offs
        cmovae  t2, litm_prob
        CMOV_code
        PUP_BASE_2 litm_prob, t2
        PSTORE_LSL t2, prm, sym_R
        add     prm, probs, offs_R
        adc     sym, sym, sym
.endm

.macro LITM_2 macro
        p2_add  prm, bit_R
        PLOAD_LSL litm_prob, prm, sym_R
        NORM_LSR
        sub     t2, litm_prob, kBitModelOffset
        RANGE_IMUL litm_prob
        COD_RANGE_SUB
        CMOV_range
        cmovae  t2, litm_prob
        CMOV_code
        PUP_BASE_2 litm_prob, t2
        PSTORE_LSL t2, prm, sym_R
        adc     sym, sym, sym
.endm


# ---------- REVERSE BITS ----------

.macro REV_0 prob:req
        NORM_CALC \prob
        CMOV_range
        PLOAD   t2, sym2_R
        PLOAD_2 t3, probs, 3 * PMULT
        CMOV_code_Model_Pre \prob
        add     t1_R, probs, 3 * PMULT
        cmovae  sym2_R, t1_R
        PUP     \prob, probs, 1 * PMULT
        csel    \prob, t2, t3, lo
.endm

.macro REV_1 prob:req, step:req
        NORM_LSR
        PLOAD_PREINDEXED t2, sym2_R, (\step * PMULT)
        RANGE_IMUL \prob
        COD_RANGE_SUB
        CMOV_range
        PLOAD_2 t3, sym2_R, (\step * PMULT)
        sub     t0, \prob, kBitModelOffset
        CMOV_code
        add     t1_R, sym2_R, \step * PMULT
        cmovae  t0, \prob
        cmovae  sym2_R, t1_R
        PUP_BASE_2 \prob, t0
        csel    \prob, t2, t3, lo
        PSTORE_2 t0, t1_R, 0 - \step * PMULT_2
.endm

.macro REV_2 prob:req, step:req
        sub     t1_R, sym2_R, probs
        NORM_LSR
        orr     sym, sym, t1, lsr #PSHIFT
        RANGE_IMUL \prob
        COD_RANGE_SUB
        sub     t2, sym, \step
        CMOV_range
        cmovb   sym, t2
        CMOV_code_Model_Pre \prob
        PUP     \prob, sym2_R, 0
.endm

.macro REV_1_VAR prob:req
        PLOAD   \prob, sym_R
        mov     probs, sym_R
        p2_add  sym_R, sym2_R
        NORM_LSR
        add     t2_R, sym_R, sym2_R
        RANGE_IMUL \prob
        COD_RANGE_SUB
        cmovae  sym_R, t2_R
        CMOV_range
        CMOV_code_Model_Pre \prob
        p2_add  sym2, sym2
        PUP     \prob, probs, 0
.endm
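
/*
  Editor's sketch: the REV_* macros above decode a reverse bit tree (used
  for the align bits and, via REV_1_VAR, for short distances), roughly as
  the REV_BIT ladder in LzmaDec.c:

      m = 1;          // tree index
      dist = 0;
      for (i = 0; i < numBits; i++)
      {
        bit = RangeDecodeBit(&probs[m]);        // hypothetical helper name
        m = (m << 1) + bit;
        dist |= (UInt32)bit << i;               // bits come out low-to-high
      }
*/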

.macro add_big dest:req, src:req, param:req
        .if (\param) < (1 << 12)
                add     \dest, \src, \param
        .else
                #ifndef _LZMA_PROB32
                        .error "unexpected add_big expansion"
                #endif
                add     \dest, \src, (\param) / 2
                add     \dest, \dest, (\param) - (\param) / 2
        .endif
.endm

.macro sub_big dest:req, src:req, param:req
        .if (\param) < (1 << 12)
                sub     \dest, \src, \param
        .else
                #ifndef _LZMA_PROB32
                        .error "unexpected sub_big expansion"
                #endif
                sub     \dest, \src, (\param) / 2
                sub     \dest, \dest, (\param) - (\param) / 2
        .endif
.endm

.macro SET_probs offset:req
        // add_big probs, probs_Spec, (\offset) * PMULT
        add     probs, probs_IsMatch, ((\offset) - IsMatch) * PMULT
.endm

.macro LIT_PROBS
        add     sym, sym, processedPos, lsl 8
        inc     processedPos
        UPDATE_0__0
        shl     sym, lc2_lpMask
        SET_probs Literal
        p2_and  sym, lc2_lpMask
        // p2_add  probs_state, pbPos_R
        p2_add  probs, sym_R
        UPDATE_0__1
        add     probs, probs, sym_R, lsl 1
        UPDATE_0__2 probs_state, pbPos_R, 0
.endm

.equ kNumPosBitsMax , 4
.equ kNumPosStatesMax , (1 << kNumPosBitsMax)

.equ kLenNumLowBits , 3
.equ kLenNumLowSymbols , (1 << kLenNumLowBits)
.equ kLenNumHighBits , 8
.equ kLenNumHighSymbols , (1 << kLenNumHighBits)
.equ kNumLenProbs , (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols)

.equ LenLow , 0
.equ LenChoice , LenLow
.equ LenChoice2 , (LenLow + kLenNumLowSymbols)
.equ LenHigh , (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax)

.equ kNumStates , 12
.equ kNumStates2 , 16
.equ kNumLitStates , 7

.equ kStartPosModelIndex , 4
.equ kEndPosModelIndex , 14
.equ kNumFullDistances , (1 << (kEndPosModelIndex >> 1))

.equ kNumPosSlotBits , 6
.equ kNumLenToPosStates , 4

.equ kNumAlignBits , 4
.equ kAlignTableSize , (1 << kNumAlignBits)

.equ kMatchMinLen , 2
.equ kMatchSpecLenStart , (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols)

// .equ kStartOffset , 1408
.equ kStartOffset , 0
.equ SpecPos , (-kStartOffset)
.equ IsRep0Long , (SpecPos + kNumFullDistances)
.equ RepLenCoder , (IsRep0Long + (kNumStates2 << kNumPosBitsMax))
.equ LenCoder , (RepLenCoder + kNumLenProbs)
.equ IsMatch , (LenCoder + kNumLenProbs)
.equ kAlign , (IsMatch + (kNumStates2 << kNumPosBitsMax))
.equ IsRep , (kAlign + kAlignTableSize)
.equ IsRepG0 , (IsRep + kNumStates)
.equ IsRepG1 , (IsRepG0 + kNumStates)
.equ IsRepG2 , (IsRepG1 + kNumStates)
.equ PosSlot , (IsRepG2 + kNumStates)
.equ Literal , (PosSlot + (kNumLenToPosStates << kNumPosSlotBits))
.equ NUM_BASE_PROBS , (Literal + kStartOffset)

.if kStartOffset != 0 // && IsMatch != 0
        .error "Stop_Compiling_Bad_StartOffset"
.endif

.if NUM_BASE_PROBS != 1984
        .error "Stop_Compiling_Bad_LZMA_PROBS"
.endif

.equ offset_lc , 0
.equ offset_lp , 1
.equ offset_pb , 2
.equ offset_dicSize , 4
.equ offset_probs , 4 + offset_dicSize
.equ offset_probs_1664 , 8 + offset_probs
.equ offset_dic , 8 + offset_probs_1664
.equ offset_dicBufSize , 8 + offset_dic
.equ offset_dicPos , 8 + offset_dicBufSize
.equ offset_buf , 8 + offset_dicPos
.equ offset_range , 8 + offset_buf
.equ offset_code , 4 + offset_range
.equ offset_processedPos , 4 + offset_code
.equ offset_checkDicSize , 4 + offset_processedPos
.equ offset_rep0 , 4 + offset_checkDicSize
.equ offset_rep1 , 4 + offset_rep0
.equ offset_rep2 , 4 + offset_rep1
.equ offset_rep3 , 4 + offset_rep2
.equ offset_state , 4 + offset_rep3
.equ offset_remainLen , 4 + offset_state
.equ offset_TOTAL_SIZE , 4 + offset_remainLen

.if offset_TOTAL_SIZE != 96
        .error "Incorrect offset_TOTAL_SIZE"
.endif

.macro IsMatchBranch_Pre
        # prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;
        and     pbPos, pbMask, processedPos, lsl #(kLenNumLowBits + 1 + PSHIFT)
        add     probs_state, probs_IsMatch, state_R
.endm

/*
.macro IsMatchBranch
        IsMatchBranch_Pre
        IF_BIT_1 probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label
.endm
*/

.macro CheckLimits
        cmp     buf, bufLimit
        jae     fin_OK
        cmp     dicPos, limit
        jae     fin_OK
.endm

#define CheckLimits_lit CheckLimits
/*
.macro CheckLimits_lit
        cmp     buf, bufLimit
        jae     fin_OK_lit
        cmp     dicPos, limit
        jae     fin_OK_lit
.endm
*/

#define PARAM_lzma     REG_ABI_PARAM_0
#define PARAM_limit    REG_ABI_PARAM_1
#define PARAM_bufLimit REG_ABI_PARAM_2

.macro LOAD_LZMA_VAR reg:req, struct_offs:req
        ldr     \reg, [PARAM_lzma, \struct_offs]
.endm

.macro LOAD_LZMA_BYTE reg:req, struct_offs:req
        ldrb    \reg, [PARAM_lzma, \struct_offs]
.endm

.macro LOAD_LZMA_PAIR reg0:req, reg1:req, struct_offs:req
        ldp     \reg0, \reg1, [PARAM_lzma, \struct_offs]
.endm
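
/*
  Editor's note: approximate C-side view of this entry point, matching the
  C version in LzmaDec.c (the exact calling-convention macro spelling may
  differ between 7-Zip versions):

      int LzmaDec_DecodeReal_3(CLzmaDec *p, SizeT limit, const Byte *bufLimit);

  PARAM_lzma / PARAM_limit / PARAM_bufLimit map these three arguments to the
  first three ABI parameter registers (x0-x2 on AArch64).
*/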
LzmaDec_DecodeReal_3:
_LzmaDec_DecodeReal_3:
/*
.LFB0:
        .cfi_startproc
*/
        stp     x19, x20, [sp, -128]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        str     PARAM_lzma, [sp, 120]

        mov     bufLimit, PARAM_bufLimit
        mov     limit, PARAM_limit

        LOAD_LZMA_PAIR dic, dicBufSize, offset_dic
        LOAD_LZMA_PAIR dicPos, buf, offset_dicPos
        LOAD_LZMA_PAIR rep0, rep1, offset_rep0
        LOAD_LZMA_PAIR rep2, rep3, offset_rep2

        mov     t0, 1 << (kLenNumLowBits + 1 + PSHIFT)
        LOAD_LZMA_BYTE pbMask, offset_pb
        p2_add  limit, dic
        mov     len, wzr // we could set it in all required branches instead
        lsl     pbMask, t0, pbMask
        p2_add  dicPos, dic
        p2_sub  pbMask, t0

        LOAD_LZMA_BYTE lc2_lpMask, offset_lc
        mov     t0, 256 << PSHIFT
        LOAD_LZMA_BYTE t1, offset_lp
        p2_add  t1, lc2_lpMask
        p2_sub  lc2_lpMask, (256 << PSHIFT) - PSHIFT
        shl     t0, t1
        p2_add  lc2_lpMask, t0

        LOAD_LZMA_VAR probs_Spec, offset_probs
        LOAD_LZMA_VAR checkDicSize, offset_checkDicSize
        LOAD_LZMA_VAR processedPos, offset_processedPos
        LOAD_LZMA_VAR state, offset_state
        // range is r0 : this load must be last; don't move it
        LOAD_LZMA_PAIR range, cod, offset_range
        mov     sym, wzr
        shl     state, PSHIFT

        add_big probs_IsMatch, probs_Spec, ((IsMatch - SpecPos) << PSHIFT)

        // if (processedPos != 0 || checkDicSize != 0)
        orr     t0, checkDicSize, processedPos
        cbz     t0, 1f
        add     t0_R, dicBufSize, dic
        cmp     dicPos, dic
        cmovne  t0_R, dicPos
        ldrb    sym, [t0_R, -1]
1:
        IsMatchBranch_Pre
        cmp     state, 4 * PMULT
        jb      lit_end
        cmp     state, kNumLitStates * PMULT
        jb      lit_matched_end
        jmp     lz_end


#define BIT_0 BIT_0_R prob_reg
#define BIT_1 BIT_1_R prob_reg
#define BIT_2 BIT_2_R prob_reg

# ---------- LITERAL ----------
MY_ALIGN_64
lit_start:
        mov     state, wzr
lit_start_2:
        LIT_PROBS

#ifdef _LZMA_SIZE_OPT
        PLOAD_2 prob_reg, probs, 1 * PMULT
        mov     sym, 1
        BIT_01
MY_ALIGN_FOR_LOOP
lit_loop:
        BIT_1
        tbz     sym, 7, lit_loop
#else
        BIT_0
        BIT_1
        BIT_1
        BIT_1
        BIT_1
        BIT_1
        BIT_1
#endif

        BIT_2
        IsMatchBranch_Pre
        strb    sym, [dicPos], 1
        p2_and  sym, 255

        CheckLimits_lit
lit_end:
        IF_BIT_0_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), lit_start
        # jmp     IsMatch_label

#define FLAG_STATE_BITS (4 + PSHIFT)

# ---------- MATCHES ----------
# MY_ALIGN_FOR_ENTRY
IsMatch_label:
        UPDATE_1 probs_state, pbPos_R, (IsMatch - IsMatch)
        IF_BIT_1 probs_state, 0, (IsRep - IsMatch), IsRep_label

        SET_probs LenCoder
        or      state, (1 << FLAG_STATE_BITS)

# ---------- LEN DECODE ----------
len_decode:
        mov     len, 8 - kMatchMinLen
        IF_BIT_0_NOUP_1 probs, len_mid_0
        UPDATE_1 probs, 0, 0
        p2_add  probs, (1 << (kLenNumLowBits + PSHIFT))
        mov     len, 0 - kMatchMinLen
        IF_BIT_0_NOUP_1 probs, len_mid_0
        UPDATE_1 probs, 0, 0
        p2_add  probs, LenHigh * PMULT - (1 << (kLenNumLowBits + PSHIFT))

#if 0 == 1
        BIT_0
        BIT_1
        BIT_1
        BIT_1
        BIT_1
        BIT_1
#else
        PLOAD_2 prob_reg, probs, 1 * PMULT
        mov     sym, 1
        BIT_01
MY_ALIGN_FOR_LOOP
len8_loop:
        BIT_1
        tbz     sym, 6, len8_loop
#endif

        mov     len, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - kMatchMinLen
        jmp     len_mid_2

MY_ALIGN_FOR_ENTRY
len_mid_0:
        UPDATE_0 probs, 0, 0
        p2_add  probs, pbPos_R
        BIT_0
len_mid_2:
        BIT_1
        BIT_2
        sub     len, sym, len
        tbz     state, FLAG_STATE_BITS, copy_match
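
/*
  Editor's sketch: the len_decode block above implements the LZMA length
  coder; roughly, in C:

      if (bit0(LenChoice))        len = kMatchMinLen      + Tree3(LenLow + posState);
      else if (bit0(LenChoice2))  len = kMatchMinLen + 8  + Tree3(LenLow + posState + 8);
      else                        len = kMatchMinLen + 16 + Tree8(LenHigh);

  where TreeN() is an N-bit forward tree decode (bit0()/TreeN() are
  shorthand, not LzmaDec.c names). The "mov len, ..." constants fold the
  leading 1 bit of the tree value into the final "sub len, sym, len".
*/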

# ---------- DECODE DISTANCE ----------
        // probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);

        mov     t0, 3 + kMatchMinLen
        cmp     len, 3 + kMatchMinLen
        cmovb   t0, len
        SET_probs PosSlot - (kMatchMinLen << (kNumPosSlotBits))
        add     probs, probs, t0_R, lsl #(kNumPosSlotBits + PSHIFT)

#ifdef _LZMA_SIZE_OPT
        PLOAD_2 prob_reg, probs, 1 * PMULT
        mov     sym, 1
        BIT_01
MY_ALIGN_FOR_LOOP
slot_loop:
        BIT_1
        tbz     sym, 5, slot_loop
#else
        BIT_0
        BIT_1
        BIT_1
        BIT_1
        BIT_1
#endif

#define numBits t4
        mov     numBits, sym
        BIT_2
        // we need only the low bits
        p2_and  sym, 3
        cmp     numBits, 32 + kEndPosModelIndex / 2
        jb      short_dist

        SET_probs kAlign

        # unsigned numDirectBits = (unsigned)(((distance >> 1) - 1));
        p2_sub  numBits, (32 + 1 + kNumAlignBits)
        # distance = (2 | (distance & 1));
        or      sym, 2

        PLOAD_2 prob_reg, probs, 1 * PMULT
        add     sym2_R, probs, 2 * PMULT

# ---------- DIRECT DISTANCE ----------

.macro DIRECT_1
        shr     range, 1
        subs    t0, cod, range
        p2_add  sym, sym
        // add     t1, sym, 1
        csel    cod, cod, t0, mi
        csinc   sym, sym, sym, mi
        // csel    sym, t1, sym, pl
        // adc     sym, sym, sym // not 100% compatible for "corrupted-allowed" LZMA streams
        dec_s   numBits
        je      direct_end
.endm

#ifdef _LZMA_SIZE_OPT

        jmp     direct_norm
MY_ALIGN_FOR_ENTRY
direct_loop:
        DIRECT_1
direct_norm:
        TEST_HIGH_BYTE_range
        jnz     direct_loop
        NORM_2
        jmp     direct_loop

#else

.macro DIRECT_2
        TEST_HIGH_BYTE_range
        jz      direct_unroll
        DIRECT_1
.endm

        DIRECT_2
        DIRECT_2
        DIRECT_2
        DIRECT_2
        DIRECT_2
        DIRECT_2
        DIRECT_2
        DIRECT_2

direct_unroll:
        NORM_2
        DIRECT_1
        DIRECT_1
        DIRECT_1
        DIRECT_1
        DIRECT_1
        DIRECT_1
        DIRECT_1
        DIRECT_1
        jmp     direct_unroll

#endif

MY_ALIGN_FOR_ENTRY
direct_end:
        shl     sym, kNumAlignBits
        REV_0   prob_reg
        REV_1   prob_reg, 2
        REV_1   prob_reg, 4
        REV_2   prob_reg, 8

decode_dist_end:
        // if (distance >= (checkDicSize == 0 ? processedPos : checkDicSize))
        tst     checkDicSize, checkDicSize
        csel    t0, processedPos, checkDicSize, eq
        cmp     sym, t0
        jae     end_of_payload
        // jmp     end_of_payload # for debug

        mov     rep3, rep2
        mov     rep2, rep1
        mov     rep1, rep0
        add     rep0, sym, 1

.macro STATE_UPDATE_FOR_MATCH
        // state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
        // cmp     state, (kNumStates + kNumLitStates) * PMULT
        cmp     state, kNumLitStates * PMULT + (1 << FLAG_STATE_BITS)
        mov     state, kNumLitStates * PMULT
        mov     t0, (kNumLitStates + 3) * PMULT
        cmovae  state, t0
.endm
        STATE_UPDATE_FOR_MATCH
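
/*
  Editor's sketch: the copy_match code below follows the C logic from
  LzmaDec.c, approximately:

      rem = limit - dicPos;                 // 0 -> output space exhausted
      curLen = (rem < len) ? (unsigned)rem : len;
      pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0);
      if (pos + curLen <= dicBufSize)
        do { dic[dicPos++] = dic[pos++]; } while (--curLen);  // fast path
      else
        ;  // source range wraps at dicBufSize -> byte loop (copy_match_cross)

  The asm additionally special-cases rep0 == 1 (copy_match_0: a run fill of
  one repeated byte).
*/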

# ---------- COPY MATCH ----------
copy_match:
        // if ((rem = limit - dicPos) == 0) break // return SZ_ERROR_DATA;
        subs    cnt_R, limit, dicPos
        // jz      fin_dicPos_LIMIT
        jz      fin_OK

        // curLen = ((rem < len) ? (unsigned)rem : len);
        cmp     cnt_R, len_R
        cmovae  cnt, len

        sub     t0_R, dicPos, dic
        p2_add  dicPos, cnt_R
        p2_add  processedPos, cnt
        p2_sub  len, cnt

        // pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0);
        p2_sub_s t0_R, rep0_R
        jae     1f
        cmn     t0_R, cnt_R
        p2_add  t0_R, dicBufSize
        ja      copy_match_cross
1:
# ---------- COPY MATCH FAST ----------
        # t0_R : src_pos
        p2_add  t0_R, dic
        ldrb    sym, [t0_R]
        p2_add  t0_R, cnt_R
        p1_neg  cnt_R

copy_common:
        dec     dicPos

        # dicPos : (ptr_to_last_dest_BYTE)
        # t0_R   : (src_lim)
        # cnt_R  : (-curLen)

        IsMatchBranch_Pre

        inc_s   cnt_R
        jz      copy_end

        cmp     rep0, 1
        je      copy_match_0

#ifdef LZMA_USE_2BYTES_COPY
        strb    sym, [dicPos, cnt_R]
        dec     dicPos
        # dicPos : (ptr_to_last_dest_16bitWORD)
        p2_and  cnt_R, -2
        ldrh    sym, [t0_R, cnt_R]
        adds    cnt_R, cnt_R, 2
        jz      2f
MY_ALIGN_FOR_LOOP
1:
        /*
        strh    sym, [dicPos, cnt_R]
        ldrh    sym, [t0_R, cnt_R]
        adds    cnt_R, cnt_R, 2
        jz      2f
        */
        strh    sym, [dicPos, cnt_R]
        ldrh    sym, [t0_R, cnt_R]
        adds    cnt_R, cnt_R, 2
        jnz     1b
2:
        /*
        // for universal little/big endian code, but slow
        strh    sym, [dicPos]
        inc     dicPos
        ldrb    sym, [t0_R, -1]
        */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        // we must improve big-endian detection for other compilers
        // for big-endian we need to reverse the bytes
        rev16   sym, sym
#endif
        // (sym) must be represented as little-endian here:
        strb    sym, [dicPos], 1
        shr     sym, 8
#else
MY_ALIGN_FOR_LOOP
1:
        strb    sym, [dicPos, cnt_R]
        ldrb    sym, [t0_R, cnt_R]
        inc_s   cnt_R
        jz      copy_end
        strb    sym, [dicPos, cnt_R]
        ldrb    sym, [t0_R, cnt_R]
        inc_s   cnt_R
        jnz     1b
#endif

copy_end:
lz_end_match:
        strb    sym, [dicPos], 1

        # IsMatchBranch_Pre
        CheckLimits
lz_end:
        IF_BIT_1_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label

# ---------- LITERAL MATCHED ----------

        LIT_PROBS

        // matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
        sub     t0_R, dicPos, dic
        p2_sub_s t0_R, rep0_R

#ifdef LZMA_USE_CMOV_LZ_WRAP
        add     t1_R, t0_R, dicBufSize
        cmovb   t0_R, t1_R
#else
        jae     1f
        p2_add  t0_R, dicBufSize
1:
#endif

        ldrb    match, [dic, t0_R]

        // state -= (state < 10) ? 3 : 6;
        sub     sym, state, 6 * PMULT
        cmp     state, 10 * PMULT
        p2_sub  state, 3 * PMULT
        cmovae  state, sym

#ifdef _LZMA_SIZE_OPT
        mov     offs, 256 * PMULT
        shl     match, (PSHIFT + 1)
        mov     sym, 1
        and     bit, match, offs
        add     prm, probs, offs_R
MY_ALIGN_FOR_LOOP
litm_loop:
        LITM
        tbz     sym, 8, litm_loop
#else
        LITM_0
        LITM
        LITM
        LITM
        LITM
        LITM
        LITM
        LITM_2
#endif

        IsMatchBranch_Pre
        strb    sym, [dicPos], 1
        p2_and  sym, 255
        // mov     len, wzr // LITM uses the same register (len / offs), so we clear it

        CheckLimits_lit
lit_matched_end:
        IF_BIT_1_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label
        # IsMatchBranch
        p2_sub  state, 3 * PMULT
        jmp     lit_start_2

# ---------- REP 0 LITERAL ----------
MY_ALIGN_FOR_ENTRY
IsRep0Short_label:
        UPDATE_0 probs_state, pbPos_R, 0

        // dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
        sub     t0_R, dicPos, dic

        // state = state < kNumLitStates ? 9 : 11;
        or      state, 1 * PMULT

        # the caller doesn't allow the (dicPos >= limit) case for REP_SHORT
        # so we don't need the following (dicPos == limit) check here:
        #       cmp     dicPos, limit
        #       jae     fin_dicPos_LIMIT_REP_SHORT
        # // jmp     fin_dicPos_LIMIT_REP_SHORT // for testing/debug purposes

        inc     processedPos

        IsMatchBranch_Pre

        p2_sub_s t0_R, rep0_R
#ifdef LZMA_USE_CMOV_LZ_WRAP
        add     sym_R, t0_R, dicBufSize
        cmovb   t0_R, sym_R
#else
        jae     1f
        p2_add  t0_R, dicBufSize
1:
#endif

        ldrb    sym, [dic, t0_R]
        // mov     len, wzr
        jmp     lz_end_match
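
/*
  Editor's sketch: the IsRep* ladder below selects one of the four most
  recent distances, as in LzmaDec.c:

      if      (bit0(IsRepG0)) distance = rep0;  // incl. the IsRep0Long
                                                // "short rep" literal case
      else if (bit0(IsRepG1)) distance = rep1;
      else if (bit0(IsRepG2)) distance = rep2;
      else                    distance = rep3;
      // the chosen distance moves to rep0 and the others shift down

  (bit0() is shorthand for "the range-decoded bit was 0", not an LzmaDec.c
  name.)
*/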
MY_ALIGN_FOR_ENTRY
IsRep_label:
        UPDATE_1 probs_state, 0, (IsRep - IsMatch)

        # The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode.
        # So we don't check it here.
        # mov     t0, processedPos
        # or      t0, checkDicSize
        # jz      fin_ERROR_2

        // state = state < kNumLitStates ? 8 : 11;
        cmp     state, kNumLitStates * PMULT
        mov     state, 8 * PMULT
        mov     probBranch, 11 * PMULT
        cmovae  state, probBranch

        SET_probs RepLenCoder

        IF_BIT_1 probs_state, 0, (IsRepG0 - IsMatch), IsRepG0_label
        sub_big probs_state, probs_state, (IsMatch - IsRep0Long) << PSHIFT
        IF_BIT_0_NOUP probs_state, pbPos_R, 0, IsRep0Short_label
        UPDATE_1 probs_state, pbPos_R, 0
        jmp     len_decode

MY_ALIGN_FOR_ENTRY
IsRepG0_label:
        UPDATE_1 probs_state, 0, (IsRepG0 - IsMatch)
        IF_BIT_1 probs_state, 0, (IsRepG1 - IsMatch), IsRepG1_label
        mov     dist, rep1
        mov     rep1, rep0
        mov     rep0, dist
        jmp     len_decode

# MY_ALIGN_FOR_ENTRY
IsRepG1_label:
        UPDATE_1 probs_state, 0, (IsRepG1 - IsMatch)
        IF_BIT_1 probs_state, 0, (IsRepG2 - IsMatch), IsRepG2_label
        mov     dist, rep2
        mov     rep2, rep1
        mov     rep1, rep0
        mov     rep0, dist
        jmp     len_decode

# MY_ALIGN_FOR_ENTRY
IsRepG2_label:
        UPDATE_1 probs_state, 0, (IsRepG2 - IsMatch)
        mov     dist, rep3
        mov     rep3, rep2
        mov     rep2, rep1
        mov     rep1, rep0
        mov     rep0, dist
        jmp     len_decode

# ---------- SPEC SHORT DISTANCE ----------

MY_ALIGN_FOR_ENTRY
short_dist:
        p2_sub_s numBits, 32 + 1
        jbe     decode_dist_end
        or      sym, 2
        shl     sym, numBits
        add     sym_R, probs_Spec, sym_R, lsl #PSHIFT
        p2_add  sym_R, SpecPos * PMULT + 1 * PMULT
        mov     sym2, PMULT // # step
MY_ALIGN_FOR_LOOP
spec_loop:
        REV_1_VAR prob_reg
        dec_s   numBits
        jnz     spec_loop

        p2_add  sym2_R, probs_Spec
        .if SpecPos != 0
                p2_add  sym2_R, SpecPos * PMULT
        .endif
        p2_sub  sym_R, sym2_R
        shr     sym, PSHIFT
        jmp     decode_dist_end

# ---------- COPY MATCH 0 ----------
MY_ALIGN_FOR_ENTRY
copy_match_0:
#ifdef LZMA_USE_4BYTES_FILL
        strb    sym, [dicPos, cnt_R]
        inc_s   cnt_R
        jz      copy_end

        strb    sym, [dicPos, cnt_R]
        inc_s   cnt_R
        jz      copy_end

        strb    sym, [dicPos, cnt_R]
        inc_s   cnt_R
        jz      copy_end

        orr     t3, sym, sym, lsl 8
        p2_and  cnt_R, -4
        orr     t3, t3, t3, lsl 16
MY_ALIGN_FOR_LOOP_16
1:
        /*
        str     t3, [dicPos, cnt_R]
        adds    cnt_R, cnt_R, 4
        jz      2f
        */
        str     t3, [dicPos, cnt_R]
        adds    cnt_R, cnt_R, 4
        jnz     1b
2:
        // p2_and  sym, 255
#else
MY_ALIGN_FOR_LOOP
1:
        strb    sym, [dicPos, cnt_R]
        inc_s   cnt_R
        jz      copy_end
        strb    sym, [dicPos, cnt_R]
        inc_s   cnt_R
        jnz     1b
#endif
        jmp     copy_end

# ---------- COPY MATCH CROSS ----------
copy_match_cross:
        # t0_R  - src pos
        # cnt_R - total copy len

        p1_neg  cnt_R
1:
        ldrb    sym, [dic, t0_R]
        inc     t0_R
        strb    sym, [dicPos, cnt_R]
        inc     cnt_R
        cmp     t0_R, dicBufSize
        jne     1b

        ldrb    sym, [dic]
        sub     t0_R, dic, cnt_R
        jmp     copy_common

/*
fin_dicPos_LIMIT_REP_SHORT:
        mov     len, 1
        jmp     fin_OK
*/

/*
fin_dicPos_LIMIT:
        jmp     fin_OK
        # For a stricter mode we could stop decoding with an error:
        # mov     sym, 1
        # jmp     fin
*/

fin_ERROR_MATCH_DIST:
        # rep0 = distance + 1;
        p2_add  len, kMatchSpecLen_Error_Data
        mov     rep3, rep2
        mov     rep2, rep1
        mov     rep1, rep0
        mov     rep0, sym
        STATE_UPDATE_FOR_MATCH
        # jmp     fin_OK
        mov     sym, 1
        jmp     fin

end_of_payload:
        inc_s   sym
        jnz     fin_ERROR_MATCH_DIST

        mov     len, kMatchSpecLenStart
        xor     state, (1 << FLAG_STATE_BITS)
        jmp     fin_OK

/*
fin_OK_lit:
        mov     len, wzr
*/

fin_OK:
        mov     sym, wzr

fin:
        NORM

#define fin_lzma_reg t0_R

.macro STORE_LZMA_VAR reg:req, struct_offs:req
        str     \reg, [fin_lzma_reg, \struct_offs]
.endm

.macro STORE_LZMA_PAIR reg0:req, reg1:req, struct_offs:req
        stp     \reg0, \reg1, [fin_lzma_reg, \struct_offs]
.endm

        ldr     fin_lzma_reg, [sp, 120]
        p2_sub  dicPos, dic
        shr     state, PSHIFT

        STORE_LZMA_PAIR dicPos, buf, offset_dicPos
        STORE_LZMA_PAIR range, cod, offset_range
        STORE_LZMA_VAR  processedPos, offset_processedPos
        STORE_LZMA_PAIR rep0, rep1, offset_rep0
        STORE_LZMA_PAIR rep2, rep3, offset_rep2
        STORE_LZMA_PAIR state, len, offset_state

        mov     w0, sym

        ldp     x29, x30, [sp, 80]
        ldp     x27, x28, [sp, 64]
        ldp     x25, x26, [sp, 48]
        ldp     x23, x24, [sp, 32]
        ldp     x21, x22, [sp, 16]
        ldp     x19, x20, [sp], 128
        ret
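
/*
  Editor's note: as in the C LzmaDec_DecodeReal_3(), the result in w0 is
  0 (SZ_OK) on success and 1 (SZ_ERROR_DATA) for a broken stream; the
  decoder state (dicPos, buf, range/code, reps, state, remainLen) has been
  stored back to the CLzmaDec structure just above.
*/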

/*
        .cfi_endproc
.LFE0:
        .size LzmaDec_DecodeReal_3, .-LzmaDec_DecodeReal_3
        .ident "TAG_LZMA"
        .section .note.GNU-stack,"",@progbits
*/