git subrepo pull (merge) --force deps/lightning
[pcsx_rearmed.git] / deps / lightning / lib / jit_arm-cpu.c
index b6ee260..a0852a2 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2012-2019  Free Software Foundation, Inc.
+ * Copyright (C) 2012-2023  Free Software Foundation, Inc.
  *
  * This file is part of GNU lightning.
  *
@@ -36,8 +36,8 @@
 #  define jit_armv5_p()                        (jit_cpu.version >= 5)
 #  define jit_armv5e_p()               (jit_cpu.version > 5 || (jit_cpu.version == 5 && jit_cpu.extend))
 #  define jit_armv6_p()                        (jit_cpu.version >= 6)
-#  define jit_armv7r_p()               0
-#  define stack_framesize              48
+#  define jit_armv7_p()                        (jit_cpu.version >= 7)
+#  define jit_armv7r_p()               (jit_cpu.version > 7 || (jit_cpu.version == 7 && jit_cpu.extend))
 extern int     __aeabi_idivmod(int, int);
 extern unsigned        __aeabi_uidivmod(unsigned, unsigned);
 #  define _R0_REGNO                    0x00
@@ -140,8 +140,12 @@ extern unsigned    __aeabi_uidivmod(unsigned, unsigned);
 #  define THUMB2_UMULL                 0xfba00000
 #  define ARM_SMULL                    0x00c00090
 #  define THUMB2_SMULL                 0xfb800000
+/* >> ARMv7r */
+#  define ARM_SDIV                     0x07100010
+#  define ARM_UDIV                     0x07300010
 #  define THUMB2_SDIV                  0xfb90f0f0
 #  define THUMB2_UDIV                  0xfbb0f0f0
+/* << ARMv7r */
 #  define ARM_AND                      0x00000000
 #  define THUMB_AND                        0x4000
 #  define THUMB2_AND                   0xea000000
@@ -179,7 +183,29 @@ extern unsigned    __aeabi_uidivmod(unsigned, unsigned);
 #  define ARM_XTR8                     0x00000400 /* ?xt? rotate 8 bits */
 #  define ARM_XTR16                    0x00000800 /* ?xt? rotate 16 bits */
 #  define ARM_XTR24                    0x00000c00 /* ?xt? rotate 24 bits */
+#  define ARM_LDREX                    0x01900090
+#  define THUMB2_LDREX                 0xe8500000
+#  define ARM_STREX                    0x01800090
+#  define THUMB2_STREX                 0xe8400000
 /* << ARMv6* */
+/* >> ARMv6t2 */
+#  define THUMB2_CLZ                   0xfab0f080
+#  define THUMB2_RBIT                  0xfa90f0a0
+#  define ARM_RBIT                     0x06f00030
+/* << ARMv6t2 */
+#  define ARM_CLZ                      0x01600010
+/* >> ARMv7 */
+#  define ARM_DMB                      0xf57ff050
+#  define THUMB2_DMB                   0xf3bf8f50
+#  define DMB_SY                       0xf
+#  define DMB_ST                       0xe
+#  define DMB_ISH                      0xb
+#  define DMB_ISHST                    0xa
+#  define DMB_NSH                      0x7
+#  define DMB_NSHT                     0x6
+#  define DMB_OSH                      0x3
+#  define DMB_OSHST                    0x2
+/* << ARMv7 */
 #  define ARM_SHIFT                    0x01a00000
 #  define ARM_R                                0x00000010 /* register shift */
 #  define ARM_LSL                      0x00000000
@@ -399,6 +425,12 @@ static void _tcit(jit_state_t*,unsigned int,int);
 static void _tpp(jit_state_t*,int,int);
 #  define torl(o,rn,im)                        _torl(_jit,o,rn,im)
 static void _torl(jit_state_t*,int,int,int) maybe_unused;
+#  define DMB(im)                      dmb(im)
+#  define T2_DMB(im)                   tdmb(im)
+#  define dmb(im)                      _dmb(_jit, im)
+static void _dmb(jit_state_t *_jit, int im);
+#  define tdmb(im)                     _tdmb(_jit, im)
+static void _tdmb(jit_state_t *_jit, int im);
 #  define CC_MOV(cc,rd,rm)             corrr(cc,ARM_MOV,0,rd,rm)
 #  define MOV(rd,rm)                   CC_MOV(ARM_CC_AL,rd,rm)
 #  define T1_MOV(rd,rm)                        is(THUMB_MOV|((_u4(rd)&8)<<4)|(_u4(rm)<<3)|(rd&7))
@@ -424,6 +456,12 @@ static void _torl(jit_state_t*,int,int,int) maybe_unused;
 #  define NOT(rd,rm)                   CC_NOT(ARM_CC_AL,rd,rm)
 #  define T1_NOT(rd,rm)                        T1_MVN(rd,rm)
 #  define T2_NOT(rd,rm)                        T2_MVN(rd,rm)
+#  define T2_CLZ(rd,rm)                        torrr(THUMB2_CLZ,rm,rd,rm)
+#  define CC_CLZ(cc,rd,rm)             corrrr(cc,ARM_CLZ,_R15_REGNO,rd,_R15_REGNO,rm)
+#  define CLZ(rd,rm)                   CC_CLZ(ARM_CC_AL,rd,rm)
+#  define T2_RBIT(rd,rm)               torrr(THUMB2_RBIT,rm,rd,rm)
+#  define CC_RBIT(cc,rd,rm)            corrrr(cc,ARM_RBIT,_R15_REGNO,rd,_R15_REGNO,rm)
+#  define RBIT(rd,rm)                  CC_RBIT(ARM_CC_AL,rd,rm)
 #  define NOP()                                MOV(_R0_REGNO, _R0_REGNO)
 #  define T1_NOP()                     is(0xbf00)
 #  define CC_ADD(cc,rd,rn,rm)          corrr(cc,ARM_ADD,rn,rd,rm)
@@ -501,6 +539,10 @@ static void _torl(jit_state_t*,int,int,int) maybe_unused;
 #  define CC_UMULL(cc,rl,rh,rn,rm)     corrrr(cc,ARM_UMULL,rh,rl,rm,rn)
 #  define UMULL(rl,rh,rn,rm)           CC_UMULL(ARM_CC_AL,rl,rh,rn,rm)
 #  define T2_UMULL(rl,rh,rn,rm)                torrrr(THUMB2_UMULL,rn,rl,rh,rm)
+#  define CC_SDIV(cc,rd,rn,rm)         corrrr(cc,ARM_SDIV,rd,15,rn,rm)
+#  define SDIV(rd,rn,rm)               CC_SDIV(ARM_CC_AL,rd,rm,rn)
+#  define CC_UDIV(cc,rd,rn,rm)         corrrr(cc,ARM_UDIV,rd,15,rn,rm)
+#  define UDIV(rd,rn,rm)               CC_UDIV(ARM_CC_AL,rd,rm,rn)
 #  define T2_SDIV(rd,rn,rm)            torrr(THUMB2_SDIV,rn,rd,rm)
 #  define T2_UDIV(rd,rn,rm)            torrr(THUMB2_UDIV,rn,rd,rm)
 #  define CC_AND(cc,rd,rn,rm)          corrr(cc,ARM_AND,rn,rd,rm)
@@ -612,7 +654,7 @@ static void _torl(jit_state_t*,int,int,int) maybe_unused;
 #  define CMNI(rn,im)                  CC_CMNI(ARM_CC_AL,rn,im)
 #  define T2_CMNI(rn,im)               torri(THUMB2_CMNI,rn,_R15_REGNO,im)
 #  define CC_TST(cc,rn,rm)             corrr(cc,ARM_TST,rn,r0,rm)
-#  define TST(rn,rm)                   CC_TST(ARM_CC_AL,rn,rm)
+#  define TST(rn,rm)                   corrr(ARM_CC_AL,ARM_TST,rn,0,rm)
 #  define T1_TST(rn,rm)                        is(THUMB_TST|(_u3(rm)<<3)|_u3(rn))
 #  define T2_TST(rn,rm)                        torrr(THUMB2_TST,rn,_R15_REGNO,rm)
 #  define CC_TSTI(cc,rn,im)            corri(cc,ARM_TST|ARM_I,rn,0,im)
@@ -718,6 +760,9 @@ static void _torl(jit_state_t*,int,int,int) maybe_unused;
 #  define CC_LDRDIN(cc,rt,rn,im)       corri8(cc,ARM_LDRDI,rn,rt,im)
 #  define LDRDIN(rt,rn,im)             CC_LDRDIN(ARM_CC_AL,rt,rn,im)
 #  define T2_LDRDIN(rt,rt2,rn,im)      torrri8(THUMB2_LDRDI,rn,rt,rt2,im)
+#  define CC_LDREX(cc,rt,rn)           corrrr(cc,ARM_LDREX,rn,rt,0xf,0xf)
+#  define LDREX(rt,rn)                 CC_LDREX(ARM_CC_AL,rt,rn)
+#  define T2_LDREX(rt,rn,im)           torrri8(THUMB2_LDREX,rn,rt,0xf,im)
 #  define CC_STRB(cc,rt,rn,rm)         corrr(cc,ARM_STRB|ARM_P,rn,rt,rm)
 #  define STRB(rt,rn,rm)               CC_STRB(ARM_CC_AL,rt,rn,rm)
 #  define T1_STRB(rt,rn,rm)            is(THUMB_STRB|(_u3(rm)<<6)|(_u3(rn)<<3)|_u3(rt))
@@ -771,6 +816,9 @@ static void _torl(jit_state_t*,int,int,int) maybe_unused;
 #  define CC_STRDIN(cc,rt,rn,im)       corri8(cc,ARM_STRDI,rn,rt,im)
 #  define STRDIN(rt,rn,im)             CC_STRDIN(ARM_CC_AL,rt,rn,im)
 #  define T2_STRDIN(rt,rt2,rn,im)      torrri8(THUMB2_STRDI,rn,rt,rt2,im)
+#  define CC_STREX(cc,rd,rt,rn)                corrrr(cc,ARM_STREX,rn,rd,0xf,rt)
+#  define STREX(rd,rt,rn)              CC_STREX(ARM_CC_AL,rd,rt,rn)
+#  define T2_STREX(rd,rt,rn,im)                torrri8(THUMB2_STREX,rn,rt,rd,im)
 #  define CC_LDMIA(cc,rn,im)           corl(cc,ARM_M|ARM_M_L|ARM_M_I,rn,im)
 #  define LDMIA(rn,im)                 CC_LDMIA(ARM_CC_AL,rn,im)
 #  define CC_LDM(cc,rn,im)             CC_LDMIA(cc,rn,im)
@@ -823,6 +871,8 @@ static void _torl(jit_state_t*,int,int,int) maybe_unused;
 #  define T2_POP(im)                   tpp(THUMB2_POP,im)
 #  define jit_get_reg_args()                                           \
     do {                                                               \
+       CHECK_REG_ARGS();                                               \
+       jit_check_frame();                                              \
        (void)jit_get_reg(_R0|jit_class_named|jit_class_gpr);           \
        (void)jit_get_reg(_R1|jit_class_named|jit_class_gpr);           \
        (void)jit_get_reg(_R2|jit_class_named|jit_class_gpr);           \
@@ -843,10 +893,27 @@ static void _movr(jit_state_t*,jit_int32_t,jit_int32_t);
 static void _movi(jit_state_t*,jit_int32_t,jit_word_t);
 #  define movi_p(r0,i0)                        _movi_p(_jit,r0,i0)
 static jit_word_t _movi_p(jit_state_t*,jit_int32_t,jit_word_t);
+#  define movnr(r0,r1,r2)              _movnr(_jit,r0,r1,r2)
+static void _movnr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
+#  define movzr(r0,r1,r2)              _movzr(_jit,r0,r1,r2)
+static void _movzr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
+#  define casx(r0, r1, r2, r3, i0)     _casx(_jit, r0, r1, r2, r3, i0)
+static void _casx(jit_state_t *_jit,jit_int32_t,jit_int32_t,
+                 jit_int32_t,jit_int32_t,jit_word_t);
+#define casr(r0, r1, r2, r3)           casx(r0, r1, r2, r3, 0)
+#define casi(r0, i0, r1, r2)           casx(r0, _NOREG, r1, r2, i0)
 #  define comr(r0,r1)                  _comr(_jit,r0,r1)
 static void _comr(jit_state_t*,jit_int32_t,jit_int32_t);
 #  define negr(r0,r1)                  _negr(_jit,r0,r1)
 static void _negr(jit_state_t*,jit_int32_t,jit_int32_t);
+#  define clor(r0, r1)                 _clor(_jit, r0, r1)
+static void _clor(jit_state_t*, jit_int32_t, jit_int32_t);
+#  define clzr(r0, r1)                 _clzr(_jit, r0, r1)
+static void _clzr(jit_state_t*, jit_int32_t, jit_int32_t);
+#  define ctor(r0, r1)                 _ctor(_jit, r0, r1)
+static void _ctor(jit_state_t*, jit_int32_t, jit_int32_t);
+#  define ctzr(r0, r1)                 _ctzr(_jit, r0, r1)
+static void _ctzr(jit_state_t*, jit_int32_t, jit_int32_t);
 #  define addr(r0,r1,r2)               _addr(_jit,r0,r1,r2)
 static void _addr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
 #  define addi(r0,r1,i0)               _addi(_jit,r0,r1,i0)
@@ -1091,15 +1158,10 @@ static void _sti_i(jit_state_t*,jit_word_t,jit_int32_t);
 static void _stxr_i(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
 #  define stxi_i(r0,r1,i0)             _stxi_i(_jit,r0,r1,i0)
 static void _stxi_i(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
-#  if __BYTE_ORDER == __LITTLE_ENDIAN
-#  define htonr_us(r0,r1)              _htonr_us(_jit,r0,r1)
-static void _htonr_us(jit_state_t*,jit_int32_t,jit_int32_t);
-#  define htonr_ui(r0,r1)              _htonr_ui(_jit,r0,r1)
-static void _htonr_ui(jit_state_t*,jit_int32_t,jit_int32_t);
-#  else
-#    define htonr_us(r0,r1)            extr_us(r0,r1)
-#    define htonr(r0,r1)               movr(r0,r1)
-#  endif
+#  define bswapr_us(r0,r1)             _bswapr_us(_jit,r0,r1)
+static void _bswapr_us(jit_state_t*,jit_int32_t,jit_int32_t);
+#  define bswapr_ui(r0,r1)             _bswapr_ui(_jit,r0,r1)
+static void _bswapr_ui(jit_state_t*,jit_int32_t,jit_int32_t);
 #  define extr_c(r0,r1)                        _extr_c(_jit,r0,r1)
 static void _extr_c(jit_state_t*,jit_int32_t,jit_int32_t);
 #  define extr_uc(r0,r1)               _extr_uc(_jit,r0,r1)
@@ -1114,10 +1176,10 @@ static void _prolog(jit_state_t*,jit_node_t*);
 static void _epilog(jit_state_t*,jit_node_t*);
 #  define callr(r0)                    _callr(_jit,r0)
 static void _callr(jit_state_t*,jit_int32_t);
-#  define calli(i0)                    _calli(_jit,i0)
-static void _calli(jit_state_t*,jit_word_t);
-#  define calli_p(i0)                  _calli_p(_jit,i0)
-static jit_word_t _calli_p(jit_state_t*,jit_word_t);
+#  define calli(i0,i1)                 _calli(_jit,i0,i1)
+static void _calli(jit_state_t*,jit_word_t,jit_bool_t);
+#  define calli_p(i0,i1)               _calli_p(_jit,i0,i1)
+static jit_word_t _calli_p(jit_state_t*,jit_word_t,jit_bool_t);
 #  define vastart(r0)                  _vastart(_jit, r0)
 static void _vastart(jit_state_t*, jit_int32_t);
 #  define vaarg(r0, r1)                        _vaarg(_jit, r0, r1)
@@ -1493,7 +1555,7 @@ _tpp(jit_state_t *_jit, int o, int im)
     assert(!(o & 0x0000ffff));
     if (o == THUMB2_PUSH)
        assert(!(im & 0x8000));
-    assert(__builtin_popcount(im & 0x1fff) > 1);
+    assert(__builtin_popcount(im & 0x7fff) > 1);
     thumb.i = o|im;
     iss(thumb.s[0], thumb.s[1]);
 }
@@ -1509,6 +1571,22 @@ _torl(jit_state_t *_jit, int o, int rn, int im)
     iss(thumb.s[0], thumb.s[1]);
 }
 
+static void
+_dmb(jit_state_t *_jit, int im)
+{
+    assert(!(im & 0xfffffff0));
+    ii(ARM_DMB|im);
+}
+
+static void
+_tdmb(jit_state_t *_jit, int im)
+{
+    jit_thumb_t        thumb;
+    assert(!(im & 0xfffffff0));
+    thumb.i = THUMB2_DMB | im;
+    iss(thumb.s[0], thumb.s[1]);
+}
+
 static void
 _nop(jit_state_t *_jit, jit_int32_t i0)
 {
@@ -1582,6 +1660,86 @@ _movi_p(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0)
     return (w);
 }
 
+static void
+_movznr(jit_state_t *_jit, int ct, jit_int32_t r0,
+        jit_int32_t r1, jit_int32_t r2)
+{
+    if (jit_thumb_p()) {
+        if (r2 < 7)
+            T1_CMPI(r2, 0);
+        else
+            T2_CMPI(r2, 0);
+        IT(ct);
+        T1_MOV(r0, r1);
+    } else {
+        CMPI(r2, 0);
+        CC_MOV(ct, r0, r1);
+    }
+}
+
+static void
+_movnr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
+{
+    _movznr(_jit, ARM_CC_NE, r0, r1, r2);
+}
+
+static void
+_movzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
+{
+    _movznr(_jit, ARM_CC_EQ, r0, r1, r2);
+}
+
+static void
+_casx(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+      jit_int32_t r2, jit_int32_t r3, jit_word_t i0)
+{
+    jit_int32_t                r1_reg, iscasi;
+    jit_word_t         retry, done, jump0, jump1;
+    if (!jit_armv7_p())
+       fallback_casx(r0, r1, r2, r3, i0);
+    else {
+       if ((iscasi = (r1 == _NOREG))) {
+           r1_reg = jit_get_reg(jit_class_gpr);
+           r1 = rn(r1_reg);
+           movi(r1, i0);
+       }
+       if (jit_thumb_p()) {
+           T2_DMB(DMB_ISH);
+           /* retry: */
+           retry = _jit->pc.w;
+           T2_LDREX(r0, r1, 0);
+           eqr(r0, r0, r2);
+           jump0 = beqi(_jit->pc.w, r0, 0);    /* beqi done r0 0 */
+           T2_STREX(r0, r3, r1, 0);
+           jump1 = bnei(_jit->pc.w, r0, 0);    /* bnei retry r0 0 */
+           /* r0 = 0 if memory updated, 1 otherwise */
+           xori(r0, r0, 1);
+           /* done: */
+           done = _jit->pc.w;
+           T2_DMB(DMB_ISH);
+       }
+       else {
+           DMB(DMB_ISH);
+           /* retry: */
+           retry = _jit->pc.w;
+           LDREX(r0, r1);
+           eqr(r0, r0, r2);
+           jump0 = beqi(_jit->pc.w, r0, 0);    /* beqi done r0 0 */
+           STREX(r0, r3, r1);
+           jump1 = bnei(_jit->pc.w, r0, 0);    /* bnei retry r0 0 */
+           /* r0 = 0 if memory updated, 1 otherwise */
+           xori(r0, r0, 1);
+           /* done: */
+           done = _jit->pc.w;
+           DMB(DMB_ISH);
+       }
+       patch_at(arm_patch_jump, jump0, done);
+       patch_at(arm_patch_jump, jump1, retry);
+       if (iscasi)
+           jit_unget_reg(r1_reg);
+    }
+}
+
 static void
 _comr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
 {
@@ -1608,6 +1766,53 @@ _negr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
        RSBI(r0, r1, 0);
 }
 
+static void
+_clzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
+{
+    if (!jit_thumb_p() && jit_armv5e_p())
+       CLZ(r0, r1);
+    else if (jit_thumb_p() && jit_armv7_p()) { /* armv6t2 actually */
+       T2_CLZ(r0, r1);
+    }
+    else
+       fallback_clz(r0, r0);
+}
+
+static void
+_clor(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
+{
+    comr(r0, r1);
+    clzr(r0, r0);
+}
+
+static void
+_ctor(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
+{
+    if (jit_armv7_p()) {       /* armv6t2 actually */
+       if (jit_thumb_p())
+           T2_RBIT(r0, r1);
+       else
+           RBIT(r0, r1);
+       clor(r0, r0);
+    }
+    else
+       fallback_cto(r0, r1);
+}
+
+static void
+_ctzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
+{
+    if (jit_armv7_p()) {       /* armv6t2 actually */
+       if (jit_thumb_p())
+           T2_RBIT(r0, r1);
+       else
+           RBIT(r0, r1);
+       clzr(r0, r0);
+    }
+    else
+       fallback_ctz(r0, r1);
+}
+
 static void
 _addr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
 {
@@ -2103,8 +2308,12 @@ _divrem(jit_state_t *_jit, int div, int sign,
 static void
 _divr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
 {
-    if (jit_armv7r_p() && jit_thumb_p())
-       T2_SDIV(r0, r1, r2);
+    if (jit_armv7r_p()) {
+       if (jit_thumb_p())
+           T2_SDIV(r0, r1, r2);
+       else
+           SDIV(r0, r1, r2);
+    }
     else
        divrem(1, 1, r0, r1, r2);
 }
@@ -2122,8 +2331,12 @@ _divi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
 static void
 _divr_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
 {
-    if (jit_armv7r_p() && jit_thumb_p())
-       T2_UDIV(r0, r1, r2);
+    if (jit_armv7r_p()) {
+       if (jit_thumb_p())
+           T2_UDIV(r0, r1, r2);
+       else
+           UDIV(r0, r1, r2);
+    }
     else
        divrem(1, 0, r0, r1, r2);
 }
@@ -2183,7 +2396,23 @@ _iqdivi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
 static void
 _remr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
 {
-    divrem(0, 1, r0, r1, r2);
+    if (jit_armv7r_p()) {
+       jit_int32_t             reg;
+       if (r0 == r1 || r0 == r2) {
+           reg = jit_get_reg(jit_class_gpr);
+           divr(rn(reg), r1, r2);
+           mulr(rn(reg), r2, rn(reg));
+           subr(r0, r1, rn(reg));
+           jit_unget_reg(reg);
+       }
+       else {
+           divr(r0, r1, r2);
+           mulr(r0, r2, r0);
+           subr(r0, r1, r0);
+       }
+    }
+    else
+       divrem(0, 1, r0, r1, r2);
 }
 
 static void
@@ -2199,7 +2428,23 @@ _remi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
 static void
 _remr_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
 {
-    divrem(0, 0, r0, r1, r2);
+    if (jit_armv7r_p()) {
+       jit_int32_t             reg;
+       if (r0 == r1 || r0 == r2) {
+           reg = jit_get_reg(jit_class_gpr);
+           divr_u(rn(reg), r1, r2);
+           mulr(rn(reg), r2, rn(reg));
+           subr(r0, r1, rn(reg));
+           jit_unget_reg(reg);
+       }
+       else {
+           divr_u(r0, r1, r2);
+           mulr(r0, r2, r0);
+           subr(r0, r1, r0);
+       }
+    }
+    else
+       divrem(0, 0, r0, r1, r2);
 }
 
 static void
@@ -2612,8 +2857,8 @@ _jmpi_p(jit_state_t *_jit, jit_word_t i0, jit_bool_t i1)
     jit_word_t         w;
     jit_word_t         d;
     jit_int32_t                reg;
+    /* i1 means jump is reachable in signed 24 bits  */
     if (i1) {
-       /* Assume jump is not longer than 23 bits if inside jit */
        w = _jit->pc.w;
        /* if thumb and in thumb mode */
        if (jit_thumb_p() && _jitc->thumb) {
@@ -3576,11 +3821,9 @@ _stxi_i(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
     }
 }
 
-#  if __BYTE_ORDER == __LITTLE_ENDIAN
 static void
-_htonr_us(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
+_bswapr_us(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
 {
-    jit_int32_t                t0;
     if (jit_thumb_p()) {
        if ((r0|r1) < 8)
            T1_REV(r0, r1);
@@ -3594,20 +3837,14 @@ _htonr_us(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
            rshi_u(r0, r0, 16);
        }
        else {
-           t0 = jit_get_reg(jit_class_gpr);
-           rshi(rn(t0), r1, 8);
-           andi(r0, r1, 0xff);
-           andi(rn(t0), rn(t0), 0xff);
-           lshi(r0, r0, 8);
-           orr(r0, r0, rn(t0));
-           jit_unget_reg(t0);
+               generic_bswapr_us(_jit, r0, r1);
        }
     }
 }
 
 /* inline glibc htonl (without register clobber) */
 static void
-_htonr_ui(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
+_bswapr_ui(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
 {
     jit_int32_t                reg;
     if (jit_thumb_p()) {
@@ -3629,7 +3866,6 @@ _htonr_ui(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
        }
     }
 }
-#endif
 
 static void
 _extr_c(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
@@ -3715,14 +3951,29 @@ _callr(jit_state_t *_jit, jit_int32_t r0)
 }
 
 static void
-_calli(jit_state_t *_jit, jit_word_t i0)
+_calli(jit_state_t *_jit, jit_word_t i0, jit_bool_t exchange_p)
 {
     jit_word_t         d;
     jit_int32_t                reg;
-    d = ((i0 - _jit->pc.w) >> 2) - 2;
-    if (!jit_exchange_p() && !jit_thumb_p() && _s24P(d))
-       BLI(d & 0x00ffffff);
+    if (!exchange_p) {
+       if (jit_thumb_p()) {
+           if (jit_exchange_p())
+               /* skip switch from  arm to thumb 
+                * exchange_p set to zero means a jit function
+                * call in the same jit code buffer */
+               d = ((i0 + 8 - _jit->pc.w) >> 1) - 2;
+           else
+               d = ((i0 - _jit->pc.w) >> 1) - 2;
+       }
+       else                    d = ((i0 - _jit->pc.w) >> 2) - 2;
+       if (_s24P(d)) {
+           if (jit_thumb_p())  T2_BLI(encode_thumb_jump(d));
+           else                BLI(d & 0x00ffffff);
+       }
+       else                    goto fallback;
+    }
     else {
+    fallback:
        reg = jit_get_reg(jit_class_gpr);
        movi(rn(reg), i0);
        if (jit_thumb_p())
@@ -3734,28 +3985,44 @@ _calli(jit_state_t *_jit, jit_word_t i0)
 }
 
 static jit_word_t
-_calli_p(jit_state_t *_jit, jit_word_t i0)
+_calli_p(jit_state_t *_jit, jit_word_t i0, jit_bool_t i1)
 {
     jit_word_t         w;
+    jit_word_t         d;
     jit_int32_t                reg;
-    reg = jit_get_reg(jit_class_gpr);
-    w = _jit->pc.w;
-    movi_p(rn(reg), i0);
-    if (jit_thumb_p())
-       T1_BLX(rn(reg));
-    else
-       BLX(rn(reg));
-    jit_unget_reg(reg);
+    /* i1 means call is reachable in signed 24 bits  */
+    if (i1) {
+       w = _jit->pc.w;
+       if (jit_thumb_p())      d = ((i0 - _jit->pc.w) >> 1) - 2;
+       else                    d = ((i0 - _jit->pc.w) >> 2) - 2;
+       assert(_s24P(d));
+       if (jit_thumb_p())      T2_BLI(encode_thumb_jump(d));
+       else                    BLI(d & 0x00ffffff);
+    }
+    else {
+       reg = jit_get_reg(jit_class_gpr);
+       w = _jit->pc.w;
+       movi_p(rn(reg), i0);
+       if (jit_thumb_p())
+           T1_BLX(rn(reg));
+       else
+           BLX(rn(reg));
+       jit_unget_reg(reg);
+    }
     return (w);
 }
 
 static void
 _prolog(jit_state_t *_jit, jit_node_t *node)
 {
-    jit_int32_t                reg;
+    jit_int32_t                reg, mask, count;
     if (_jitc->function->define_frame || _jitc->function->assume_frame) {
        jit_int32_t     frame = -_jitc->function->frame;
+       jit_check_frame();
        assert(_jitc->function->self.aoff >= frame);
+       if (jit_swf_p())
+           CHECK_SWF_OFFSET();
+       CHECK_REG_ARGS();
        if (_jitc->function->assume_frame) {
            if (jit_thumb_p() && !_jitc->thumb)
                _jitc->thumb = _jit->pc.w;
@@ -3768,38 +4035,66 @@ _prolog(jit_state_t *_jit, jit_node_t *node)
     _jitc->function->stack = ((_jitc->function->self.alen -
                              /* align stack at 8 bytes */
                              _jitc->function->self.aoff) + 7) & -8;
+    /* If this jit_check_frame() succeeds, it actually is just a need_stack,
+     * usually for arguments, so, allocai was not called, but pusharg*
+     * was called increasing stack size, for negative access offsets.
+     * This can be optimized for one less prolog instruction, that is,
+     * do not create the frame pointer, and only add _jitc->function->stack
+     * to sp, and on epilog, instead of moving fp to sp, just add negative
+     * value of _jitc->function->stack. Since this condition requires a
+     * large function body for excess arguments to called function, keep
+     * things a bit simpler for now, as this is the only place need_stack
+     * would be useful. */
+    if (_jitc->function->stack)
+       jit_check_frame();
+
+    for (reg = mask = count = 0; reg < jit_size(iregs); reg++) {
+       if (jit_regset_tstbit(&_jitc->function->regset, iregs[reg])) {
+           mask |= 1 << rn(iregs[reg]);
+           ++count;
+       }
+    }
+    /* One extra register to keep stack 8 bytes aligned */
+    if (count & 1) {
+       for (reg = 4; reg < 10; reg++) {
+           if (!(mask & (1 << reg))) {
+               mask |= 1 << reg;
+               break;
+           }
+       }
+    }
+    if (_jitc->function->need_frame || _jitc->function->need_return)
+       mask |= (1 << _FP_REGNO) | (1 << _LR_REGNO);
+    if (!jit_swf_p() && _jitc->function->save_reg_args &&
+       !(_jitc->function->self.call & jit_call_varargs))
+       mask |= 0xf;
 
     if (jit_thumb_p()) {
        /*  switch to thumb mode (better approach would be to
         * ORR 1 address being called, but no clear distinction
         * of what is a pointer to a jit function, or if patching
         * a pointer to a jit function) */
-       ADDI(_R12_REGNO, _R15_REGNO, 1);
-       BX(_R12_REGNO);
+       if (jit_exchange_p()) {
+           ADDI(_R12_REGNO, _R15_REGNO, 1);
+           BX(_R12_REGNO);
+       }
        if (!_jitc->thumb)
            _jitc->thumb = _jit->pc.w;
-       if (jit_cpu.abi) {
-           T2_PUSH(0xf);
-           T2_PUSH(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO));
-           VPUSH_F64(_D8_REGNO, 8);
-       }
-       else {
+       if (jit_swf_p() || (_jitc->function->save_reg_args &&
+                           (_jitc->function->self.call & jit_call_varargs)))
            T2_PUSH(0xf);
-           T2_PUSH(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO));
-       }
+       if (mask)
+           T2_PUSH(mask);
     }
     else {
-       if (jit_cpu.abi) {
+       if (jit_swf_p() || (_jitc->function->save_reg_args &&
+                           (_jitc->function->self.call & jit_call_varargs)))
            PUSH(0xf);
-           PUSH(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO));
-           VPUSH_F64(_D8_REGNO, 8);
-       }
-       else {
-           PUSH(0xf);
-           PUSH(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO));
-       }
+       if (mask)
+           PUSH(mask);
     }
-    movr(_FP_REGNO, _SP_REGNO);
+    if (_jitc->function->need_frame)
+       movr(_FP_REGNO, _SP_REGNO);
     if (_jitc->function->stack)
        subi(_SP_REGNO, _SP_REGNO, _jitc->function->stack);
     if (_jitc->function->allocar) {
@@ -3813,17 +4108,41 @@ _prolog(jit_state_t *_jit, jit_node_t *node)
 static void
 _epilog(jit_state_t *_jit, jit_node_t *node)
 {
+    jit_int32_t                reg, mask, count;
     if (_jitc->function->assume_frame)
        return;
 
-    movr(_SP_REGNO, _FP_REGNO);
-    if (jit_cpu.abi)
-       VPOP_F64(_D8_REGNO, 8);
-    if (jit_thumb_p())
-       T2_POP(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO));
-    else
-       POP(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO));
-    addi(_SP_REGNO, _SP_REGNO, 16);
+    for (reg = mask = count = 0; reg < jit_size(iregs); reg++) {
+       if (jit_regset_tstbit(&_jitc->function->regset, iregs[reg])) {
+           mask |= 1 << rn(iregs[reg]);
+           ++count;
+       }
+    }
+    /* One extra register to keep stack 8 bytes aligned */
+    if (count & 1) {
+       for (reg = 4; reg < 10; reg++) {
+           if (!(mask & (1 << reg))) {
+               mask |= 1 << reg;
+               break;
+           }
+       }
+    }
+    if (_jitc->function->need_frame || _jitc->function->need_return)
+       mask |= (1 << _FP_REGNO) | (1 << _LR_REGNO);
+    if (_jitc->function->need_frame)
+       movr(_SP_REGNO, _FP_REGNO);
+    if (!jit_swf_p() && _jitc->function->save_reg_args &&
+       !(_jitc->function->self.call & jit_call_varargs))
+       addi(_SP_REGNO, _SP_REGNO, 16);
+    if (mask) {
+       if (jit_thumb_p())
+           T2_POP(mask);
+       else
+           POP(mask);
+    }
+    if (jit_swf_p() || (_jitc->function->save_reg_args &&
+                       (_jitc->function->self.call & jit_call_varargs)))
+       addi(_SP_REGNO, _SP_REGNO, 16);
     if (jit_thumb_p())
        T1_BX(_LR_REGNO);
     else
@@ -3841,8 +4160,7 @@ _vastart(jit_state_t *_jit, jit_int32_t r0)
      * The -16 is to account for the 4 argument registers
      * always saved, and _jitc->function->vagp is to account
      * for declared arguments. */
-    addi(r0, _FP_REGNO, _jitc->function->self.size -
-        16 + _jitc->function->vagp);
+    addi(r0, _FP_REGNO, jit_selfsize() - 16 + _jitc->function->vagp);
 }
 
 static void
@@ -3869,7 +4187,28 @@ _patch_at(jit_state_t *_jit,
        jit_word_t       w;
     } u;
     u.w = instr;
-    if (kind == arm_patch_jump) {
+    if (kind == arm_patch_call) {
+       if (jit_thumb_p() && (jit_uword_t)instr >= _jitc->thumb) {
+           code2thumb(thumb.s[0], thumb.s[1], u.s[0], u.s[1]);
+           assert((thumb.i & THUMB2_BLI) == THUMB2_BLI);
+           /* skip code to switch from arm to thumb mode */
+           if (jit_exchange_p())
+               d = ((label + 8 - instr) >> 1) - 2;
+           else
+               d = ((label - instr) >> 1) - 2;
+           assert(_s24P(d));
+           thumb.i = THUMB2_BLI | encode_thumb_jump(d);
+           thumb2code(thumb.s[0], thumb.s[1], u.s[0], u.s[1]);
+       }
+       else {
+           thumb.i = u.i[0];
+           assert((thumb.i & 0x0f000000) == ARM_BLI);
+           d = ((label - instr) >> 2) - 2;
+           assert(_s24P(d));
+           u.i[0] = (thumb.i & 0xff000000) | (d & 0x00ffffff);
+       }
+    }
+    else if (kind == arm_patch_jump) {
        if (jit_thumb_p() && (jit_uword_t)instr >= _jitc->thumb) {
            code2thumb(thumb.s[0], thumb.s[1], u.s[0], u.s[1]);
            if ((thumb.i & THUMB2_B) == THUMB2_B) {