git subrepo pull (merge) --force deps/lightning
[pcsx_rearmed.git] / deps / lightning / lib / jit_arm-cpu.c
index 91bb17c..a0852a2 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2012-2019  Free Software Foundation, Inc.
+ * Copyright (C) 2012-2023  Free Software Foundation, Inc.
  *
  * This file is part of GNU lightning.
  *
@@ -37,8 +37,7 @@
 #  define jit_armv5e_p()               (jit_cpu.version > 5 || (jit_cpu.version == 5 && jit_cpu.extend))
 #  define jit_armv6_p()                        (jit_cpu.version >= 6)
 #  define jit_armv7_p()                        (jit_cpu.version >= 7)
-#  define jit_armv7r_p()               0
-#  define stack_framesize              48
+#  define jit_armv7r_p()               (jit_cpu.version > 7 || (jit_cpu.version == 7 && jit_cpu.extend))
 extern int     __aeabi_idivmod(int, int);
 extern unsigned        __aeabi_uidivmod(unsigned, unsigned);
 #  define _R0_REGNO                    0x00
@@ -141,8 +140,12 @@ extern unsigned    __aeabi_uidivmod(unsigned, unsigned);
 #  define THUMB2_UMULL                 0xfba00000
 #  define ARM_SMULL                    0x00c00090
 #  define THUMB2_SMULL                 0xfb800000
+/* >> ARMv7r */
+#  define ARM_SDIV                     0x07100010
+#  define ARM_UDIV                     0x07300010
 #  define THUMB2_SDIV                  0xfb90f0f0
 #  define THUMB2_UDIV                  0xfbb0f0f0
+/* << ARMv7r */
 #  define ARM_AND                      0x00000000
 #  define THUMB_AND                        0x4000
 #  define THUMB2_AND                   0xea000000
@@ -185,6 +188,12 @@ extern unsigned    __aeabi_uidivmod(unsigned, unsigned);
 #  define ARM_STREX                    0x01800090
 #  define THUMB2_STREX                 0xe8400000
 /* << ARMv6* */
+/* >> ARMv6t2 */
+#  define THUMB2_CLZ                   0xfab0f080
+#  define THUMB2_RBIT                  0xfa90f0a0
+#  define ARM_RBIT                     0x06f00030
+/* << ARMv6t2 */
+#  define ARM_CLZ                      0x01600010
 /* >> ARMv7 */
 #  define ARM_DMB                      0xf57ff050
 #  define THUMB2_DMB                   0xf3bf8f50
@@ -447,6 +456,12 @@ static void _tdmb(jit_state_t *_jit, int im);
 #  define NOT(rd,rm)                   CC_NOT(ARM_CC_AL,rd,rm)
 #  define T1_NOT(rd,rm)                        T1_MVN(rd,rm)
 #  define T2_NOT(rd,rm)                        T2_MVN(rd,rm)
+#  define T2_CLZ(rd,rm)                        torrr(THUMB2_CLZ,rm,rd,rm)
+#  define CC_CLZ(cc,rd,rm)             corrrr(cc,ARM_CLZ,_R15_REGNO,rd,_R15_REGNO,rm)
+#  define CLZ(rd,rm)                   CC_CLZ(ARM_CC_AL,rd,rm)
+#  define T2_RBIT(rd,rm)               torrr(THUMB2_RBIT,rm,rd,rm)
+#  define CC_RBIT(cc,rd,rm)            corrrr(cc,ARM_RBIT,_R15_REGNO,rd,_R15_REGNO,rm)
+#  define RBIT(rd,rm)                  CC_RBIT(ARM_CC_AL,rd,rm)
 #  define NOP()                                MOV(_R0_REGNO, _R0_REGNO)
 #  define T1_NOP()                     is(0xbf00)
 #  define CC_ADD(cc,rd,rn,rm)          corrr(cc,ARM_ADD,rn,rd,rm)
@@ -524,6 +539,10 @@ static void _tdmb(jit_state_t *_jit, int im);
 #  define CC_UMULL(cc,rl,rh,rn,rm)     corrrr(cc,ARM_UMULL,rh,rl,rm,rn)
 #  define UMULL(rl,rh,rn,rm)           CC_UMULL(ARM_CC_AL,rl,rh,rn,rm)
 #  define T2_UMULL(rl,rh,rn,rm)                torrrr(THUMB2_UMULL,rn,rl,rh,rm)
+#  define CC_SDIV(cc,rd,rn,rm)         corrrr(cc,ARM_SDIV,rd,15,rn,rm)
+#  define SDIV(rd,rn,rm)               CC_SDIV(ARM_CC_AL,rd,rm,rn)
+#  define CC_UDIV(cc,rd,rn,rm)         corrrr(cc,ARM_UDIV,rd,15,rn,rm)
+#  define UDIV(rd,rn,rm)               CC_UDIV(ARM_CC_AL,rd,rm,rn)
 #  define T2_SDIV(rd,rn,rm)            torrr(THUMB2_SDIV,rn,rd,rm)
 #  define T2_UDIV(rd,rn,rm)            torrr(THUMB2_UDIV,rn,rd,rm)
 #  define CC_AND(cc,rd,rn,rm)          corrr(cc,ARM_AND,rn,rd,rm)
@@ -852,6 +871,8 @@ static void _tdmb(jit_state_t *_jit, int im);
 #  define T2_POP(im)                   tpp(THUMB2_POP,im)
 #  define jit_get_reg_args()                                           \
     do {                                                               \
+       CHECK_REG_ARGS();                                               \
+       jit_check_frame();                                              \
        (void)jit_get_reg(_R0|jit_class_named|jit_class_gpr);           \
        (void)jit_get_reg(_R1|jit_class_named|jit_class_gpr);           \
        (void)jit_get_reg(_R2|jit_class_named|jit_class_gpr);           \
@@ -885,6 +906,14 @@ static void _casx(jit_state_t *_jit,jit_int32_t,jit_int32_t,
 static void _comr(jit_state_t*,jit_int32_t,jit_int32_t);
 #  define negr(r0,r1)                  _negr(_jit,r0,r1)
 static void _negr(jit_state_t*,jit_int32_t,jit_int32_t);
+#  define clor(r0, r1)                 _clor(_jit, r0, r1)
+static void _clor(jit_state_t*, jit_int32_t, jit_int32_t);
+#  define clzr(r0, r1)                 _clzr(_jit, r0, r1)
+static void _clzr(jit_state_t*, jit_int32_t, jit_int32_t);
+#  define ctor(r0, r1)                 _ctor(_jit, r0, r1)
+static void _ctor(jit_state_t*, jit_int32_t, jit_int32_t);
+#  define ctzr(r0, r1)                 _ctzr(_jit, r0, r1)
+static void _ctzr(jit_state_t*, jit_int32_t, jit_int32_t);
 #  define addr(r0,r1,r2)               _addr(_jit,r0,r1,r2)
 static void _addr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
 #  define addi(r0,r1,i0)               _addi(_jit,r0,r1,i0)
@@ -1147,10 +1176,10 @@ static void _prolog(jit_state_t*,jit_node_t*);
 static void _epilog(jit_state_t*,jit_node_t*);
 #  define callr(r0)                    _callr(_jit,r0)
 static void _callr(jit_state_t*,jit_int32_t);
-#  define calli(i0)                    _calli(_jit,i0)
-static void _calli(jit_state_t*,jit_word_t);
-#  define calli_p(i0)                  _calli_p(_jit,i0)
-static jit_word_t _calli_p(jit_state_t*,jit_word_t);
+#  define calli(i0,i1)                 _calli(_jit,i0,i1)
+static void _calli(jit_state_t*,jit_word_t,jit_bool_t);
+#  define calli_p(i0,i1)               _calli_p(_jit,i0,i1)
+static jit_word_t _calli_p(jit_state_t*,jit_word_t,jit_bool_t);
 #  define vastart(r0)                  _vastart(_jit, r0)
 static void _vastart(jit_state_t*, jit_int32_t);
 #  define vaarg(r0, r1)                        _vaarg(_jit, r0, r1)
@@ -1526,7 +1555,7 @@ _tpp(jit_state_t *_jit, int o, int im)
     assert(!(o & 0x0000ffff));
     if (o == THUMB2_PUSH)
        assert(!(im & 0x8000));
-    assert(__builtin_popcount(im & 0x1fff) > 1);
+    assert(__builtin_popcount(im & 0x7fff) > 1);
     thumb.i = o|im;
     iss(thumb.s[0], thumb.s[1]);
 }
@@ -1679,13 +1708,14 @@ _casx(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
            /* retry: */
            retry = _jit->pc.w;
            T2_LDREX(r0, r1, 0);
-           jump0 = bner(_jit->pc.w, r0, r2);   /* bne done r0 r2 */
+           eqr(r0, r0, r2);
+           jump0 = beqi(_jit->pc.w, r0, 0);    /* beqi done r0 0 */
            T2_STREX(r0, r3, r1, 0);
            jump1 = bnei(_jit->pc.w, r0, 0);    /* bnei retry r0 0 */
-           /* done: */
-           done = _jit->pc.w;
            /* r0 = 0 if memory updated, 1 otherwise */
            xori(r0, r0, 1);
+           /* done: */
+           done = _jit->pc.w;
            T2_DMB(DMB_ISH);
        }
        else {
@@ -1693,13 +1723,14 @@ _casx(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
            /* retry: */
            retry = _jit->pc.w;
            LDREX(r0, r1);
-           jump0 = bner(_jit->pc.w, r0, r2);   /* bne done r0 r2 */
+           eqr(r0, r0, r2);
+           jump0 = beqi(_jit->pc.w, r0, 0);    /* beqi done r0 0 */
            STREX(r0, r3, r1);
            jump1 = bnei(_jit->pc.w, r0, 0);    /* bnei retry r0 0 */
-           /* done: */
-           done = _jit->pc.w;
            /* r0 = 0 if memory updated, 1 otherwise */
            xori(r0, r0, 1);
+           /* done: */
+           done = _jit->pc.w;
            DMB(DMB_ISH);
        }
        patch_at(arm_patch_jump, jump0, done);
@@ -1735,6 +1766,53 @@ _negr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
        RSBI(r0, r1, 0);
 }
 
+static void
+_clzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
+{
+    if (!jit_thumb_p() && jit_armv5e_p())
+       CLZ(r0, r1);
+    else if (jit_thumb_p() && jit_armv7_p()) { /* armv6t2 actually */
+       T2_CLZ(r0, r1);
+    }
+    else
+       fallback_clz(r0, r0);
+}
+
+static void
+_clor(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
+{
+    comr(r0, r1);
+    clzr(r0, r0);
+}
+
+static void
+_ctor(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
+{
+    if (jit_armv7_p()) {       /* armv6t2 actually */
+       if (jit_thumb_p())
+           T2_RBIT(r0, r1);
+       else
+           RBIT(r0, r1);
+       clor(r0, r0);
+    }
+    else
+       fallback_cto(r0, r1);
+}
+
+static void
+_ctzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
+{
+    if (jit_armv7_p()) {       /* armv6t2 actually */
+       if (jit_thumb_p())
+           T2_RBIT(r0, r1);
+       else
+           RBIT(r0, r1);
+       clzr(r0, r0);
+    }
+    else
+       fallback_ctz(r0, r1);
+}
+
 static void
 _addr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
 {
@@ -2230,8 +2308,12 @@ _divrem(jit_state_t *_jit, int div, int sign,
 static void
 _divr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
 {
-    if (jit_armv7r_p() && jit_thumb_p())
-       T2_SDIV(r0, r1, r2);
+    if (jit_armv7r_p()) {
+       if (jit_thumb_p())
+           T2_SDIV(r0, r1, r2);
+       else
+           SDIV(r0, r1, r2);
+    }
     else
        divrem(1, 1, r0, r1, r2);
 }
@@ -2249,8 +2331,12 @@ _divi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
 static void
 _divr_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
 {
-    if (jit_armv7r_p() && jit_thumb_p())
-       T2_UDIV(r0, r1, r2);
+    if (jit_armv7r_p()) {
+       if (jit_thumb_p())
+           T2_UDIV(r0, r1, r2);
+       else
+           UDIV(r0, r1, r2);
+    }
     else
        divrem(1, 0, r0, r1, r2);
 }
@@ -2310,7 +2396,23 @@ _iqdivi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
 static void
 _remr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
 {
-    divrem(0, 1, r0, r1, r2);
+    if (jit_armv7r_p()) {
+       jit_int32_t             reg;
+       if (r0 == r1 || r0 == r2) {
+           reg = jit_get_reg(jit_class_gpr);
+           divr(rn(reg), r1, r2);
+           mulr(rn(reg), r2, rn(reg));
+           subr(r0, r1, rn(reg));
+           jit_unget_reg(reg);
+       }
+       else {
+           divr(r0, r1, r2);
+           mulr(r0, r2, r0);
+           subr(r0, r1, r0);
+       }
+    }
+    else
+       divrem(0, 1, r0, r1, r2);
 }
 
 static void
@@ -2326,7 +2428,23 @@ _remi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
 static void
 _remr_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
 {
-    divrem(0, 0, r0, r1, r2);
+    if (jit_armv7r_p()) {
+       jit_int32_t             reg;
+       if (r0 == r1 || r0 == r2) {
+           reg = jit_get_reg(jit_class_gpr);
+           divr_u(rn(reg), r1, r2);
+           mulr(rn(reg), r2, rn(reg));
+           subr(r0, r1, rn(reg));
+           jit_unget_reg(reg);
+       }
+       else {
+           divr_u(r0, r1, r2);
+           mulr(r0, r2, r0);
+           subr(r0, r1, r0);
+       }
+    }
+    else
+       divrem(0, 0, r0, r1, r2);
 }
 
 static void
@@ -2739,8 +2857,8 @@ _jmpi_p(jit_state_t *_jit, jit_word_t i0, jit_bool_t i1)
     jit_word_t         w;
     jit_word_t         d;
     jit_int32_t                reg;
+    /* i1 means jump is reachable in signed 24 bits  */
     if (i1) {
-       /* Assume jump is not longer than 23 bits if inside jit */
        w = _jit->pc.w;
        /* if thumb and in thumb mode */
        if (jit_thumb_p() && _jitc->thumb) {
@@ -3833,14 +3951,29 @@ _callr(jit_state_t *_jit, jit_int32_t r0)
 }
 
 static void
-_calli(jit_state_t *_jit, jit_word_t i0)
+_calli(jit_state_t *_jit, jit_word_t i0, jit_bool_t exchange_p)
 {
     jit_word_t         d;
     jit_int32_t                reg;
-    d = ((i0 - _jit->pc.w) >> 2) - 2;
-    if (!jit_exchange_p() && !jit_thumb_p() && _s24P(d))
-       BLI(d & 0x00ffffff);
+    if (!exchange_p) {
+       if (jit_thumb_p()) {
+           if (jit_exchange_p())
+               /* skip switch from  arm to thumb 
+                * exchange_p set to zero means a jit function
+                * call in the same jit code buffer */
+               d = ((i0 + 8 - _jit->pc.w) >> 1) - 2;
+           else
+               d = ((i0 - _jit->pc.w) >> 1) - 2;
+       }
+       else                    d = ((i0 - _jit->pc.w) >> 2) - 2;
+       if (_s24P(d)) {
+           if (jit_thumb_p())  T2_BLI(encode_thumb_jump(d));
+           else                BLI(d & 0x00ffffff);
+       }
+       else                    goto fallback;
+    }
     else {
+    fallback:
        reg = jit_get_reg(jit_class_gpr);
        movi(rn(reg), i0);
        if (jit_thumb_p())
@@ -3852,28 +3985,44 @@ _calli(jit_state_t *_jit, jit_word_t i0)
 }
 
 static jit_word_t
-_calli_p(jit_state_t *_jit, jit_word_t i0)
+_calli_p(jit_state_t *_jit, jit_word_t i0, jit_bool_t i1)
 {
     jit_word_t         w;
+    jit_word_t         d;
     jit_int32_t                reg;
-    reg = jit_get_reg(jit_class_gpr);
-    w = _jit->pc.w;
-    movi_p(rn(reg), i0);
-    if (jit_thumb_p())
-       T1_BLX(rn(reg));
-    else
-       BLX(rn(reg));
-    jit_unget_reg(reg);
+    /* i1 means call is reachable in signed 24 bits  */
+    if (i1) {
+       w = _jit->pc.w;
+       if (jit_thumb_p())      d = ((i0 - _jit->pc.w) >> 1) - 2;
+       else                    d = ((i0 - _jit->pc.w) >> 2) - 2;
+       assert(_s24P(d));
+       if (jit_thumb_p())      T2_BLI(encode_thumb_jump(d));
+       else                    BLI(d & 0x00ffffff);
+    }
+    else {
+       reg = jit_get_reg(jit_class_gpr);
+       w = _jit->pc.w;
+       movi_p(rn(reg), i0);
+       if (jit_thumb_p())
+           T1_BLX(rn(reg));
+       else
+           BLX(rn(reg));
+       jit_unget_reg(reg);
+    }
     return (w);
 }
 
 static void
 _prolog(jit_state_t *_jit, jit_node_t *node)
 {
-    jit_int32_t                reg;
+    jit_int32_t                reg, mask, count;
     if (_jitc->function->define_frame || _jitc->function->assume_frame) {
        jit_int32_t     frame = -_jitc->function->frame;
+       jit_check_frame();
        assert(_jitc->function->self.aoff >= frame);
+       if (jit_swf_p())
+           CHECK_SWF_OFFSET();
+       CHECK_REG_ARGS();
        if (_jitc->function->assume_frame) {
            if (jit_thumb_p() && !_jitc->thumb)
                _jitc->thumb = _jit->pc.w;
@@ -3886,38 +4035,66 @@ _prolog(jit_state_t *_jit, jit_node_t *node)
     _jitc->function->stack = ((_jitc->function->self.alen -
                              /* align stack at 8 bytes */
                              _jitc->function->self.aoff) + 7) & -8;
+    /* If this jit_check_frame() succeeds, it actually is just a need_stack,
+     * usually for arguments, so, allocai was not called, but pusharg*
+     * was called increasing stack size, for negative access offsets.
+     * This can be optimized for one less prolog instruction, that is,
+     * do not create the frame pointer, and only add _jitc->function->stack
+     * to sp, and on epilog, instead of moving fp to sp, just add negative
+     * value of _jitc->function->stack. Since this condition requires a
+     * large function body for excess arguments to called function, keep
+     * things a bit simpler for now, as this is the only place need_stack
+     * would be useful. */
+    if (_jitc->function->stack)
+       jit_check_frame();
+
+    for (reg = mask = count = 0; reg < jit_size(iregs); reg++) {
+       if (jit_regset_tstbit(&_jitc->function->regset, iregs[reg])) {
+           mask |= 1 << rn(iregs[reg]);
+           ++count;
+       }
+    }
+    /* One extra register to keep stack 8 bytes aligned */
+    if (count & 1) {
+       for (reg = 4; reg < 10; reg++) {
+           if (!(mask & (1 << reg))) {
+               mask |= 1 << reg;
+               break;
+           }
+       }
+    }
+    if (_jitc->function->need_frame || _jitc->function->need_return)
+       mask |= (1 << _FP_REGNO) | (1 << _LR_REGNO);
+    if (!jit_swf_p() && _jitc->function->save_reg_args &&
+       !(_jitc->function->self.call & jit_call_varargs))
+       mask |= 0xf;
 
     if (jit_thumb_p()) {
        /*  switch to thumb mode (better approach would be to
         * ORR 1 address being called, but no clear distinction
         * of what is a pointer to a jit function, or if patching
         * a pointer to a jit function) */
-       ADDI(_R12_REGNO, _R15_REGNO, 1);
-       BX(_R12_REGNO);
+       if (jit_exchange_p()) {
+           ADDI(_R12_REGNO, _R15_REGNO, 1);
+           BX(_R12_REGNO);
+       }
        if (!_jitc->thumb)
            _jitc->thumb = _jit->pc.w;
-       if (jit_cpu.abi) {
-           T2_PUSH(0xf);
-           T2_PUSH(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO));
-           VPUSH_F64(_D8_REGNO, 8);
-       }
-       else {
+       if (jit_swf_p() || (_jitc->function->save_reg_args &&
+                           (_jitc->function->self.call & jit_call_varargs)))
            T2_PUSH(0xf);
-           T2_PUSH(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO));
-       }
+       if (mask)
+           T2_PUSH(mask);
     }
     else {
-       if (jit_cpu.abi) {
-           PUSH(0xf);
-           PUSH(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO));
-           VPUSH_F64(_D8_REGNO, 8);
-       }
-       else {
+       if (jit_swf_p() || (_jitc->function->save_reg_args &&
+                           (_jitc->function->self.call & jit_call_varargs)))
            PUSH(0xf);
-           PUSH(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO));
-       }
+       if (mask)
+           PUSH(mask);
     }
-    movr(_FP_REGNO, _SP_REGNO);
+    if (_jitc->function->need_frame)
+       movr(_FP_REGNO, _SP_REGNO);
     if (_jitc->function->stack)
        subi(_SP_REGNO, _SP_REGNO, _jitc->function->stack);
     if (_jitc->function->allocar) {
@@ -3931,17 +4108,41 @@ _prolog(jit_state_t *_jit, jit_node_t *node)
 static void
 _epilog(jit_state_t *_jit, jit_node_t *node)
 {
+    jit_int32_t                reg, mask, count;
     if (_jitc->function->assume_frame)
        return;
 
-    movr(_SP_REGNO, _FP_REGNO);
-    if (jit_cpu.abi)
-       VPOP_F64(_D8_REGNO, 8);
-    if (jit_thumb_p())
-       T2_POP(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO));
-    else
-       POP(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO));
-    addi(_SP_REGNO, _SP_REGNO, 16);
+    for (reg = mask = count = 0; reg < jit_size(iregs); reg++) {
+       if (jit_regset_tstbit(&_jitc->function->regset, iregs[reg])) {
+           mask |= 1 << rn(iregs[reg]);
+           ++count;
+       }
+    }
+    /* One extra register to keep stack 8 bytes aligned */
+    if (count & 1) {
+       for (reg = 4; reg < 10; reg++) {
+           if (!(mask & (1 << reg))) {
+               mask |= 1 << reg;
+               break;
+           }
+       }
+    }
+    if (_jitc->function->need_frame || _jitc->function->need_return)
+       mask |= (1 << _FP_REGNO) | (1 << _LR_REGNO);
+    if (_jitc->function->need_frame)
+       movr(_SP_REGNO, _FP_REGNO);
+    if (!jit_swf_p() && _jitc->function->save_reg_args &&
+       !(_jitc->function->self.call & jit_call_varargs))
+       addi(_SP_REGNO, _SP_REGNO, 16);
+    if (mask) {
+       if (jit_thumb_p())
+           T2_POP(mask);
+       else
+           POP(mask);
+    }
+    if (jit_swf_p() || (_jitc->function->save_reg_args &&
+                       (_jitc->function->self.call & jit_call_varargs)))
+       addi(_SP_REGNO, _SP_REGNO, 16);
     if (jit_thumb_p())
        T1_BX(_LR_REGNO);
     else
@@ -3959,8 +4160,7 @@ _vastart(jit_state_t *_jit, jit_int32_t r0)
      * The -16 is to account for the 4 argument registers
      * always saved, and _jitc->function->vagp is to account
      * for declared arguments. */
-    addi(r0, _FP_REGNO, _jitc->function->self.size -
-        16 + _jitc->function->vagp);
+    addi(r0, _FP_REGNO, jit_selfsize() - 16 + _jitc->function->vagp);
 }
 
 static void
@@ -3987,7 +4187,28 @@ _patch_at(jit_state_t *_jit,
        jit_word_t       w;
     } u;
     u.w = instr;
-    if (kind == arm_patch_jump) {
+    if (kind == arm_patch_call) {
+       if (jit_thumb_p() && (jit_uword_t)instr >= _jitc->thumb) {
+           code2thumb(thumb.s[0], thumb.s[1], u.s[0], u.s[1]);
+           assert((thumb.i & THUMB2_BLI) == THUMB2_BLI);
+           /* skip code to switch from arm to thumb mode */
+           if (jit_exchange_p())
+               d = ((label + 8 - instr) >> 1) - 2;
+           else
+               d = ((label - instr) >> 1) - 2;
+           assert(_s24P(d));
+           thumb.i = THUMB2_BLI | encode_thumb_jump(d);
+           thumb2code(thumb.s[0], thumb.s[1], u.s[0], u.s[1]);
+       }
+       else {
+           thumb.i = u.i[0];
+           assert((thumb.i & 0x0f000000) == ARM_BLI);
+           d = ((label - instr) >> 2) - 2;
+           assert(_s24P(d));
+           u.i[0] = (thumb.i & 0xff000000) | (d & 0x00ffffff);
+       }
+    }
+    else if (kind == arm_patch_jump) {
        if (jit_thumb_p() && (jit_uword_t)instr >= _jitc->thumb) {
            code2thumb(thumb.s[0], thumb.s[1], u.s[0], u.s[1]);
            if ((thumb.i & THUMB2_B) == THUMB2_B) {