X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=deps%2Flightning%2Flib%2Fjit_arm-cpu.c;h=a0852a2cd55c8716487ec55276f1fc1c016c4b44;hb=79bfeef6160be4b228a7998ac2b43cd83d882532;hp=14ba36bb98ccf1149f77a39df38b659a791c06e6;hpb=437b1e617808119c3a24a72c77cd2fa86a5d3220;p=pcsx_rearmed.git diff --git a/deps/lightning/lib/jit_arm-cpu.c b/deps/lightning/lib/jit_arm-cpu.c index 14ba36bb..a0852a2c 100644 --- a/deps/lightning/lib/jit_arm-cpu.c +++ b/deps/lightning/lib/jit_arm-cpu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2019 Free Software Foundation, Inc. + * Copyright (C) 2012-2023 Free Software Foundation, Inc. * * This file is part of GNU lightning. * @@ -36,8 +36,8 @@ # define jit_armv5_p() (jit_cpu.version >= 5) # define jit_armv5e_p() (jit_cpu.version > 5 || (jit_cpu.version == 5 && jit_cpu.extend)) # define jit_armv6_p() (jit_cpu.version >= 6) -# define jit_armv7r_p() 0 -# define stack_framesize 48 +# define jit_armv7_p() (jit_cpu.version >= 7) +# define jit_armv7r_p() (jit_cpu.version > 7 || (jit_cpu.version == 7 && jit_cpu.extend)) extern int __aeabi_idivmod(int, int); extern unsigned __aeabi_uidivmod(unsigned, unsigned); # define _R0_REGNO 0x00 @@ -140,8 +140,12 @@ extern unsigned __aeabi_uidivmod(unsigned, unsigned); # define THUMB2_UMULL 0xfba00000 # define ARM_SMULL 0x00c00090 # define THUMB2_SMULL 0xfb800000 +/* >> ARMv7r */ +# define ARM_SDIV 0x07100010 +# define ARM_UDIV 0x07300010 # define THUMB2_SDIV 0xfb90f0f0 # define THUMB2_UDIV 0xfbb0f0f0 +/* << ARMv7r */ # define ARM_AND 0x00000000 # define THUMB_AND 0x4000 # define THUMB2_AND 0xea000000 @@ -179,7 +183,29 @@ extern unsigned __aeabi_uidivmod(unsigned, unsigned); # define ARM_XTR8 0x00000400 /* ?xt? rotate 8 bits */ # define ARM_XTR16 0x00000800 /* ?xt? rotate 16 bits */ # define ARM_XTR24 0x00000c00 /* ?xt? rotate 24 bits */ +# define ARM_LDREX 0x01900090 +# define THUMB2_LDREX 0xe8500000 +# define ARM_STREX 0x01800090 +# define THUMB2_STREX 0xe8400000 /* << ARMv6* */ +/* >> ARMv6t2 */ +# define THUMB2_CLZ 0xfab0f080 +# define THUMB2_RBIT 0xfa90f0a0 +# define ARM_RBIT 0x06f00030 +/* << ARMv6t2 */ +# define ARM_CLZ 0x01600010 +/* >> ARMv7 */ +# define ARM_DMB 0xf57ff050 +# define THUMB2_DMB 0xf3bf8f50 +# define DMB_SY 0xf +# define DMB_ST 0xe +# define DMB_ISH 0xb +# define DMB_ISHST 0xa +# define DMB_NSH 0x7 +# define DMB_NSHT 0x6 +# define DMB_OSH 0x3 +# define DMB_OSHST 0x2 +/* << ARMv7 */ # define ARM_SHIFT 0x01a00000 # define ARM_R 0x00000010 /* register shift */ # define ARM_LSL 0x00000000 @@ -399,6 +425,12 @@ static void _tcit(jit_state_t*,unsigned int,int); static void _tpp(jit_state_t*,int,int); # define torl(o,rn,im) _torl(_jit,o,rn,im) static void _torl(jit_state_t*,int,int,int) maybe_unused; +# define DMB(im) dmb(im) +# define T2_DMB(im) tdmb(im) +# define dmb(im) _dmb(_jit, im) +static void _dmb(jit_state_t *_jit, int im); +# define tdmb(im) _tdmb(_jit, im) +static void _tdmb(jit_state_t *_jit, int im); # define CC_MOV(cc,rd,rm) corrr(cc,ARM_MOV,0,rd,rm) # define MOV(rd,rm) CC_MOV(ARM_CC_AL,rd,rm) # define T1_MOV(rd,rm) is(THUMB_MOV|((_u4(rd)&8)<<4)|(_u4(rm)<<3)|(rd&7)) @@ -424,6 +456,12 @@ static void _torl(jit_state_t*,int,int,int) maybe_unused; # define NOT(rd,rm) CC_NOT(ARM_CC_AL,rd,rm) # define T1_NOT(rd,rm) T1_MVN(rd,rm) # define T2_NOT(rd,rm) T2_MVN(rd,rm) +# define T2_CLZ(rd,rm) torrr(THUMB2_CLZ,rm,rd,rm) +# define CC_CLZ(cc,rd,rm) corrrr(cc,ARM_CLZ,_R15_REGNO,rd,_R15_REGNO,rm) +# define CLZ(rd,rm) CC_CLZ(ARM_CC_AL,rd,rm) +# define T2_RBIT(rd,rm) torrr(THUMB2_RBIT,rm,rd,rm) +# define CC_RBIT(cc,rd,rm) corrrr(cc,ARM_RBIT,_R15_REGNO,rd,_R15_REGNO,rm) +# define RBIT(rd,rm) CC_RBIT(ARM_CC_AL,rd,rm) # define NOP() MOV(_R0_REGNO, _R0_REGNO) # define T1_NOP() is(0xbf00) # define CC_ADD(cc,rd,rn,rm) corrr(cc,ARM_ADD,rn,rd,rm) @@ -501,6 +539,10 @@ static void _torl(jit_state_t*,int,int,int) maybe_unused; # define CC_UMULL(cc,rl,rh,rn,rm) corrrr(cc,ARM_UMULL,rh,rl,rm,rn) # define UMULL(rl,rh,rn,rm) CC_UMULL(ARM_CC_AL,rl,rh,rn,rm) # define T2_UMULL(rl,rh,rn,rm) torrrr(THUMB2_UMULL,rn,rl,rh,rm) +# define CC_SDIV(cc,rd,rn,rm) corrrr(cc,ARM_SDIV,rd,15,rn,rm) +# define SDIV(rd,rn,rm) CC_SDIV(ARM_CC_AL,rd,rm,rn) +# define CC_UDIV(cc,rd,rn,rm) corrrr(cc,ARM_UDIV,rd,15,rn,rm) +# define UDIV(rd,rn,rm) CC_UDIV(ARM_CC_AL,rd,rm,rn) # define T2_SDIV(rd,rn,rm) torrr(THUMB2_SDIV,rn,rd,rm) # define T2_UDIV(rd,rn,rm) torrr(THUMB2_UDIV,rn,rd,rm) # define CC_AND(cc,rd,rn,rm) corrr(cc,ARM_AND,rn,rd,rm) @@ -718,6 +760,9 @@ static void _torl(jit_state_t*,int,int,int) maybe_unused; # define CC_LDRDIN(cc,rt,rn,im) corri8(cc,ARM_LDRDI,rn,rt,im) # define LDRDIN(rt,rn,im) CC_LDRDIN(ARM_CC_AL,rt,rn,im) # define T2_LDRDIN(rt,rt2,rn,im) torrri8(THUMB2_LDRDI,rn,rt,rt2,im) +# define CC_LDREX(cc,rt,rn) corrrr(cc,ARM_LDREX,rn,rt,0xf,0xf) +# define LDREX(rt,rn) CC_LDREX(ARM_CC_AL,rt,rn) +# define T2_LDREX(rt,rn,im) torrri8(THUMB2_LDREX,rn,rt,0xf,im) # define CC_STRB(cc,rt,rn,rm) corrr(cc,ARM_STRB|ARM_P,rn,rt,rm) # define STRB(rt,rn,rm) CC_STRB(ARM_CC_AL,rt,rn,rm) # define T1_STRB(rt,rn,rm) is(THUMB_STRB|(_u3(rm)<<6)|(_u3(rn)<<3)|_u3(rt)) @@ -771,6 +816,9 @@ static void _torl(jit_state_t*,int,int,int) maybe_unused; # define CC_STRDIN(cc,rt,rn,im) corri8(cc,ARM_STRDI,rn,rt,im) # define STRDIN(rt,rn,im) CC_STRDIN(ARM_CC_AL,rt,rn,im) # define T2_STRDIN(rt,rt2,rn,im) torrri8(THUMB2_STRDI,rn,rt,rt2,im) +# define CC_STREX(cc,rd,rt,rn) corrrr(cc,ARM_STREX,rn,rd,0xf,rt) +# define STREX(rd,rt,rn) CC_STREX(ARM_CC_AL,rd,rt,rn) +# define T2_STREX(rd,rt,rn,im) torrri8(THUMB2_STREX,rn,rt,rd,im) # define CC_LDMIA(cc,rn,im) corl(cc,ARM_M|ARM_M_L|ARM_M_I,rn,im) # define LDMIA(rn,im) CC_LDMIA(ARM_CC_AL,rn,im) # define CC_LDM(cc,rn,im) CC_LDMIA(cc,rn,im) @@ -823,6 +871,8 @@ static void _torl(jit_state_t*,int,int,int) maybe_unused; # define T2_POP(im) tpp(THUMB2_POP,im) # define jit_get_reg_args() \ do { \ + CHECK_REG_ARGS(); \ + jit_check_frame(); \ (void)jit_get_reg(_R0|jit_class_named|jit_class_gpr); \ (void)jit_get_reg(_R1|jit_class_named|jit_class_gpr); \ (void)jit_get_reg(_R2|jit_class_named|jit_class_gpr); \ @@ -847,10 +897,23 @@ static jit_word_t _movi_p(jit_state_t*,jit_int32_t,jit_word_t); static void _movnr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t); # define movzr(r0,r1,r2) _movzr(_jit,r0,r1,r2) static void _movzr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t); +# define casx(r0, r1, r2, r3, i0) _casx(_jit, r0, r1, r2, r3, i0) +static void _casx(jit_state_t *_jit,jit_int32_t,jit_int32_t, + jit_int32_t,jit_int32_t,jit_word_t); +#define casr(r0, r1, r2, r3) casx(r0, r1, r2, r3, 0) +#define casi(r0, i0, r1, r2) casx(r0, _NOREG, r1, r2, i0) # define comr(r0,r1) _comr(_jit,r0,r1) static void _comr(jit_state_t*,jit_int32_t,jit_int32_t); # define negr(r0,r1) _negr(_jit,r0,r1) static void _negr(jit_state_t*,jit_int32_t,jit_int32_t); +# define clor(r0, r1) _clor(_jit, r0, r1) +static void _clor(jit_state_t*, jit_int32_t, jit_int32_t); +# define clzr(r0, r1) _clzr(_jit, r0, r1) +static void _clzr(jit_state_t*, jit_int32_t, jit_int32_t); +# define ctor(r0, r1) _ctor(_jit, r0, r1) +static void _ctor(jit_state_t*, jit_int32_t, jit_int32_t); +# define ctzr(r0, r1) _ctzr(_jit, r0, r1) +static void _ctzr(jit_state_t*, jit_int32_t, jit_int32_t); # define addr(r0,r1,r2) _addr(_jit,r0,r1,r2) static void _addr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t); # define addi(r0,r1,i0) _addi(_jit,r0,r1,i0) @@ -1113,10 +1176,10 @@ static void _prolog(jit_state_t*,jit_node_t*); static void _epilog(jit_state_t*,jit_node_t*); # define callr(r0) _callr(_jit,r0) static void _callr(jit_state_t*,jit_int32_t); -# define calli(i0) _calli(_jit,i0) -static void _calli(jit_state_t*,jit_word_t); -# define calli_p(i0) _calli_p(_jit,i0) -static jit_word_t _calli_p(jit_state_t*,jit_word_t); +# define calli(i0,i1) _calli(_jit,i0,i1) +static void _calli(jit_state_t*,jit_word_t,jit_bool_t); +# define calli_p(i0,i1) _calli_p(_jit,i0,i1) +static jit_word_t _calli_p(jit_state_t*,jit_word_t,jit_bool_t); # define vastart(r0) _vastart(_jit, r0) static void _vastart(jit_state_t*, jit_int32_t); # define vaarg(r0, r1) _vaarg(_jit, r0, r1) @@ -1492,7 +1555,7 @@ _tpp(jit_state_t *_jit, int o, int im) assert(!(o & 0x0000ffff)); if (o == THUMB2_PUSH) assert(!(im & 0x8000)); - assert(__builtin_popcount(im & 0x1fff) > 1); + assert(__builtin_popcount(im & 0x7fff) > 1); thumb.i = o|im; iss(thumb.s[0], thumb.s[1]); } @@ -1508,6 +1571,22 @@ _torl(jit_state_t *_jit, int o, int rn, int im) iss(thumb.s[0], thumb.s[1]); } +static void +_dmb(jit_state_t *_jit, int im) +{ + assert(!(im & 0xfffffff0)); + ii(ARM_DMB|im); +} + +static void +_tdmb(jit_state_t *_jit, int im) +{ + jit_thumb_t thumb; + assert(!(im & 0xfffffff0)); + thumb.i = THUMB2_DMB | im; + iss(thumb.s[0], thumb.s[1]); +} + static void _nop(jit_state_t *_jit, jit_int32_t i0) { @@ -1610,6 +1689,57 @@ _movzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) _movznr(_jit, ARM_CC_EQ, r0, r1, r2); } +static void +_casx(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, + jit_int32_t r2, jit_int32_t r3, jit_word_t i0) +{ + jit_int32_t r1_reg, iscasi; + jit_word_t retry, done, jump0, jump1; + if (!jit_armv7_p()) + fallback_casx(r0, r1, r2, r3, i0); + else { + if ((iscasi = (r1 == _NOREG))) { + r1_reg = jit_get_reg(jit_class_gpr); + r1 = rn(r1_reg); + movi(r1, i0); + } + if (jit_thumb_p()) { + T2_DMB(DMB_ISH); + /* retry: */ + retry = _jit->pc.w; + T2_LDREX(r0, r1, 0); + eqr(r0, r0, r2); + jump0 = beqi(_jit->pc.w, r0, 0); /* beqi done r0 0 */ + T2_STREX(r0, r3, r1, 0); + jump1 = bnei(_jit->pc.w, r0, 0); /* bnei retry r0 0 */ + /* r0 = 0 if memory updated, 1 otherwise */ + xori(r0, r0, 1); + /* done: */ + done = _jit->pc.w; + T2_DMB(DMB_ISH); + } + else { + DMB(DMB_ISH); + /* retry: */ + retry = _jit->pc.w; + LDREX(r0, r1); + eqr(r0, r0, r2); + jump0 = beqi(_jit->pc.w, r0, 0); /* beqi done r0 0 */ + STREX(r0, r3, r1); + jump1 = bnei(_jit->pc.w, r0, 0); /* bnei retry r0 0 */ + /* r0 = 0 if memory updated, 1 otherwise */ + xori(r0, r0, 1); + /* done: */ + done = _jit->pc.w; + DMB(DMB_ISH); + } + patch_at(arm_patch_jump, jump0, done); + patch_at(arm_patch_jump, jump1, retry); + if (iscasi) + jit_unget_reg(r1_reg); + } +} + static void _comr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) { @@ -1636,6 +1766,53 @@ _negr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) RSBI(r0, r1, 0); } +static void +_clzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + if (!jit_thumb_p() && jit_armv5e_p()) + CLZ(r0, r1); + else if (jit_thumb_p() && jit_armv7_p()) { /* armv6t2 actually */ + T2_CLZ(r0, r1); + } + else + fallback_clz(r0, r0); +} + +static void +_clor(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + comr(r0, r1); + clzr(r0, r0); +} + +static void +_ctor(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + if (jit_armv7_p()) { /* armv6t2 actually */ + if (jit_thumb_p()) + T2_RBIT(r0, r1); + else + RBIT(r0, r1); + clor(r0, r0); + } + else + fallback_cto(r0, r1); +} + +static void +_ctzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + if (jit_armv7_p()) { /* armv6t2 actually */ + if (jit_thumb_p()) + T2_RBIT(r0, r1); + else + RBIT(r0, r1); + clzr(r0, r0); + } + else + fallback_ctz(r0, r1); +} + static void _addr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) { @@ -2131,8 +2308,12 @@ _divrem(jit_state_t *_jit, int div, int sign, static void _divr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) { - if (jit_armv7r_p() && jit_thumb_p()) - T2_SDIV(r0, r1, r2); + if (jit_armv7r_p()) { + if (jit_thumb_p()) + T2_SDIV(r0, r1, r2); + else + SDIV(r0, r1, r2); + } else divrem(1, 1, r0, r1, r2); } @@ -2150,8 +2331,12 @@ _divi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) static void _divr_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) { - if (jit_armv7r_p() && jit_thumb_p()) - T2_UDIV(r0, r1, r2); + if (jit_armv7r_p()) { + if (jit_thumb_p()) + T2_UDIV(r0, r1, r2); + else + UDIV(r0, r1, r2); + } else divrem(1, 0, r0, r1, r2); } @@ -2211,7 +2396,23 @@ _iqdivi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, static void _remr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) { - divrem(0, 1, r0, r1, r2); + if (jit_armv7r_p()) { + jit_int32_t reg; + if (r0 == r1 || r0 == r2) { + reg = jit_get_reg(jit_class_gpr); + divr(rn(reg), r1, r2); + mulr(rn(reg), r2, rn(reg)); + subr(r0, r1, rn(reg)); + jit_unget_reg(reg); + } + else { + divr(r0, r1, r2); + mulr(r0, r2, r0); + subr(r0, r1, r0); + } + } + else + divrem(0, 1, r0, r1, r2); } static void @@ -2227,7 +2428,23 @@ _remi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) static void _remr_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) { - divrem(0, 0, r0, r1, r2); + if (jit_armv7r_p()) { + jit_int32_t reg; + if (r0 == r1 || r0 == r2) { + reg = jit_get_reg(jit_class_gpr); + divr_u(rn(reg), r1, r2); + mulr(rn(reg), r2, rn(reg)); + subr(r0, r1, rn(reg)); + jit_unget_reg(reg); + } + else { + divr_u(r0, r1, r2); + mulr(r0, r2, r0); + subr(r0, r1, r0); + } + } + else + divrem(0, 0, r0, r1, r2); } static void @@ -2640,8 +2857,8 @@ _jmpi_p(jit_state_t *_jit, jit_word_t i0, jit_bool_t i1) jit_word_t w; jit_word_t d; jit_int32_t reg; + /* i1 means jump is reachable in signed 24 bits */ if (i1) { - /* Assume jump is not longer than 23 bits if inside jit */ w = _jit->pc.w; /* if thumb and in thumb mode */ if (jit_thumb_p() && _jitc->thumb) { @@ -3734,14 +3951,29 @@ _callr(jit_state_t *_jit, jit_int32_t r0) } static void -_calli(jit_state_t *_jit, jit_word_t i0) +_calli(jit_state_t *_jit, jit_word_t i0, jit_bool_t exchange_p) { jit_word_t d; jit_int32_t reg; - d = ((i0 - _jit->pc.w) >> 2) - 2; - if (!jit_exchange_p() && !jit_thumb_p() && _s24P(d)) - BLI(d & 0x00ffffff); + if (!exchange_p) { + if (jit_thumb_p()) { + if (jit_exchange_p()) + /* skip switch from arm to thumb + * exchange_p set to zero means a jit function + * call in the same jit code buffer */ + d = ((i0 + 8 - _jit->pc.w) >> 1) - 2; + else + d = ((i0 - _jit->pc.w) >> 1) - 2; + } + else d = ((i0 - _jit->pc.w) >> 2) - 2; + if (_s24P(d)) { + if (jit_thumb_p()) T2_BLI(encode_thumb_jump(d)); + else BLI(d & 0x00ffffff); + } + else goto fallback; + } else { + fallback: reg = jit_get_reg(jit_class_gpr); movi(rn(reg), i0); if (jit_thumb_p()) @@ -3753,28 +3985,44 @@ _calli(jit_state_t *_jit, jit_word_t i0) } static jit_word_t -_calli_p(jit_state_t *_jit, jit_word_t i0) +_calli_p(jit_state_t *_jit, jit_word_t i0, jit_bool_t i1) { jit_word_t w; + jit_word_t d; jit_int32_t reg; - reg = jit_get_reg(jit_class_gpr); - w = _jit->pc.w; - movi_p(rn(reg), i0); - if (jit_thumb_p()) - T1_BLX(rn(reg)); - else - BLX(rn(reg)); - jit_unget_reg(reg); + /* i1 means call is reachable in signed 24 bits */ + if (i1) { + w = _jit->pc.w; + if (jit_thumb_p()) d = ((i0 - _jit->pc.w) >> 1) - 2; + else d = ((i0 - _jit->pc.w) >> 2) - 2; + assert(_s24P(d)); + if (jit_thumb_p()) T2_BLI(encode_thumb_jump(d)); + else BLI(d & 0x00ffffff); + } + else { + reg = jit_get_reg(jit_class_gpr); + w = _jit->pc.w; + movi_p(rn(reg), i0); + if (jit_thumb_p()) + T1_BLX(rn(reg)); + else + BLX(rn(reg)); + jit_unget_reg(reg); + } return (w); } static void _prolog(jit_state_t *_jit, jit_node_t *node) { - jit_int32_t reg; + jit_int32_t reg, mask, count; if (_jitc->function->define_frame || _jitc->function->assume_frame) { jit_int32_t frame = -_jitc->function->frame; + jit_check_frame(); assert(_jitc->function->self.aoff >= frame); + if (jit_swf_p()) + CHECK_SWF_OFFSET(); + CHECK_REG_ARGS(); if (_jitc->function->assume_frame) { if (jit_thumb_p() && !_jitc->thumb) _jitc->thumb = _jit->pc.w; @@ -3787,38 +4035,66 @@ _prolog(jit_state_t *_jit, jit_node_t *node) _jitc->function->stack = ((_jitc->function->self.alen - /* align stack at 8 bytes */ _jitc->function->self.aoff) + 7) & -8; + /* If this jit_check_frame() succeeds, it actually is just a need_stack, + * usually for arguments, so, allocai was not called, but pusharg* + * was called increasing stack size, for negative access offsets. + * This can be optimized for one less prolog instruction, that is, + * do not create the frame pointer, and only add _jitc->function->stack + * to sp, and on epilog, instead of moving fp to sp, just add negative + * value of _jitc->function->stack. Since this condition requires a + * large function body for excess arguments to called function, keep + * things a bit simpler for now, as this is the only place need_stack + * would be useful. */ + if (_jitc->function->stack) + jit_check_frame(); + + for (reg = mask = count = 0; reg < jit_size(iregs); reg++) { + if (jit_regset_tstbit(&_jitc->function->regset, iregs[reg])) { + mask |= 1 << rn(iregs[reg]); + ++count; + } + } + /* One extra register to keep stack 8 bytes aligned */ + if (count & 1) { + for (reg = 4; reg < 10; reg++) { + if (!(mask & (1 << reg))) { + mask |= 1 << reg; + break; + } + } + } + if (_jitc->function->need_frame || _jitc->function->need_return) + mask |= (1 << _FP_REGNO) | (1 << _LR_REGNO); + if (!jit_swf_p() && _jitc->function->save_reg_args && + !(_jitc->function->self.call & jit_call_varargs)) + mask |= 0xf; if (jit_thumb_p()) { /* switch to thumb mode (better approach would be to * ORR 1 address being called, but no clear distinction * of what is a pointer to a jit function, or if patching * a pointer to a jit function) */ - ADDI(_R12_REGNO, _R15_REGNO, 1); - BX(_R12_REGNO); + if (jit_exchange_p()) { + ADDI(_R12_REGNO, _R15_REGNO, 1); + BX(_R12_REGNO); + } if (!_jitc->thumb) _jitc->thumb = _jit->pc.w; - if (jit_cpu.abi) { - T2_PUSH(0xf); - T2_PUSH(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO)); - VPUSH_F64(_D8_REGNO, 8); - } - else { + if (jit_swf_p() || (_jitc->function->save_reg_args && + (_jitc->function->self.call & jit_call_varargs))) T2_PUSH(0xf); - T2_PUSH(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO)); - } + if (mask) + T2_PUSH(mask); } else { - if (jit_cpu.abi) { - PUSH(0xf); - PUSH(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO)); - VPUSH_F64(_D8_REGNO, 8); - } - else { + if (jit_swf_p() || (_jitc->function->save_reg_args && + (_jitc->function->self.call & jit_call_varargs))) PUSH(0xf); - PUSH(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO)); - } + if (mask) + PUSH(mask); } - movr(_FP_REGNO, _SP_REGNO); + if (_jitc->function->need_frame) + movr(_FP_REGNO, _SP_REGNO); if (_jitc->function->stack) subi(_SP_REGNO, _SP_REGNO, _jitc->function->stack); if (_jitc->function->allocar) { @@ -3832,17 +4108,41 @@ _prolog(jit_state_t *_jit, jit_node_t *node) static void _epilog(jit_state_t *_jit, jit_node_t *node) { + jit_int32_t reg, mask, count; if (_jitc->function->assume_frame) return; - movr(_SP_REGNO, _FP_REGNO); - if (jit_cpu.abi) - VPOP_F64(_D8_REGNO, 8); - if (jit_thumb_p()) - T2_POP(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO)); - else - POP(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO)); - addi(_SP_REGNO, _SP_REGNO, 16); + for (reg = mask = count = 0; reg < jit_size(iregs); reg++) { + if (jit_regset_tstbit(&_jitc->function->regset, iregs[reg])) { + mask |= 1 << rn(iregs[reg]); + ++count; + } + } + /* One extra register to keep stack 8 bytes aligned */ + if (count & 1) { + for (reg = 4; reg < 10; reg++) { + if (!(mask & (1 << reg))) { + mask |= 1 << reg; + break; + } + } + } + if (_jitc->function->need_frame || _jitc->function->need_return) + mask |= (1 << _FP_REGNO) | (1 << _LR_REGNO); + if (_jitc->function->need_frame) + movr(_SP_REGNO, _FP_REGNO); + if (!jit_swf_p() && _jitc->function->save_reg_args && + !(_jitc->function->self.call & jit_call_varargs)) + addi(_SP_REGNO, _SP_REGNO, 16); + if (mask) { + if (jit_thumb_p()) + T2_POP(mask); + else + POP(mask); + } + if (jit_swf_p() || (_jitc->function->save_reg_args && + (_jitc->function->self.call & jit_call_varargs))) + addi(_SP_REGNO, _SP_REGNO, 16); if (jit_thumb_p()) T1_BX(_LR_REGNO); else @@ -3860,8 +4160,7 @@ _vastart(jit_state_t *_jit, jit_int32_t r0) * The -16 is to account for the 4 argument registers * always saved, and _jitc->function->vagp is to account * for declared arguments. */ - addi(r0, _FP_REGNO, _jitc->function->self.size - - 16 + _jitc->function->vagp); + addi(r0, _FP_REGNO, jit_selfsize() - 16 + _jitc->function->vagp); } static void @@ -3888,7 +4187,28 @@ _patch_at(jit_state_t *_jit, jit_word_t w; } u; u.w = instr; - if (kind == arm_patch_jump) { + if (kind == arm_patch_call) { + if (jit_thumb_p() && (jit_uword_t)instr >= _jitc->thumb) { + code2thumb(thumb.s[0], thumb.s[1], u.s[0], u.s[1]); + assert((thumb.i & THUMB2_BLI) == THUMB2_BLI); + /* skip code to switch from arm to thumb mode */ + if (jit_exchange_p()) + d = ((label + 8 - instr) >> 1) - 2; + else + d = ((label - instr) >> 1) - 2; + assert(_s24P(d)); + thumb.i = THUMB2_BLI | encode_thumb_jump(d); + thumb2code(thumb.s[0], thumb.s[1], u.s[0], u.s[1]); + } + else { + thumb.i = u.i[0]; + assert((thumb.i & 0x0f000000) == ARM_BLI); + d = ((label - instr) >> 2) - 2; + assert(_s24P(d)); + u.i[0] = (thumb.i & 0xff000000) | (d & 0x00ffffff); + } + } + else if (kind == arm_patch_jump) { if (jit_thumb_p() && (jit_uword_t)instr >= _jitc->thumb) { code2thumb(thumb.s[0], thumb.s[1], u.s[0], u.s[1]); if ((thumb.i & THUMB2_B) == THUMB2_B) {