X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=deps%2Flightning%2Flib%2Fjit_arm-cpu.c;h=a0852a2cd55c8716487ec55276f1fc1c016c4b44;hb=79bfeef6160be4b228a7998ac2b43cd83d882532;hp=12f9a2f7d76eb4684ad33802807bd3f24f518ebc;hpb=c0c162422385a60ea7c8fa1dfe439e83e0a13d88;p=pcsx_rearmed.git diff --git a/deps/lightning/lib/jit_arm-cpu.c b/deps/lightning/lib/jit_arm-cpu.c index 12f9a2f7..a0852a2c 100644 --- a/deps/lightning/lib/jit_arm-cpu.c +++ b/deps/lightning/lib/jit_arm-cpu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2022 Free Software Foundation, Inc. + * Copyright (C) 2012-2023 Free Software Foundation, Inc. * * This file is part of GNU lightning. * @@ -37,8 +37,7 @@ # define jit_armv5e_p() (jit_cpu.version > 5 || (jit_cpu.version == 5 && jit_cpu.extend)) # define jit_armv6_p() (jit_cpu.version >= 6) # define jit_armv7_p() (jit_cpu.version >= 7) -# define jit_armv7r_p() 0 -# define stack_framesize 48 +# define jit_armv7r_p() (jit_cpu.version > 7 || (jit_cpu.version == 7 && jit_cpu.extend)) extern int __aeabi_idivmod(int, int); extern unsigned __aeabi_uidivmod(unsigned, unsigned); # define _R0_REGNO 0x00 @@ -141,8 +140,12 @@ extern unsigned __aeabi_uidivmod(unsigned, unsigned); # define THUMB2_UMULL 0xfba00000 # define ARM_SMULL 0x00c00090 # define THUMB2_SMULL 0xfb800000 +/* >> ARMv7r */ +# define ARM_SDIV 0x07100010 +# define ARM_UDIV 0x07300010 # define THUMB2_SDIV 0xfb90f0f0 # define THUMB2_UDIV 0xfbb0f0f0 +/* << ARMv7r */ # define ARM_AND 0x00000000 # define THUMB_AND 0x4000 # define THUMB2_AND 0xea000000 @@ -185,6 +188,12 @@ extern unsigned __aeabi_uidivmod(unsigned, unsigned); # define ARM_STREX 0x01800090 # define THUMB2_STREX 0xe8400000 /* << ARMv6* */ +/* >> ARMv6t2 */ +# define THUMB2_CLZ 0xfab0f080 +# define THUMB2_RBIT 0xfa90f0a0 +# define ARM_RBIT 0x06f00030 +/* << ARMv6t2 */ +# define ARM_CLZ 0x01600010 /* >> ARMv7 */ # define ARM_DMB 0xf57ff050 # define THUMB2_DMB 0xf3bf8f50 @@ -447,6 +456,12 @@ static void _tdmb(jit_state_t *_jit, int im); # define NOT(rd,rm) CC_NOT(ARM_CC_AL,rd,rm) # define T1_NOT(rd,rm) T1_MVN(rd,rm) # define T2_NOT(rd,rm) T2_MVN(rd,rm) +# define T2_CLZ(rd,rm) torrr(THUMB2_CLZ,rm,rd,rm) +# define CC_CLZ(cc,rd,rm) corrrr(cc,ARM_CLZ,_R15_REGNO,rd,_R15_REGNO,rm) +# define CLZ(rd,rm) CC_CLZ(ARM_CC_AL,rd,rm) +# define T2_RBIT(rd,rm) torrr(THUMB2_RBIT,rm,rd,rm) +# define CC_RBIT(cc,rd,rm) corrrr(cc,ARM_RBIT,_R15_REGNO,rd,_R15_REGNO,rm) +# define RBIT(rd,rm) CC_RBIT(ARM_CC_AL,rd,rm) # define NOP() MOV(_R0_REGNO, _R0_REGNO) # define T1_NOP() is(0xbf00) # define CC_ADD(cc,rd,rn,rm) corrr(cc,ARM_ADD,rn,rd,rm) @@ -524,6 +539,10 @@ static void _tdmb(jit_state_t *_jit, int im); # define CC_UMULL(cc,rl,rh,rn,rm) corrrr(cc,ARM_UMULL,rh,rl,rm,rn) # define UMULL(rl,rh,rn,rm) CC_UMULL(ARM_CC_AL,rl,rh,rn,rm) # define T2_UMULL(rl,rh,rn,rm) torrrr(THUMB2_UMULL,rn,rl,rh,rm) +# define CC_SDIV(cc,rd,rn,rm) corrrr(cc,ARM_SDIV,rd,15,rn,rm) +# define SDIV(rd,rn,rm) CC_SDIV(ARM_CC_AL,rd,rm,rn) +# define CC_UDIV(cc,rd,rn,rm) corrrr(cc,ARM_UDIV,rd,15,rn,rm) +# define UDIV(rd,rn,rm) CC_UDIV(ARM_CC_AL,rd,rm,rn) # define T2_SDIV(rd,rn,rm) torrr(THUMB2_SDIV,rn,rd,rm) # define T2_UDIV(rd,rn,rm) torrr(THUMB2_UDIV,rn,rd,rm) # define CC_AND(cc,rd,rn,rm) corrr(cc,ARM_AND,rn,rd,rm) @@ -852,6 +871,8 @@ static void _tdmb(jit_state_t *_jit, int im); # define T2_POP(im) tpp(THUMB2_POP,im) # define jit_get_reg_args() \ do { \ + CHECK_REG_ARGS(); \ + jit_check_frame(); \ (void)jit_get_reg(_R0|jit_class_named|jit_class_gpr); \ (void)jit_get_reg(_R1|jit_class_named|jit_class_gpr); \ (void)jit_get_reg(_R2|jit_class_named|jit_class_gpr); \ @@ -885,6 +906,14 @@ static void _casx(jit_state_t *_jit,jit_int32_t,jit_int32_t, static void _comr(jit_state_t*,jit_int32_t,jit_int32_t); # define negr(r0,r1) _negr(_jit,r0,r1) static void _negr(jit_state_t*,jit_int32_t,jit_int32_t); +# define clor(r0, r1) _clor(_jit, r0, r1) +static void _clor(jit_state_t*, jit_int32_t, jit_int32_t); +# define clzr(r0, r1) _clzr(_jit, r0, r1) +static void _clzr(jit_state_t*, jit_int32_t, jit_int32_t); +# define ctor(r0, r1) _ctor(_jit, r0, r1) +static void _ctor(jit_state_t*, jit_int32_t, jit_int32_t); +# define ctzr(r0, r1) _ctzr(_jit, r0, r1) +static void _ctzr(jit_state_t*, jit_int32_t, jit_int32_t); # define addr(r0,r1,r2) _addr(_jit,r0,r1,r2) static void _addr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t); # define addi(r0,r1,i0) _addi(_jit,r0,r1,i0) @@ -1147,10 +1176,10 @@ static void _prolog(jit_state_t*,jit_node_t*); static void _epilog(jit_state_t*,jit_node_t*); # define callr(r0) _callr(_jit,r0) static void _callr(jit_state_t*,jit_int32_t); -# define calli(i0) _calli(_jit,i0) -static void _calli(jit_state_t*,jit_word_t); -# define calli_p(i0) _calli_p(_jit,i0) -static jit_word_t _calli_p(jit_state_t*,jit_word_t); +# define calli(i0,i1) _calli(_jit,i0,i1) +static void _calli(jit_state_t*,jit_word_t,jit_bool_t); +# define calli_p(i0,i1) _calli_p(_jit,i0,i1) +static jit_word_t _calli_p(jit_state_t*,jit_word_t,jit_bool_t); # define vastart(r0) _vastart(_jit, r0) static void _vastart(jit_state_t*, jit_int32_t); # define vaarg(r0, r1) _vaarg(_jit, r0, r1) @@ -1526,7 +1555,7 @@ _tpp(jit_state_t *_jit, int o, int im) assert(!(o & 0x0000ffff)); if (o == THUMB2_PUSH) assert(!(im & 0x8000)); - assert(__builtin_popcount(im & 0x1fff) > 1); + assert(__builtin_popcount(im & 0x7fff) > 1); thumb.i = o|im; iss(thumb.s[0], thumb.s[1]); } @@ -1737,6 +1766,53 @@ _negr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) RSBI(r0, r1, 0); } +static void +_clzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + if (!jit_thumb_p() && jit_armv5e_p()) + CLZ(r0, r1); + else if (jit_thumb_p() && jit_armv7_p()) { /* armv6t2 actually */ + T2_CLZ(r0, r1); + } + else + fallback_clz(r0, r0); +} + +static void +_clor(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + comr(r0, r1); + clzr(r0, r0); +} + +static void +_ctor(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + if (jit_armv7_p()) { /* armv6t2 actually */ + if (jit_thumb_p()) + T2_RBIT(r0, r1); + else + RBIT(r0, r1); + clor(r0, r0); + } + else + fallback_cto(r0, r1); +} + +static void +_ctzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + if (jit_armv7_p()) { /* armv6t2 actually */ + if (jit_thumb_p()) + T2_RBIT(r0, r1); + else + RBIT(r0, r1); + clzr(r0, r0); + } + else + fallback_ctz(r0, r1); +} + static void _addr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) { @@ -2232,8 +2308,12 @@ _divrem(jit_state_t *_jit, int div, int sign, static void _divr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) { - if (jit_armv7r_p() && jit_thumb_p()) - T2_SDIV(r0, r1, r2); + if (jit_armv7r_p()) { + if (jit_thumb_p()) + T2_SDIV(r0, r1, r2); + else + SDIV(r0, r1, r2); + } else divrem(1, 1, r0, r1, r2); } @@ -2251,8 +2331,12 @@ _divi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) static void _divr_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) { - if (jit_armv7r_p() && jit_thumb_p()) - T2_UDIV(r0, r1, r2); + if (jit_armv7r_p()) { + if (jit_thumb_p()) + T2_UDIV(r0, r1, r2); + else + UDIV(r0, r1, r2); + } else divrem(1, 0, r0, r1, r2); } @@ -2312,7 +2396,23 @@ _iqdivi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, static void _remr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) { - divrem(0, 1, r0, r1, r2); + if (jit_armv7r_p()) { + jit_int32_t reg; + if (r0 == r1 || r0 == r2) { + reg = jit_get_reg(jit_class_gpr); + divr(rn(reg), r1, r2); + mulr(rn(reg), r2, rn(reg)); + subr(r0, r1, rn(reg)); + jit_unget_reg(reg); + } + else { + divr(r0, r1, r2); + mulr(r0, r2, r0); + subr(r0, r1, r0); + } + } + else + divrem(0, 1, r0, r1, r2); } static void @@ -2328,7 +2428,23 @@ _remi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) static void _remr_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) { - divrem(0, 0, r0, r1, r2); + if (jit_armv7r_p()) { + jit_int32_t reg; + if (r0 == r1 || r0 == r2) { + reg = jit_get_reg(jit_class_gpr); + divr_u(rn(reg), r1, r2); + mulr(rn(reg), r2, rn(reg)); + subr(r0, r1, rn(reg)); + jit_unget_reg(reg); + } + else { + divr_u(r0, r1, r2); + mulr(r0, r2, r0); + subr(r0, r1, r0); + } + } + else + divrem(0, 0, r0, r1, r2); } static void @@ -2741,8 +2857,8 @@ _jmpi_p(jit_state_t *_jit, jit_word_t i0, jit_bool_t i1) jit_word_t w; jit_word_t d; jit_int32_t reg; + /* i1 means jump is reachable in signed 24 bits */ if (i1) { - /* Assume jump is not longer than 23 bits if inside jit */ w = _jit->pc.w; /* if thumb and in thumb mode */ if (jit_thumb_p() && _jitc->thumb) { @@ -3835,14 +3951,29 @@ _callr(jit_state_t *_jit, jit_int32_t r0) } static void -_calli(jit_state_t *_jit, jit_word_t i0) +_calli(jit_state_t *_jit, jit_word_t i0, jit_bool_t exchange_p) { jit_word_t d; jit_int32_t reg; - d = ((i0 - _jit->pc.w) >> 2) - 2; - if (!jit_exchange_p() && !jit_thumb_p() && _s24P(d)) - BLI(d & 0x00ffffff); + if (!exchange_p) { + if (jit_thumb_p()) { + if (jit_exchange_p()) + /* skip switch from arm to thumb + * exchange_p set to zero means a jit function + * call in the same jit code buffer */ + d = ((i0 + 8 - _jit->pc.w) >> 1) - 2; + else + d = ((i0 - _jit->pc.w) >> 1) - 2; + } + else d = ((i0 - _jit->pc.w) >> 2) - 2; + if (_s24P(d)) { + if (jit_thumb_p()) T2_BLI(encode_thumb_jump(d)); + else BLI(d & 0x00ffffff); + } + else goto fallback; + } else { + fallback: reg = jit_get_reg(jit_class_gpr); movi(rn(reg), i0); if (jit_thumb_p()) @@ -3854,28 +3985,44 @@ _calli(jit_state_t *_jit, jit_word_t i0) } static jit_word_t -_calli_p(jit_state_t *_jit, jit_word_t i0) +_calli_p(jit_state_t *_jit, jit_word_t i0, jit_bool_t i1) { jit_word_t w; + jit_word_t d; jit_int32_t reg; - reg = jit_get_reg(jit_class_gpr); - w = _jit->pc.w; - movi_p(rn(reg), i0); - if (jit_thumb_p()) - T1_BLX(rn(reg)); - else - BLX(rn(reg)); - jit_unget_reg(reg); + /* i1 means call is reachable in signed 24 bits */ + if (i1) { + w = _jit->pc.w; + if (jit_thumb_p()) d = ((i0 - _jit->pc.w) >> 1) - 2; + else d = ((i0 - _jit->pc.w) >> 2) - 2; + assert(_s24P(d)); + if (jit_thumb_p()) T2_BLI(encode_thumb_jump(d)); + else BLI(d & 0x00ffffff); + } + else { + reg = jit_get_reg(jit_class_gpr); + w = _jit->pc.w; + movi_p(rn(reg), i0); + if (jit_thumb_p()) + T1_BLX(rn(reg)); + else + BLX(rn(reg)); + jit_unget_reg(reg); + } return (w); } static void _prolog(jit_state_t *_jit, jit_node_t *node) { - jit_int32_t reg; + jit_int32_t reg, mask, count; if (_jitc->function->define_frame || _jitc->function->assume_frame) { jit_int32_t frame = -_jitc->function->frame; + jit_check_frame(); assert(_jitc->function->self.aoff >= frame); + if (jit_swf_p()) + CHECK_SWF_OFFSET(); + CHECK_REG_ARGS(); if (_jitc->function->assume_frame) { if (jit_thumb_p() && !_jitc->thumb) _jitc->thumb = _jit->pc.w; @@ -3888,38 +4035,66 @@ _prolog(jit_state_t *_jit, jit_node_t *node) _jitc->function->stack = ((_jitc->function->self.alen - /* align stack at 8 bytes */ _jitc->function->self.aoff) + 7) & -8; + /* If this jit_check_frame() succeeds, it actually is just a need_stack, + * usually for arguments, so, allocai was not called, but pusharg* + * was called increasing stack size, for negative access offsets. + * This can be optimized for one less prolog instruction, that is, + * do not create the frame pointer, and only add _jitc->function->stack + * to sp, and on epilog, instead of moving fp to sp, just add negative + * value of _jitc->function->stack. Since this condition requires a + * large function body for excess arguments to called function, keep + * things a bit simpler for now, as this is the only place need_stack + * would be useful. */ + if (_jitc->function->stack) + jit_check_frame(); + + for (reg = mask = count = 0; reg < jit_size(iregs); reg++) { + if (jit_regset_tstbit(&_jitc->function->regset, iregs[reg])) { + mask |= 1 << rn(iregs[reg]); + ++count; + } + } + /* One extra register to keep stack 8 bytes aligned */ + if (count & 1) { + for (reg = 4; reg < 10; reg++) { + if (!(mask & (1 << reg))) { + mask |= 1 << reg; + break; + } + } + } + if (_jitc->function->need_frame || _jitc->function->need_return) + mask |= (1 << _FP_REGNO) | (1 << _LR_REGNO); + if (!jit_swf_p() && _jitc->function->save_reg_args && + !(_jitc->function->self.call & jit_call_varargs)) + mask |= 0xf; if (jit_thumb_p()) { /* switch to thumb mode (better approach would be to * ORR 1 address being called, but no clear distinction * of what is a pointer to a jit function, or if patching * a pointer to a jit function) */ - ADDI(_R12_REGNO, _R15_REGNO, 1); - BX(_R12_REGNO); + if (jit_exchange_p()) { + ADDI(_R12_REGNO, _R15_REGNO, 1); + BX(_R12_REGNO); + } if (!_jitc->thumb) _jitc->thumb = _jit->pc.w; - if (jit_cpu.abi) { - T2_PUSH(0xf); - T2_PUSH(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO)); - VPUSH_F64(_D8_REGNO, 8); - } - else { + if (jit_swf_p() || (_jitc->function->save_reg_args && + (_jitc->function->self.call & jit_call_varargs))) T2_PUSH(0xf); - T2_PUSH(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO)); - } + if (mask) + T2_PUSH(mask); } else { - if (jit_cpu.abi) { - PUSH(0xf); - PUSH(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO)); - VPUSH_F64(_D8_REGNO, 8); - } - else { + if (jit_swf_p() || (_jitc->function->save_reg_args && + (_jitc->function->self.call & jit_call_varargs))) PUSH(0xf); - PUSH(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO)); - } + if (mask) + PUSH(mask); } - movr(_FP_REGNO, _SP_REGNO); + if (_jitc->function->need_frame) + movr(_FP_REGNO, _SP_REGNO); if (_jitc->function->stack) subi(_SP_REGNO, _SP_REGNO, _jitc->function->stack); if (_jitc->function->allocar) { @@ -3933,17 +4108,41 @@ _prolog(jit_state_t *_jit, jit_node_t *node) static void _epilog(jit_state_t *_jit, jit_node_t *node) { + jit_int32_t reg, mask, count; if (_jitc->function->assume_frame) return; - movr(_SP_REGNO, _FP_REGNO); - if (jit_cpu.abi) - VPOP_F64(_D8_REGNO, 8); - if (jit_thumb_p()) - T2_POP(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO)); - else - POP(0x3f0|(1<<_FP_REGNO)|(1<<_LR_REGNO)); - addi(_SP_REGNO, _SP_REGNO, 16); + for (reg = mask = count = 0; reg < jit_size(iregs); reg++) { + if (jit_regset_tstbit(&_jitc->function->regset, iregs[reg])) { + mask |= 1 << rn(iregs[reg]); + ++count; + } + } + /* One extra register to keep stack 8 bytes aligned */ + if (count & 1) { + for (reg = 4; reg < 10; reg++) { + if (!(mask & (1 << reg))) { + mask |= 1 << reg; + break; + } + } + } + if (_jitc->function->need_frame || _jitc->function->need_return) + mask |= (1 << _FP_REGNO) | (1 << _LR_REGNO); + if (_jitc->function->need_frame) + movr(_SP_REGNO, _FP_REGNO); + if (!jit_swf_p() && _jitc->function->save_reg_args && + !(_jitc->function->self.call & jit_call_varargs)) + addi(_SP_REGNO, _SP_REGNO, 16); + if (mask) { + if (jit_thumb_p()) + T2_POP(mask); + else + POP(mask); + } + if (jit_swf_p() || (_jitc->function->save_reg_args && + (_jitc->function->self.call & jit_call_varargs))) + addi(_SP_REGNO, _SP_REGNO, 16); if (jit_thumb_p()) T1_BX(_LR_REGNO); else @@ -3961,8 +4160,7 @@ _vastart(jit_state_t *_jit, jit_int32_t r0) * The -16 is to account for the 4 argument registers * always saved, and _jitc->function->vagp is to account * for declared arguments. */ - addi(r0, _FP_REGNO, _jitc->function->self.size - - 16 + _jitc->function->vagp); + addi(r0, _FP_REGNO, jit_selfsize() - 16 + _jitc->function->vagp); } static void @@ -3989,7 +4187,28 @@ _patch_at(jit_state_t *_jit, jit_word_t w; } u; u.w = instr; - if (kind == arm_patch_jump) { + if (kind == arm_patch_call) { + if (jit_thumb_p() && (jit_uword_t)instr >= _jitc->thumb) { + code2thumb(thumb.s[0], thumb.s[1], u.s[0], u.s[1]); + assert((thumb.i & THUMB2_BLI) == THUMB2_BLI); + /* skip code to switch from arm to thumb mode */ + if (jit_exchange_p()) + d = ((label + 8 - instr) >> 1) - 2; + else + d = ((label - instr) >> 1) - 2; + assert(_s24P(d)); + thumb.i = THUMB2_BLI | encode_thumb_jump(d); + thumb2code(thumb.s[0], thumb.s[1], u.s[0], u.s[1]); + } + else { + thumb.i = u.i[0]; + assert((thumb.i & 0x0f000000) == ARM_BLI); + d = ((label - instr) >> 2) - 2; + assert(_s24P(d)); + u.i[0] = (thumb.i & 0xff000000) | (d & 0x00ffffff); + } + } + else if (kind == arm_patch_jump) { if (jit_thumb_p() && (jit_uword_t)instr >= _jitc->thumb) { code2thumb(thumb.s[0], thumb.s[1], u.s[0], u.s[1]); if ((thumb.i & THUMB2_B) == THUMB2_B) {