X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=deps%2Flightning%2Flib%2Fjit_x86-cpu.c;h=76f90ec871add4ac03fdfad9d43ff4755ee1bf99;hb=d481fb64f2aac7a36532142cda11fa43f5ca792f;hp=81534f0894d79c82dbb04a4f6af3d06a9520f393;hpb=437b1e617808119c3a24a72c77cd2fa86a5d3220;p=pcsx_rearmed.git diff --git a/deps/lightning/lib/jit_x86-cpu.c b/deps/lightning/lib/jit_x86-cpu.c index 81534f08..76f90ec8 100644 --- a/deps/lightning/lib/jit_x86-cpu.c +++ b/deps/lightning/lib/jit_x86-cpu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2019 Free Software Foundation, Inc. + * Copyright (C) 2012-2023 Free Software Foundation, Inc. * * This file is part of GNU lightning. * @@ -21,13 +21,20 @@ #define USE_INC_DEC 0 #if PROTO +# if __WORDSIZE == 64 && _WIN32 +# define ONE 1LL +# else +# define ONE 1L +# endif # if __X32 || __X64_32 # define WIDE 0 # define ldi(u, v) ldi_i(u, v) # define ldr(u, v) ldr_i(u, v) # define ldxr(u, v, w) ldxr_i(u, v, w) # define ldxi(u, v, w) ldxi_i(u, v, w) +# define str(u, v) str_i(u, v) # define sti(u, v) sti_i(u, v) +# define stxr(u, v, w) stxr_i(u, v, w) # define stxi(u, v, w) stxi_i(u, v, w) # define can_sign_extend_int_p(im) 1 # define can_zero_extend_int_p(im) 1 @@ -38,11 +45,13 @@ # define ldr(u, v) ldr_l(u, v) # define ldxr(u, v, w) ldxr_l(u, v, w) # define ldxi(u, v, w) ldxi_l(u, v, w) +# define str(u, v) str_l(u, v) # define sti(u, v) sti_l(u, v) +# define stxr(u, v, w) stxr_l(u, v, w) # define stxi(u, v, w) stxi_l(u, v, w) # define can_sign_extend_int_p(im) \ - (((im) >= 0 && (long long)(im) <= 0x7fffffffLL) || \ - ((im) < 0 && (long long)(im) > -0x80000000LL)) + (((long long)(im) >= 0 && (long long)(im) <= 0x7fffffffLL) || \ + ((long long)(im) < 0 && (long long)(im) > -0x80000000LL)) # define can_zero_extend_int_p(im) \ ((im) >= 0 && (im) < 0x80000000LL) # define fits_uint32_p(im) (((im) & 0xffffffff00000000LL) == 0) @@ -136,18 +145,46 @@ # else # define il(l) ii(l) # endif -# define patch_abs(instr, label) \ - *(jit_word_t *)(instr 
- sizeof(jit_word_t)) = label -# define patch_rel(instr, label) \ - *(jit_int32_t *)(instr - 4) = label - instr -# define patch_rel_char(instr, label) \ - *(jit_int8_t *)(instr - 1) = label - instr # define rex(l, w, r, x, b) _rex(_jit, l, w, r, x, b) static void _rex(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); # define rx(rd, md, rb, ri, ms) _rx(_jit, rd, md, rb, ri, ms) static void _rx(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); +/* + * prefix 8 bits 0xc4 Three byte VEX + * 0xc5 Two byte VEX + * 0x8f Three byte XOP + * ~R 1 bit Inverted REX.R + * ~X 1 bit Inverted REX.X + * ~B 1 bit Inverted REX.B + * map 5 bits Opcode map to use + * W 1 bit REX.W for integer, otherwise opcode extension + * ~vvvv 4 bits Inverted XMM or YMM registers + * L 1 bit 128 bit vector if 0, 256 otherwise + * pp 2 bits Mandatory prefix + * 00 none + * 01 0x66 + * 10 0xf3 + * 11 0xf2 + * + * Three byte VEX: + * +---+---+---+---+---+---+---+---+ +---+---+---+---+---+---+---+---+ +---+---+---+---+---+---+---+---+ + * | 1 1 0 0 0 1 0 0 | |~R |~X |~B | map | | W | ~vvvv | L | pp | + * +---+---+---+---+---+---+---+---+ +---+---+---+---+---+---+---+---+ +---+---+---+---+---+---+---+---+ + * Three byte XOP: + * +---+---+---+---+---+---+---+---+ +---+---+---+---+---+---+---+---+ +---+---+---+---+---+---+---+---+ + * | 1 0 0 0 1 1 1 1 | |~R |~X |~B | map | | W | ~vvvv | L | pp | + * +---+---+---+---+---+---+---+---+ +---+---+---+---+---+---+---+---+ +---+---+---+---+---+---+---+---+ + * Two byte VEX: + * +---+---+---+---+---+---+---+---+ +---+---+---+---+---+---+---+---+ + * | 1 1 0 0 0 1 0 1 | |~R | ~vvvv | L | pp | + * +---+---+---+---+---+---+---+---+ +---+---+---+---+---+---+---+---+ + */ +# define vex(r,x,b,map,w,vvvv,l,pp) _vex(_jit,r,x,b,map,w,vvvv,l,pp) +static void +_vex(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t, + jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); # define nop(n) _nop(_jit, n) static void 
_nop(jit_state_t*, jit_int32_t); # define emms() is(0x770f) @@ -186,7 +223,8 @@ static void _addi(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t); static void _addcr(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t); #define addci(r0, r1, i0) _addci(_jit, r0, r1, i0) static void _addci(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t); -# define iaddxr(r0, r1) alur(X86_ADC, r0, r1) +# define iaddxr(r0, r1) _iaddxr(_jit, r0, r1) +static void _iaddxr(jit_state_t*, jit_int32_t, jit_int32_t); # define addxr(r0, r1, r2) _addxr(_jit, r0, r1, r2) static void _addxr(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t); # define iaddxi(r0, i0) alui(X86_ADC, r0, i0) @@ -218,6 +256,10 @@ static void _imuli(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t); static void _mulr(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t); # define muli(r0, r1, i0) _muli(_jit, r0, r1, i0) static void _muli(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t); +# define hmulr(r0, r1, r2) _iqmulr(_jit, JIT_NOREG, r0, r1, r2, 1) +# define hmulr_u(r0, r1, r2) _iqmulr(_jit, JIT_NOREG, r0, r1, r2, 0) +# define hmuli(r0, r1, i0) _iqmuli(_jit, JIT_NOREG, r0, r1, i0, 1) +# define hmuli_u(r0, r1, i0) _iqmuli(_jit, JIT_NOREG, r0, r1, i0, 0) # define umulr(r0) unr(X86_IMUL, r0) # define umulr_u(r0) unr(X86_MUL, r0) # define qmulr(r0, r1, r2, r3) _iqmulr(_jit, r0, r1, r2, r3, 1) @@ -288,12 +330,36 @@ static void _irotshi(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t); static void _rotshi(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t,jit_word_t); # define lshr(r0, r1, r2) rotshr(X86_SHL, r0, r1, r2) +# define qlshr(r0, r1, r2, r3) xlshr(1, r0, r1, r2, r3) +# define xlshr(s, r0, r1, r2, r3) _xlshr(_jit, s, r0, r1, r2, r3) +static void +_xlshr(jit_state_t*,jit_bool_t,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); # define lshi(r0, r1, i0) _lshi(_jit, r0, r1, i0) static void _lshi(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t); +# define qlshi(r0, r1, r2, i0) xlshi(1, r0, r1, r2, 
i0) +# define xlshi(s, r0, r1, r2, i0) _xlshi(_jit, s, r0, r1, r2, i0) +static void +_xlshi(jit_state_t*,jit_bool_t,jit_int32_t,jit_int32_t,jit_int32_t,jit_word_t); +# define qlshr_u(r0, r1, r2, r3) xlshr(0, r0, r1, r2, r3) +# define qlshi_u(r0, r1, r2, i0) xlshi(0, r0, r1, r2, i0) # define rshr(r0, r1, r2) rotshr(X86_SAR, r0, r1, r2) # define rshi(r0, r1, i0) rotshi(X86_SAR, r0, r1, i0) # define rshr_u(r0, r1, r2) rotshr(X86_SHR, r0, r1, r2) # define rshi_u(r0, r1, i0) rotshi(X86_SHR, r0, r1, i0) +# define qrshr(r0, r1, r2, r3) xrshr(1, r0, r1, r2, r3) +# define qrshr_u(r0, r1, r2, r3) xrshr(0, r0, r1, r2, r3) +# define xrshr(s, r0, r1, r2, r3) _xrshr(_jit, s, r0, r1, r2, r3) +static void +_xrshr(jit_state_t*,jit_bool_t,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); +# define qrshi(r0, r1, r2, i0) xrshi(1, r0, r1, r2, i0) +# define qrshi_u(r0, r1, r2, i0) xrshi(0, r0, r1, r2, i0) +# define xrshi(s, r0, r1, r2, i0) _xrshi(_jit, s, r0, r1, r2, i0) +static void +_xrshi(jit_state_t*,jit_bool_t,jit_int32_t,jit_int32_t,jit_int32_t,jit_word_t); +# define lrotr(r0, r1, r2) rotshr(X86_ROL, r0, r1, r2) +# define lroti(r0, r1, i0) rotshi(X86_ROL, r0, r1, i0) +# define rrotr(r0, r1, r2) rotshr(X86_ROR, r0, r1, r2) +# define rroti(r0, r1, i0) rotshi(X86_ROR, r0, r1, i0) # define unr(code, r0) _unr(_jit, code, r0) static void _unr(jit_state_t*, jit_int32_t, jit_int32_t); # define inegr(r0) unr(X86_NEG, r0) @@ -308,6 +374,18 @@ static void _incr(jit_state_t*, jit_int32_t, jit_int32_t); # define decr(r0, r1) _decr(_jit, r0, r1) static void _decr(jit_state_t*, jit_int32_t, jit_int32_t); # endif +# define clor(r0, r1) _clor(_jit, r0, r1) +static void _clor(jit_state_t*, jit_int32_t, jit_int32_t); +# define clzr(r0, r1) _clzr(_jit, r0, r1) +static void _clzr(jit_state_t*, jit_int32_t, jit_int32_t); +# define ctor(r0, r1) _ctor(_jit, r0, r1) +static void _ctor(jit_state_t*, jit_int32_t, jit_int32_t); +# define ctzr(r0, r1) _ctzr(_jit, r0, r1) +static void _ctzr(jit_state_t*, 
jit_int32_t, jit_int32_t); +# define rbitr(r0, r1) _rbitr(_jit, r0, r1) +static void _rbitr(jit_state_t*, jit_int32_t, jit_int32_t); +# define popcntr(r0, r1) _popcntr(_jit, r0, r1) +static void _popcntr(jit_state_t*, jit_int32_t, jit_int32_t); # define cr(code, r0, r1, r2) _cr(_jit, code, r0, r1, r2) static void _cr(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t, jit_int32_t); @@ -358,7 +436,13 @@ static void _movr(jit_state_t*, jit_int32_t, jit_int32_t); # define imovi(r0, i0) _imovi(_jit, r0, i0) static void _imovi(jit_state_t*, jit_int32_t, jit_word_t); # define movi(r0, i0) _movi(_jit, r0, i0) -static void _movi(jit_state_t*, jit_int32_t, jit_word_t); +static +# if CAN_RIP_ADDRESS +jit_word_t +# else +void +# endif +_movi(jit_state_t*, jit_int32_t, jit_word_t); # define movi_p(r0, i0) _movi_p(_jit, r0, i0) static jit_word_t _movi_p(jit_state_t*, jit_int32_t, jit_word_t); # define movcr(r0, r1) _movcr(_jit, r0, r1) @@ -369,6 +453,11 @@ static void _movcr_u(jit_state_t*,jit_int32_t,jit_int32_t); static void _movsr(jit_state_t*,jit_int32_t,jit_int32_t); # define movsr_u(r0, r1) _movsr_u(_jit, r0, r1) static void _movsr_u(jit_state_t*,jit_int32_t,jit_int32_t); +# define casx(r0, r1, r2, r3, i0) _casx(_jit, r0, r1, r2, r3, i0) +static void _casx(jit_state_t *_jit,jit_int32_t,jit_int32_t, + jit_int32_t,jit_int32_t,jit_word_t); +#define casr(r0, r1, r2, r3) casx(r0, r1, r2, r3, 0) +#define casi(r0, i0, r1, r2) casx(r0, _NOREG, r1, r2, i0) #define movnr(r0, r1, r2) _movnr(_jit, r0, r1, r2) static void _movnr(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t); #define movzr(r0, r1, r2) _movzr(_jit, r0, r1, r2) @@ -387,6 +476,12 @@ static void _bswapr_ui(jit_state_t*,jit_int32_t,jit_int32_t); #define bswapr_ul(r0, r1) _bswapr_ul(_jit, r0, r1) static void _bswapr_ul(jit_state_t*,jit_int32_t,jit_int32_t); #endif +# define extr(r0, r1, i0, i1) _extr(_jit, r0, r1, i0, i1) +static void _extr(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t,jit_word_t); +# define 
extr_u(r0, r1, i0, i1) _extr_u(_jit, r0, r1, i0, i1) +static void _extr_u(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t,jit_word_t); +# define depr(r0, r1, i0, i1) _depr(_jit, r0, r1, i0, i1) +static void _depr(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t,jit_word_t); # define extr_c(r0, r1) _extr_c(_jit, r0, r1) static void _extr_c(jit_state_t*,jit_int32_t,jit_int32_t); # define extr_uc(r0, r1) _extr_uc(_jit, r0, r1) @@ -475,6 +570,10 @@ static void _ldxr_l(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t); static void _ldxi_l(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t); # endif # endif +# define unldr(r0, r1, i0) generic_unldr(r0, r1, i0) +# define unldi(r0, i0, i1) generic_unldi(r0, i0, i1) +# define unldr_u(r0, r1, i0) generic_unldr_u(r0, r1, i0) +# define unldi_u(r0, i0, i1) generic_unldi_u(r0, i0, i1) # define str_c(r0, r1) _str_c(_jit, r0, r1) static void _str_c(jit_state_t*, jit_int32_t, jit_int32_t); # define sti_c(i0, r0) _sti_c(_jit, i0, r0) @@ -511,6 +610,8 @@ static void _stxr_l(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t); # define stxi_l(i0, r0, r1) _stxi_l(_jit, i0, r0, r1) static void _stxi_l(jit_state_t*, jit_word_t, jit_int32_t, jit_int32_t); # endif +#define unstr(r0, r1, i0) generic_unstr(r0, r1, i0) +#define unsti(i0, r0, i1) generic_unsti(i0, r0, i1) # define jcc(code, i0) _jcc(_jit, code, i0) # define jo(i0) jcc(X86_CC_O, i0) # define jno(i0) jcc(X86_CC_NO, i0) @@ -542,7 +643,7 @@ static void _stxi_l(jit_state_t*, jit_word_t, jit_int32_t, jit_int32_t); # define jng(i0) jcc(X86_CC_NG, i0) # define jg(i0) jcc(X86_CC_G, i0) # define jnle(i0) jcc(X86_CC_NLE, i0) -static void _jcc(jit_state_t*, jit_int32_t, jit_word_t); +static jit_word_t _jcc(jit_state_t*, jit_int32_t, jit_word_t); # define jccs(code, i0) _jccs(_jit, code, i0) # define jos(i0) jccs(X86_CC_O, i0) # define jnos(i0) jccs(X86_CC_NO, i0) @@ -574,13 +675,15 @@ static void _jcc(jit_state_t*, jit_int32_t, jit_word_t); # define jngs(i0) jccs(X86_CC_NG, i0) # 
define jgs(i0) jccs(X86_CC_G, i0) # define jnles(i0) jccs(X86_CC_NLE, i0) -static void _jccs(jit_state_t*, jit_int32_t, jit_word_t); +static jit_word_t _jccs(jit_state_t*, jit_int32_t, jit_word_t); # define jcr(code, i0, r0, r1) _jcr(_jit, code, i0, r0, r1) -static void _jcr(jit_state_t*,jit_int32_t,jit_word_t,jit_int32_t,jit_int32_t); +static jit_word_t _jcr(jit_state_t*, + jit_int32_t,jit_word_t,jit_int32_t,jit_int32_t); # define jci(code, i0, r0, i1) _jci(_jit, code, i0, r0, i1) -static void _jci(jit_state_t*,jit_int32_t,jit_word_t,jit_int32_t,jit_word_t); +static jit_word_t _jci(jit_state_t*, + jit_int32_t,jit_word_t,jit_int32_t,jit_word_t); # define jci0(code, i0, r0) _jci0(_jit, code, i0, r0) -static void _jci0(jit_state_t*, jit_int32_t, jit_word_t, jit_int32_t); +static jit_word_t _jci0(jit_state_t*, jit_int32_t, jit_word_t, jit_int32_t); # define bltr(i0, r0, r1) _bltr(_jit, i0, r0, r1) static jit_word_t _bltr(jit_state_t*, jit_word_t, jit_int32_t, jit_int32_t); # define blti(i0, r0, i1) _blti(_jit, i0, r0, i1) @@ -682,7 +785,7 @@ static jit_word_t _jmpi_p(jit_state_t*, jit_word_t); # define jmpi_p(i0) jmpi(i0) # endif # define jmpsi(i0) _jmpsi(_jit, i0) -static void _jmpsi(jit_state_t*, jit_uint8_t); +static jit_word_t _jmpsi(jit_state_t*, jit_uint8_t); # define prolog(node) _prolog(_jit, node) static void _prolog(jit_state_t*, jit_node_t*); # define epilog(node) _epilog(_jit, node) @@ -693,8 +796,8 @@ static void _vastart(jit_state_t*, jit_int32_t); static void _vaarg(jit_state_t*, jit_int32_t, jit_int32_t); # define vaarg_d(r0, r1, i0) _vaarg_d(_jit, r0, r1, i0) static void _vaarg_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_bool_t); -# define patch_at(node, instr, label) _patch_at(_jit, node, instr, label) -static void _patch_at(jit_state_t*, jit_node_t*, jit_word_t, jit_word_t); +# define patch_at(instr, label) _patch_at(_jit, instr, label) +static void _patch_at(jit_state_t*, jit_word_t, jit_word_t); # if !defined(HAVE_FFSL) # if __X32 # define 
ffsl(i) __builtin_ffs(i) @@ -730,11 +833,16 @@ _rx(jit_state_t *_jit, jit_int32_t rd, jit_int32_t md, { if (ri == _NOREG) { if (rb == _NOREG) { -#if __X32 - mrm(0x00, r7(rd), 0x05); -#else - mrm(0x00, r7(rd), 0x04); - sib(_SCL1, 0x04, 0x05); + /* Use ms == _SCL8 to tell it is a %rip relative displacement */ +#if __X64 + if (ms == _SCL8) +#endif + mrm(0x00, r7(rd), 0x05); +#if __X64 + else { + mrm(0x00, r7(rd), 0x04); + sib(_SCL1, 0x04, 0x05); + } #endif ii(md); } @@ -795,46 +903,93 @@ _rx(jit_state_t *_jit, jit_int32_t rd, jit_int32_t md, } static void -_nop(jit_state_t *_jit, jit_int32_t count) +_vex(jit_state_t *_jit, jit_int32_t r, jit_int32_t x, jit_int32_t b, + jit_int32_t map, jit_int32_t w, jit_int32_t vvvv, jit_int32_t l, + jit_int32_t pp) { - switch (count) { - case 0: - break; - case 1: /* NOP */ - ic(0x90); break; - case 2: /* 66 NOP */ - ic(0x66); ic(0x90); - break; - case 3: /* NOP DWORD ptr [EAX] */ - ic(0x0f); ic(0x1f); ic(0x00); - break; - case 4: /* NOP DWORD ptr [EAX + 00H] */ - ic(0x0f); ic(0x1f); ic(0x40); ic(0x00); - break; - case 5: /* NOP DWORD ptr [EAX + EAX*1 + 00H] */ - ic(0x0f); ic(0x1f); ic(0x44); ic(0x00); - ic(0x00); - break; - case 6: /* 66 NOP DWORD ptr [EAX + EAX*1 + 00H] */ - ic(0x66); ic(0x0f); ic(0x1f); ic(0x44); - ic(0x00); ic(0x00); - break; - case 7: /* NOP DWORD ptr [EAX + 00000000H] */ - ic(0x0f); ic(0x1f); ic(0x80); ii(0x0000); - break; - case 8: /* NOP DWORD ptr [EAX + EAX*1 + 00000000H] */ - ic(0x0f); ic(0x1f); ic(0x84); ic(0x00); - ii(0x0000); - break; - case 9: /* 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H] */ - ic(0x66); ic(0x0f); ic(0x1f); ic(0x84); - ic(0x00); ii(0x0000); - break; - default: - abort(); + /* Emit a VEX (0xc4, 0xc5) or XOP (0x8f) prefix. r, x and b are register + * numbers whose bit 3 provides the inverted extension bits; _NOREG is + * treated as register 0. The short 0xc5 form is only chosen when + * map == 1, w == 0 and neither x nor b has bit 3 set. */ + jit_int32_t v; + if (r == _NOREG) r = 0; + if (x == _NOREG) x = 0; + if (b == _NOREG) b = 0; + if (map == 1 && w == 0 && ((x|b) & 8) == 0) { + /* Two byte prefix */ + ic(0xc5); + /* ~R */ + v = (r & 8) ? 
0 : 0x80; } + else { + /* Three byte prefix */ + if (map >= 8) + ic(0x8f); + else + ic(0xc4); + /* map_select */ + v = map; + /* ~R */ + if (!(r & 8)) v |= 0x80; + /* ~X */ + if (!(x & 8)) v |= 0x40; + /* ~B */ + if (!(b & 8)) v |= 0x20; + ic(v); + /* W */ + v = w ? 0x80 : 0; + } + /* ~vvvv */ + v |= (~vvvv & 0x0f) << 3; + /* L */ + if (l) v |= 0x04; + /* pp */ + v |= pp; + ic(v); } +static void +_nop(jit_state_t *_jit, jit_int32_t count) +{ + /* Emit count bytes of padding in chunks of at most 9 bytes, the + * longest encoding handled by the switch below. */ + jit_int32_t i; + while (count) { + if (count > 9) + i = 9; + else + i = count; + switch (i) { + case 0: + break; + case 1: /* NOP */ + ic(0x90); break; + case 2: /* 66 NOP */ + ic(0x66); ic(0x90); + break; + case 3: /* NOP DWORD ptr [EAX] */ + ic(0x0f); ic(0x1f); ic(0x00); + break; + case 4: /* NOP DWORD ptr [EAX + 00H] */ + ic(0x0f); ic(0x1f); ic(0x40); ic(0x00); + break; + case 5: /* NOP DWORD ptr [EAX + EAX*1 + 00H] */ + ic(0x0f); ic(0x1f); ic(0x44); ic(0x00); + ic(0x00); + break; + case 6: /* 66 NOP DWORD ptr [EAX + EAX*1 + 00H] */ + ic(0x66); ic(0x0f); ic(0x1f); ic(0x44); + ic(0x00); ic(0x00); + break; + case 7: /* NOP DWORD ptr [EAX + 00000000H] */ + ic(0x0f); ic(0x1f); ic(0x80); ii(0x0000); + break; + case 8: /* NOP DWORD ptr [EAX + EAX*1 + 00000000H] */ + ic(0x0f); ic(0x1f); ic(0x84); ic(0x00); + ii(0x0000); + break; + case 9: /* 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H] */ + ic(0x66); ic(0x0f); ic(0x1f); ic(0x84); + ic(0x00); ii(0x0000); + break; + } + /* The chunk just emitted is done, loop for the remaining bytes */ + count -= i; + } +} static void _lea(jit_state_t *_jit, jit_int32_t md, jit_int32_t rb, jit_int32_t ri, jit_int32_t ms, jit_int32_t rd) @@ -1026,6 +1181,49 @@ _addci(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) } } +static void +_iaddxr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + /* FIXME: this is not doing what I did expect for the simple test case: + * mov $0xffffffffffffffff, %rax -- rax = 0xffffffffffffffff (-1) + * mov $0xffffffffffffffff, %r10 -- r10 = 0xffffffffffffffff (-1) + * mov $0x1, %r11d -- r11 = 1 + * xor %rbx, %rbx -- 
rbx = 0 + * (gdb) p $eflags + * $1 = [ PF ZF IF ] + * add %r11, %rax -- r11 = 0x10000000000000000 (0) + * does not fit in 64 bit ^ + * (gdb) p $eflags + * $2 = [ CF PF AF ZF IF ] + * adcx %r10, %rbx -- r10 = 0xffffffffffffffff (-1) + * (gdb) p $eflags + * $3 = [ CF PF AF ZF IF ] + * (gdb) p/x $r10 + * $4 = 0xffffffffffffffff + * but, r10 should be zero, as it is: + * -1 (%r10) + 0 (%rbx) + carry (!!eflags.CF) + * FIXME: maybe should only use ADCX in the third operation onward, that + * is, after the first ADC? In either case, the add -1+0+carry should + * have used and consumed the carry? At least this is what is expected + * in Lightning... + */ +#if 0 + /* Significantly longer instruction, but avoid cpu stalls as only + * the carry flag is used in a sequence. */ + if (jit_cpu.adx) { + /* ADCX */ + ic(0x66); + rex(0, WIDE, r1, _NOREG, r0); + ic(0x0f); + ic(0x38); + ic(0xf6); + mrm(0x03, r7(r1), r7(r0)); + } + else +#endif + alur(X86_ADC, r0, r1); +} + static void _addxr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) { @@ -1041,7 +1239,12 @@ static void _addxi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) { jit_int32_t reg; - if (can_sign_extend_int_p(i0)) { + if ( +#if 0 + /* Do not mix ADC and ADCX */ + !jit_cpu.adx && +#endif + can_sign_extend_int_p(i0)) { movr(r0, r1); iaddxi(r0, i0); } @@ -1258,38 +1461,49 @@ _muli(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) } #define savset(rn) \ - if (r0 != rn) { \ - sav |= 1 << rn; \ - if (r1 != rn && r2 != rn) \ - set |= 1 << rn; \ - } + do { \ + if (r0 != rn) { \ + sav |= 1 << rn; \ + if (r1 != rn && r2 != rn) \ + set |= 1 << rn; \ + } \ + } while (0) #define isavset(rn) \ - if (r0 != rn) { \ - sav |= 1 << rn; \ - if (r1 != rn) \ - set |= 1 << rn; \ - } + do { \ + if (r0 != rn) { \ + sav |= 1 << rn; \ + if (r1 != rn) \ + set |= 1 << rn; \ + } \ + } while (0) #define qsavset(rn) \ - if (r0 != rn && r1 != rn) { \ - sav |= 1 << rn; \ - if (r2 != rn && r3 != rn) \ 
- set |= 1 << rn; \ - } + do { \ + if (r0 != rn && r1 != rn) { \ + sav |= 1 << rn; \ + if (r2 != rn && r3 != rn) \ + set |= 1 << rn; \ + } \ + } while (0) #define allocr(rn, rv) \ - if (set & (1 << rn)) \ - (void)jit_get_reg(rv|jit_class_gpr|jit_class_named); \ - if (sav & (1 << rn)) { \ - if ( jit_regset_tstbit(&_jitc->regsav, rv) || \ - !jit_regset_tstbit(&_jitc->reglive, rv)) \ - sav &= ~(1 << rn); \ - else \ - save(rv); \ - } + do { \ + if (set & (1 << rn)) \ + (void)jit_get_reg(rv|jit_class_gpr|jit_class_named); \ + if (sav & (1 << rn)) { \ + if ( jit_regset_tstbit(&_jitc->regsav, rv) || \ + !jit_regset_tstbit(&_jitc->reglive, rv)) \ + sav &= ~(1 << rn); \ + else \ + save(rv); \ + } \ + } while (0) #define clear(rn, rv) \ - if (set & (1 << rn)) \ - jit_unget_reg(rv); \ - if (sav & (1 << rn)) \ - load(rv); + do { \ + if (set & (1 << rn)) \ + jit_unget_reg(rv); \ + if (sav & (1 << rn)) \ + load(rv); \ + } while (0) + static void _iqmulr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3, jit_bool_t sign) @@ -1315,14 +1529,20 @@ _iqmulr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, else umulr_u(mul); - if (r0 == _RDX_REGNO && r1 == _RAX_REGNO) - xchgr(_RAX_REGNO, _RDX_REGNO); + /* r0 == JIT_NOREG is what the hmulr and hmulr_u macros pass: in that + * case only r1 is written, copied from %rdx. Otherwise r0 is taken + * from %rax and r1 from %rdx, ordered so that neither copy clobbers + * a source that is still needed. */ + if (r0 != JIT_NOREG) { + if (r0 == _RDX_REGNO && r1 == _RAX_REGNO) + xchgr(_RAX_REGNO, _RDX_REGNO); + else { + if (r0 != _RDX_REGNO) + movr(r0, _RAX_REGNO); + movr(r1, _RDX_REGNO); + if (r0 == _RDX_REGNO) + movr(r0, _RAX_REGNO); + } + } else { - if (r0 != _RDX_REGNO) - movr(r0, _RAX_REGNO); + assert(r1 != JIT_NOREG); movr(r1, _RDX_REGNO); - if (r0 == _RDX_REGNO) - movr(r0, _RAX_REGNO); } clear(_RDX_REGNO, _RDX); @@ -1636,9 +1856,6 @@ _iqdivi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, qdivr_u(r0, r1, r2, rn(reg)); jit_unget_reg(reg); } -#undef clear -#undef allocr -#undef savset static void _andr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) @@ -1838,6 +2055,115 @@ _rotshi(jit_state_t *_jit, jit_int32_t code, irotshi(code, r0, 
i0); } +static void +_xlshr(jit_state_t *_jit, jit_bool_t sign, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + /* Double word left shift of r2 by the count in r3: r0 receives the + * low word and r1 the high word, which is computed with an arithmetic + * right shift when sign is set and a logical one otherwise. */ + jit_int32_t sav, set; + jit_int32_t t0, s0, t1, s1, t2, s2, t3, s3; + jit_word_t over, zero, over_done, done; + sav = set = 0; + /* %RCX must be used for shift. */ + qsavset(_RCX_REGNO); + allocr(_RCX_REGNO, _RCX); + /* Almost certainly not %RCX */ + t1 = r1; + if (r0 == _RCX_REGNO) { + s0 = jit_get_reg(jit_class_gpr); + t0 = rn(s0); + } + else { + t0 = r0; + /* r0 == r1 is undefined behavior */ + if (r1 == _RCX_REGNO) { + s1 = jit_get_reg(jit_class_gpr); + t1 = rn(s1); + } + } + /* Allocate a temporary if a register is used more than once, or if + * the value to shift is %RCX */ + if (r0 == r2 || r1 == r2 || r2 == _RCX_REGNO) { + s2 = jit_get_reg(jit_class_gpr); + t2 = rn(s2); + movr(t2, r2); + } + else + t2 = r2; + /* Allocate temporary if shift is also one of the outputs */ + if (r0 == r3 || r1 == r3) { + s3 = jit_get_reg(jit_class_gpr); + t3 = rn(s3); + movr(t3, r3); + } + else + t3 = r3; + /* Bits to shift right */ + movi(t1, 0); + /* Shift in %RCX */ + /* Shift < 0 or > __WORDSIZE is undefined behavior and not tested */ + movr(_RCX_REGNO, t3); + /* Copy value to low register */ + movr(t0, t2); + /* SHLD shifts t0 left pulling extra bits in the right from t1. + * It is very handy to shift bignums, but lightning does not support + * these, nor 128 bit integers. The use of q{l,}sh{r,i} is to verify + * if there is precision loss in a shift and/or have it as a quick way + * to multiply or divide by powers of two. 
*/ + /* SHLD */ + rex(0, WIDE, t1, _NOREG, t0); + ic(0xf); + ic(0xa5); + mrm(0x03, r7(t1), r7(t0)); + /* Must swap results if shift value is __WORDSIZE */ + alui(X86_CMP, t3, __WORDSIZE); + over = jes(_jit->pc.w); + /* Calculate bits to shift right and fill high register */ + rsbi(_RCX_REGNO, _RCX_REGNO, __WORDSIZE); + if (sign) + rshr(t1, t2, _RCX_REGNO); + else + rshr_u(t1, t2, _RCX_REGNO); + /* FIXME t3 == %rcx only happens in 32 bit as %a3 (JIT_A3) is not + * available -- it might be made available at some point, to + * allow optimizing usage or arguments in registers. For now + * keep the code, as one might cheat and use _RCX directly, + * which is not officially supported, but *must* work. */ + /* Need to sign extend high register if shift value is zero */ + /* %rcx was replaced by __WORDSIZE - shift above, so when t3 is %rcx + * itself a zero shift now reads as __WORDSIZE */ + if (t3 == _RCX_REGNO) + alui(X86_CMP, t3, __WORDSIZE); + else + alui(X86_CMP, t3, 0); + /* Finished. */ + zero = jes(_jit->pc.w); + done = jmpsi(_jit->pc.w); + /* Swap registers if shift is __WORDSIZE */ + patch_at(over, _jit->pc.w); + xchgr(t0, t1); + over_done = jmpsi(_jit->pc.w); + /* If shift value is zero */ + patch_at(zero, _jit->pc.w); + if (sign) + rshi(t1, t2, __WORDSIZE - 1); + else + movi(t1, 0); + patch_at(over_done, _jit->pc.w); + patch_at(done, _jit->pc.w); + /* Release %RCX (if spilled) after branches */ + clear(_RCX_REGNO, _RCX); + if (t3 != r3) + jit_unget_reg(s3); + if (t2 != r2) + jit_unget_reg(s2); + if (t1 != r1) { + movr(r1, t1); + jit_unget_reg(s1); + } + if (t0 != r0) { + movr(r0, t0); + jit_unget_reg(s0); + } +} + static void _lshi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) { @@ -1849,6 +2175,152 @@ _lshi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) rotshi(X86_SHL, r0, r1, i0); } +static void +_xlshi(jit_state_t *_jit, jit_bool_t sign, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_word_t i0) +{ + /* Double word left shift by an immediate: the shift count is known + * when generating code, so the zero and __WORDSIZE counts are + * resolved here instead of by emitted branches. */ + if (i0 == 0) { + movr(r0, r2); + if (sign) + rshi(r1, r2, __WORDSIZE - 1); + else + movi(r1, 0); + } + else if (i0 == __WORDSIZE) { + 
movr(r1, r2); + movi(r0, 0); + } + else { + assert((jit_uword_t)i0 <= __WORDSIZE); + /* NOTE(review): r1 is written before the last read of r2, so + * r1 == r2 would shift an already modified value -- presumably + * callers never alias them; confirm */ + if (sign) + rshi(r1, r2, __WORDSIZE - i0); + else + rshi_u(r1, r2, __WORDSIZE - i0); + lshi(r0, r2, i0); + } +} + +static void +_xrshr(jit_state_t *_jit, jit_bool_t sign, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + /* Double word right shift of r2 by the count in r3: r0 receives the + * shifted value (sign bits are pulled in when sign is set) and r1 + * the bits shifted out, moved to the top of the word. */ + jit_int32_t sav, set; + jit_int32_t t0, s0, t1, s1, t2, s2, t3, s3; + jit_word_t over, zero, done; + sav = set = 0; + /* %RCX must be used for shift. */ + qsavset(_RCX_REGNO); + allocr(_RCX_REGNO, _RCX); + /* Almost certainly not %RCX */ + t1 = r1; + if (r0 == _RCX_REGNO) { + s0 = jit_get_reg(jit_class_gpr); + t0 = rn(s0); + } + else { + t0 = r0; + /* r0 == r1 is undefined behavior */ + if (r1 == _RCX_REGNO) { + s1 = jit_get_reg(jit_class_gpr); + t1 = rn(s1); + } + } + /* Allocate a temporary if a register is used more than once, or if + * the value to shift is %RCX */ + if (r0 == r2 || r1 == r2 || r2 == _RCX_REGNO) { + s2 = jit_get_reg(jit_class_gpr); + t2 = rn(s2); + movr(t2, r2); + } + else + t2 = r2; + /* Allocate temporary if shift is also one of the outputs */ + if (r0 == r3 || r1 == r3) { + s3 = jit_get_reg(jit_class_gpr); + t3 = rn(s3); + movr(t3, r3); + } + else + t3 = r3; + /* Bits to shift left */ + if (sign) { + rshi(t1, t2, __WORDSIZE - 1); + /* Special case for negative value and zero shift */ + alui(X86_CMP, t3, 0); + zero = jnes(_jit->pc.w); + movi(t1, 0); + patch_at(zero, _jit->pc.w); + } + else + movi(t1, 0); + /* Shift in %RCX */ + /* Shift < 0 or > __WORDSIZE is undefined behavior and not tested */ + movr(_RCX_REGNO, t3); + /* Copy value to low register */ + movr(t0, t2); + /* SHRD shifts t0 right pulling extra bits in the left from t1 */ + /* SHRD */ + rex(0, WIDE, t1, _NOREG, t0); + ic(0xf); + ic(0xad); + mrm(0x03, r7(t1), r7(t0)); + /* Must swap results if shift value is __WORDSIZE */ + alui(X86_CMP, t3, __WORDSIZE); + over = jes(_jit->pc.w); + /* Already zero if shift value is zero */ + alui(X86_CMP, t3, 0); + zero = 
jes(_jit->pc.w); + /* Calculate bits to shift left and fill high register */ + rsbi(_RCX_REGNO, _RCX_REGNO, __WORDSIZE); + lshr(t1, t2, _RCX_REGNO); + done = jmpsi(_jit->pc.w); + /* Swap registers if shift is __WORDSIZE */ + patch_at(over, _jit->pc.w); + xchgr(t0, t1); + /* If shift value is zero */ + patch_at(zero, _jit->pc.w); + patch_at(done, _jit->pc.w); + /* Release %RCX (if spilled) after branches */ + clear(_RCX_REGNO, _RCX); + if (t3 != r3) + jit_unget_reg(s3); + if (t2 != r2) + jit_unget_reg(s2); + if (t1 != r1) { + movr(r1, t1); + jit_unget_reg(s1); + } + if (t0 != r0) { + movr(r0, t0); + jit_unget_reg(s0); + } +} + +static void +_xrshi(jit_state_t *_jit, jit_bool_t sign, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_word_t i0) +{ + /* Double word right shift by an immediate; the zero and __WORDSIZE + * counts are resolved while generating code. */ + if (i0 == 0) { + movr(r0, r2); + movi(r1, 0); + } + else if (i0 == __WORDSIZE) { + movr(r1, r2); + if (sign) + rshi(r0, r2, __WORDSIZE - 1); + else + movi(r0, 0); + } + else { + assert((jit_uword_t)i0 <= __WORDSIZE); + /* NOTE(review): r1 is written before the last read of r2, so + * r1 == r2 would shift an already modified value -- presumably + * callers never alias them; confirm */ + lshi(r1, r2, __WORDSIZE - i0); + if (sign) + rshi(r0, r2, i0); + else + rshi_u(r0, r2, i0); + } +} + static void _unr(jit_state_t *_jit, jit_int32_t code, jit_int32_t r0) { @@ -1903,6 +2375,260 @@ _decr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) } #endif +static void +_clor(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + /* Leading ones of r1 are counted as the leading zeros of its + * complement */ + comr(r0, r1); + clzr(r0, r0); +} + +static void +_clzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + jit_word_t w, x; + /* LZCNT */ + if (jit_cpu.abm) + ic(0xf3); + /* else BSR */ + rex(0, WIDE, r0, _NOREG, r1); + ic(0x0f); + ic(0xbd); + mrm(0x3, r7(r0), r7(r1)); + if (!jit_cpu.abm) { + /* jump if undefined: r1 == 0 */ + w = jccs(X86_CC_E, _jit->pc.w); + /* count leading zeros */ + rsbi(r0, r0, __WORDSIZE - 1); + /* done */ + x = jmpsi(_jit->pc.w); + /* if r1 == 0 */ + patch_at(w, _jit->pc.w); + movi(r0, __WORDSIZE); + /* not undefined */ + patch_at(x, _jit->pc.w); + } + /* LZCNT has defined behavior for value zero and count leading zeros */ +} + +static void 
+_ctor(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + comr(r0, r1); + ctzr(r0, r0); +} + +static void +_ctzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + jit_word_t w; + jit_int32_t t0; + if (!jit_cpu.abm) { + if (jit_cmov_p()) + t0 = jit_get_reg(jit_class_gpr|jit_class_nospill|jit_class_chk); + else + t0 = _NOREG; + if (t0 != _NOREG) + movi(rn(t0), __WORDSIZE); + } + /* TZCNT */ + if (jit_cpu.abm) + ic(0xf3); + /* else BSF */ + rex(0, WIDE, r0, _NOREG, r1); + ic(0x0f); + ic(0xbc); + mrm(0x3, r7(r0), r7(r1)); + if (!jit_cpu.abm) { + /* No conditional move or need spill/reload a temporary */ + if (t0 == _NOREG) { + w = jccs(X86_CC_E, _jit->pc.w); + movi(r0, __WORDSIZE); + patch_at(w, _jit->pc.w); + } + else { + /* CMOVE */ + rex(0, WIDE, r0, _NOREG, rn(t0)); + ic(0x0f); + ic(0x44); + mrm(0x3, r7(r0), r7(rn(t0))); + jit_unget_reg(t0); + } + } + /* TZCNT has defined behavior for value zero */ +} + +static void +_rbitr(jit_state_t * _jit, jit_int32_t r0, jit_int32_t r1) +{ + jit_word_t loop; + jit_int32_t sav, set; + jit_int32_t r0_reg, t0, r1_reg, t1, t2, t3; + static const unsigned char swap_tab[256] = { + 0, 128, 64, 192, 32, 160, 96, 224, + 16, 144, 80, 208, 48, 176, 112, 240, + 8, 136, 72, 200, 40, 168, 104, 232, + 24, 152, 88, 216 ,56, 184, 120, 248, + 4, 132, 68, 196, 36, 164, 100, 228, + 20, 148, 84, 212, 52, 180, 116, 244, + 12, 140, 76, 204, 44, 172, 108, 236, + 28, 156, 92, 220, 60, 188, 124, 252, + 2, 130, 66, 194, 34, 162, 98, 226, + 18, 146, 82, 210, 50, 178, 114, 242, + 10, 138, 74, 202, 42, 170, 106, 234, + 26, 154, 90, 218, 58, 186, 122, 250, + 6, 134, 70, 198, 38, 166, 102, 230, + 22, 150, 86, 214, 54, 182, 118, 246, + 14, 142, 78, 206, 46, 174, 110, 238, + 30, 158, 94, 222, 62, 190, 126, 254, + 1, 129, 65, 193, 33, 161, 97, 225, + 17, 145, 81, 209, 49, 177, 113, 241, + 9, 137, 73, 201, 41, 169, 105, 233, + 25, 153, 89, 217, 57, 185, 121, 249, + 5, 133, 69, 197, 37, 165, 101, 229, + 21, 149, 85, 213, 53, 181, 117, 245, + 13, 141, 
77, 205, 45, 173, 109, 237, + 29, 157, 93, 221, 61, 189, 125, 253, + 3, 131, 67, 195, 35, 163, 99, 227, + 19, 147, 83, 211, 51, 179, 115, 243, + 11, 139, 75, 203, 43, 171, 107, 235, + 27, 155, 91, 219, 59, 187, 123, 251, + 7, 135, 71, 199, 39, 167, 103, 231, + 23, 151, 87, 215, 55, 183, 119, 247, + 15, 143, 79, 207, 47, 175, 111, 239, + 31, 159, 95, 223, 63, 191, 127, 255 + }; + sav = set = 0; + isavset(_RCX_REGNO); + allocr(_RCX_REGNO, _RCX); + /* %rcx is the loop counter: work on temporaries if it is r0 or r1, + * or if the input would be overwritten because r0 == r1 */ + if (r0 == _RCX_REGNO) { + t0 = jit_get_reg(jit_class_gpr); + r0_reg = rn(t0); + } + else { + t0 = JIT_NOREG; + r0_reg = r0; + } + if (r1 == _RCX_REGNO || r0 == r1) { + t1 = jit_get_reg(jit_class_gpr); + r1_reg = rn(t1); + movr(r1_reg, r1); + } + else { + t1 = JIT_NOREG; + r1_reg = r1; + } + t2 = jit_get_reg(jit_class_gpr); + t3 = jit_get_reg(jit_class_gpr); +#if __WORDSIZE == 32 + /* Avoid condition that causes running out of registers */ + if (!reg8_p(r1_reg)) { + movi(rn(t2), 0xff); + andr(rn(t2), r1_reg, rn(t2)); + } + else +#endif + extr_uc(rn(t2), r1_reg); + /* The result starts as the table entry of the lowest byte of the + * input */ + movi(rn(t3), (jit_word_t)swap_tab); + ldxr_uc(r0_reg, rn(t3), rn(t2)); + /* %rcx holds the number of input bits already consumed; every + * iteration shifts the result left by 8 and ors in the table entry + * of the next input byte, until __WORDSIZE bits were consumed */ + movi(_RCX_REGNO, 8); + loop = _jit->pc.w; + rshr(rn(t2), r1_reg, _RCX_REGNO); + extr_uc(rn(t2), rn(t2)); + lshi(r0_reg, r0_reg, 8); + ldxr_uc(rn(t2), rn(t3), rn(t2)); + orr(r0_reg, r0_reg, rn(t2)); + addi(_RCX_REGNO, _RCX_REGNO, 8); + alui(X86_CMP, _RCX_REGNO, __WORDSIZE); + jls(loop); + clear(_RCX_REGNO, _RCX); + jit_unget_reg(t3); + jit_unget_reg(t2); + if (t1 != JIT_NOREG) + jit_unget_reg(t1); + if (t0 != JIT_NOREG) { + movr(r0, r0_reg); + jit_unget_reg(t0); + } +} + +static void +_popcntr(jit_state_t * _jit, jit_int32_t r0, jit_int32_t r1) +{ + if (jit_cpu.abm) { + ic(0xf3); + rex(0, WIDE, r0, _NOREG, r1); + ic(0x0f); + ic(0xb8); + mrm(0x3, r7(r0), r7(r1)); + } + else { + jit_word_t loop; + jit_int32_t sav, set; + jit_int32_t r0_reg, t0, r1_reg, t1, t2, t3; + static const unsigned char pop_tab[256] = { + 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, + 
1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, + 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, + 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, + 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, + 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, + 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, + 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8 + }; + sav = set = 0; + isavset(_RCX_REGNO); + allocr(_RCX_REGNO, _RCX); + if (r0 == _RCX_REGNO) { + t0 = jit_get_reg(jit_class_gpr); + r0_reg = rn(t0); + } + else { + t0 = JIT_NOREG; + r0_reg = r0; + } + if (r1 == _RCX_REGNO || r0 == r1) { + t1 = jit_get_reg(jit_class_gpr); + r1_reg = rn(t1); + movr(r1_reg, r1); + } + else { + t1 = JIT_NOREG; + r1_reg = r1; + } + t2 = jit_get_reg(jit_class_gpr); + t3 = jit_get_reg(jit_class_gpr); +#if __WORDSIZE == 32 + /* Avoid condition that causes running out of registers */ + if (!reg8_p(r1_reg)) { + movi(rn(t2), 0xff); + andr(rn(t2), r1_reg, rn(t2)); + } + else +#endif + extr_uc(rn(t2), r1_reg); + movi(rn(t3), (jit_word_t)pop_tab); + ldxr_uc(r0_reg, rn(t3), rn(t2)); + movi(_RCX_REGNO, 8); + loop = _jit->pc.w; + rshr(rn(t2), r1_reg, _RCX_REGNO); + extr_uc(rn(t2), rn(t2)); + ldxr_uc(rn(t2), rn(t3), rn(t2)); + addr(r0_reg, r0_reg, rn(t2)); + addi(_RCX_REGNO, _RCX_REGNO, 8); + alui(X86_CMP, _RCX_REGNO, __WORDSIZE); + jls(loop); + clear(_RCX_REGNO, _RCX); + jit_unget_reg(t3); + jit_unget_reg(t2); + if (t1 != JIT_NOREG) + jit_unget_reg(t1); + if (t0 != JIT_NOREG) { + movr(r0, r0_reg); + jit_unget_reg(t0); + } + } +} + static void _cr(jit_state_t *_jit, jit_int32_t code, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) @@ -2152,6 +2878,12 @@ _imovi(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0) ii(i0); # if !__X64_32 } + else if (can_sign_extend_int_p(i0)) { + rex(0, 1, _NOREG, _NOREG, r0); + ic(0xc7); + ic(0xc0 | r7(r0)); + ii(i0); + } else { rex(0, 1, _NOREG, _NOREG, r0); 
ic(0xb8 | r7(r0)); @@ -2164,22 +2896,45 @@ _imovi(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0) #endif } +#if CAN_RIP_ADDRESS +static jit_word_t +#else static void +#endif _movi(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0) { +#if CAN_RIP_ADDRESS + jit_word_t w, rel; + w = _jit->pc.w; + rel = i0 - (w + 8); + rel = rel < 0 ? rel - 8 : rel + 8; + if (can_sign_extend_int_p(rel)) { + /* lea rel(%rip), %r0 */ + rex(0, WIDE, r0, _NOREG, _NOREG); + w = _jit->pc.w; + ic(0x8d); + rx(r0, i0 - (_jit->pc.w + 5), _NOREG, _NOREG, _SCL8); + } + else +#endif if (i0) imovi(r0, i0); else ixorr(r0, r0); +#if CAN_RIP_ADDRESS + return (w); +#endif } static jit_word_t _movi_p(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0) { + jit_word_t w; rex(0, WIDE, _NOREG, _NOREG, r0); + w = _jit->pc.w; ic(0xb8 | r7(r0)); il(i0); - return (_jit->pc.w); + return (w); } static void @@ -2218,6 +2973,66 @@ _movsr_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) mrm(0x03, r7(r0), r7(r1)); } +static void +_casx(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, + jit_int32_t r2, jit_int32_t r3, jit_word_t i0) +{ + jit_int32_t save_rax, restore_rax; + jit_int32_t ascasr_reg, ascasr_use; + if (r0 != _RAX_REGNO) { /* result not in %rax */ + if (r2 != _RAX_REGNO) { /* old value not in %rax */ + save_rax = jit_get_reg(jit_class_gpr); + movr(rn(save_rax), _RAX_REGNO); + restore_rax = 1; + } + else + restore_rax = 0; + } + else + restore_rax = 0; + if (r2 != _RAX_REGNO) + movr(_RAX_REGNO, r2); + if (r1 == _NOREG) { /* using immediate address */ + if (!can_sign_extend_int_p(i0)) { + ascasr_reg = jit_get_reg(jit_class_gpr); + if (ascasr_reg == _RAX) { + ascasr_reg = jit_get_reg(jit_class_gpr); + jit_unget_reg(_RAX); + } + ascasr_use = 1; + movi(rn(ascasr_reg), i0); + } + else + ascasr_use = 0; + } + else + ascasr_use = 0; + ic(0xf0); /* lock */ + if (ascasr_use) + rex(0, WIDE, r3, _NOREG, rn(ascasr_reg)); + else + rex(0, WIDE, r3, _NOREG, r1); + ic(0x0f); + ic(0xb1); + if (r1 != _NOREG) /* casr */ + 
rx(r3, 0, r1, _NOREG, _SCL1); + else { /* casi */ + if (ascasr_use) + rx(r3, 0, rn(ascasr_reg), _NOREG, _SCL1); /* address in reg */ + else + rx(r3, i0, _NOREG, _NOREG, _SCL1); /* address in offset */ + } + cc(X86_CC_E, r0); + if (r0 != _RAX_REGNO) + movr(r0, _RAX_REGNO); + if (restore_rax) { + movr(_RAX_REGNO, rn(save_rax)); + jit_unget_reg(save_rax); + } + if (ascasr_use) + jit_unget_reg(ascasr_reg); +} + static void _movnr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) { @@ -2293,6 +3108,92 @@ _bswapr_ul(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) } #endif +static void +_extr(jit_state_t *_jit, + jit_int32_t r0, jit_int32_t r1, jit_word_t i0, jit_word_t i1) +{ + jit_word_t mask; + assert(i0 >= 0 && i1 >= 1 && i0 + i1 <= __WORDSIZE); + if (i1 == __WORDSIZE) + movr(r0, r1); + else { + if (__WORDSIZE - (i0 + i1)) { + lshi(r0, r1, __WORDSIZE - (i0 + i1)); + rshi(r0, r0, __WORDSIZE - i1); + } + else + rshi(r0, r1, __WORDSIZE - i1); + } +} + +static void +_extr_u(jit_state_t *_jit, + jit_int32_t r0, jit_int32_t r1, jit_word_t i0, jit_word_t i1) +{ + jit_int32_t t0; + jit_word_t mask; + assert(i0 >= 0 && i1 >= 1 && i0 + i1 <= __WORDSIZE); + if (i1 == __WORDSIZE) + movr(r0, r1); + /* Only cheaper in code size or number of instructions if i0 is not zero */ + /* Number of cpu cicles not tested */ + else if (i0 && jit_cpu.bmi2) { + mask = ((ONE << i1) - 1) << i0; + t0 = jit_get_reg(jit_class_gpr); + movi(rn(t0), mask); + /* PEXT */ + vex(r0, _NOREG, rn(t0), 2, WIDE, r1, 0, 2); + ic(0xf5); + mrm(0x03, r7(r0), r7(rn(t0))); + jit_unget_reg(t0); + } + else { + if (i0) + rshi_u(r0, r1, i0); + andi(r0, r0, (ONE << i1) - 1); + } +} + +static void +_depr(jit_state_t *_jit, + jit_int32_t r0, jit_int32_t r1, jit_word_t i0, jit_word_t i1) +{ + jit_word_t mask; + jit_int32_t t0, t1; + assert(i0 >= 0 && i1 >= 1 && i0 + i1 <= __WORDSIZE); + if (i1 == __WORDSIZE) + movr(r0, r1); + /* Only cheaper in code size or number of instructions if i0 is not zero */ + /* 
Number of cpu cicles not tested */ + else if (i0 && jit_cpu.bmi2) { + mask = ((ONE << i1) - 1) << i0; + t0 = jit_get_reg(jit_class_gpr); + t1 = jit_get_reg(jit_class_gpr); + movi(rn(t0), mask); + movr(rn(t1), r0); + /* PDEP */ + vex(r0, _NOREG, rn(t0), 2, WIDE, r1, 0, 3); + ic(0xf5); + mrm(0x03, r7(r0), r7(rn(t0))); + andi(rn(t1), rn(t1), ~mask); + orr(r0, r0, rn(t1)); + jit_unget_reg(t1); + jit_unget_reg(t0); + } + else { + mask = (ONE << i1) - 1; + t0 = jit_get_reg(jit_class_gpr); + andi(rn(t0), r1, mask); + if (i0) { + lshi(rn(t0), rn(t0), i0); + mask <<= i0; + } + andi(r0, r0, ~mask); + orr(r0, r0, rn(t0)); + jit_unget_reg(t0); + } +} + static void _extr_c(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) { @@ -2334,7 +3235,18 @@ static void _ldi_c(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0) { jit_int32_t reg; - if (can_sign_extend_int_p(i0)) { +#if CAN_RIP_ADDRESS + jit_word_t rel = i0 - _jit->pc.w; + rel = rel < 0 ? rel - 8 : rel + 8; + if (can_sign_extend_int_p(rel)) { + rex(0, WIDE, r0, _NOREG, _NOREG); + ic(0x0f); + ic(0xbe); + rx(r0, i0 - (_jit->pc.w + 5), _NOREG, _NOREG, _SCL8); + } + else +#endif + if (address_p(i0)) { rex(0, WIDE, r0, _NOREG, _NOREG); ic(0x0f); ic(0xbe); @@ -2361,7 +3273,18 @@ static void _ldi_uc(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0) { jit_int32_t reg; - if (can_sign_extend_int_p(i0)) { +#if CAN_RIP_ADDRESS + jit_word_t rel = i0 - _jit->pc.w; + rel = rel < 0 ? rel - 8 : rel + 8; + if (can_sign_extend_int_p(rel)) { + rex(0, WIDE, r0, _NOREG, _NOREG); + ic(0x0f); + ic(0xb6); + rx(r0, i0 - (_jit->pc.w + 5), _NOREG, _NOREG, _SCL8); + } + else +#endif + if (address_p(i0)) { rex(0, WIDE, r0, _NOREG, _NOREG); ic(0x0f); ic(0xb6); @@ -2388,7 +3311,18 @@ static void _ldi_s(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0) { jit_int32_t reg; - if (can_sign_extend_int_p(i0)) { +#if CAN_RIP_ADDRESS + jit_word_t rel = i0 - _jit->pc.w; + rel = rel < 0 ? 
rel - 8 : rel + 8; + if (can_sign_extend_int_p(rel)) { + rex(0, WIDE, r0, _NOREG, _NOREG); + ic(0x0f); + ic(0xbf); + rx(r0, i0 - (_jit->pc.w + 5), _NOREG, _NOREG, _SCL8); + } + else +#endif + if (address_p(i0)) { rex(0, WIDE, r0, _NOREG, _NOREG); ic(0x0f); ic(0xbf); @@ -2415,7 +3349,18 @@ static void _ldi_us(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0) { jit_int32_t reg; - if (can_sign_extend_int_p(i0)) { +#if CAN_RIP_ADDRESS + jit_word_t rel = i0 - _jit->pc.w; + rel = rel < 0 ? rel - 8 : rel + 8; + if (can_sign_extend_int_p(rel)) { + rex(0, WIDE, r0, _NOREG, _NOREG); + ic(0x0f); + ic(0xb7); + rx(r0, i0 - (_jit->pc.w + 5), _NOREG, _NOREG, _SCL8); + } + else +#endif + if (address_p(i0)) { rex(0, WIDE, r0, _NOREG, _NOREG); ic(0x0f); ic(0xb7); @@ -2446,7 +3391,17 @@ static void _ldi_i(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0) { jit_int32_t reg; - if (can_sign_extend_int_p(i0)) { +#if CAN_RIP_ADDRESS + jit_word_t rel = i0 - _jit->pc.w; + rel = rel < 0 ? rel - 8 : rel + 8; + if (can_sign_extend_int_p(rel)) { + rex(0, WIDE, r0, _NOREG, _NOREG); + ic(0x63); + rx(r0, i0 - (_jit->pc.w + 5), _NOREG, _NOREG, _SCL8); + } + else +#endif + if (address_p(i0)) { #if __X64 rex(0, WIDE, r0, _NOREG, _NOREG); ic(0x63); @@ -2477,7 +3432,17 @@ static void _ldi_ui(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0) { jit_int32_t reg; - if (can_sign_extend_int_p(i0)) { +# if !__X64_32 + jit_word_t rel = i0 - _jit->pc.w; + rel = rel < 0 ? 
rel - 8 : rel + 8; + if (can_sign_extend_int_p(rel)) { + rex(0, 0, r0, _NOREG, _NOREG); + ic(0x63); + rx(r0, i0 - (_jit->pc.w + 5), _NOREG, _NOREG, _SCL8); + } + else +#endif + if (address_p(i0)) { rex(0, 0, r0, _NOREG, _NOREG); ic(0x63); rx(r0, i0, _NOREG, _NOREG, _SCL1); @@ -2485,7 +3450,11 @@ _ldi_ui(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0) else { reg = jit_get_reg(jit_class_gpr); movi(rn(reg), i0); +# if __X64_32 + ldr_i(r0, rn(reg)); +# else ldr_ui(r0, rn(reg)); +# endif jit_unget_reg(reg); } } @@ -2503,8 +3472,15 @@ static void _ldi_l(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0) { jit_int32_t reg; - if (can_sign_extend_int_p(i0)) { - rex(0, 1, r0, _NOREG, _NOREG); + jit_word_t rel = i0 - _jit->pc.w; + rel = rel < 0 ? rel - 8 : rel + 8; + if (can_sign_extend_int_p(rel)) { + rex(0, WIDE, r0, _NOREG, _NOREG); + ic(0x8b); + rx(r0, i0 - (_jit->pc.w + 5), _NOREG, _NOREG, _SCL8); + } + else if (can_sign_extend_int_p(i0)) { + rex(0, WIDE, r0, _NOREG, _NOREG); ic(0x8b); rx(r0, i0, _NOREG, _NOREG, _SCL1); } @@ -2708,7 +3684,11 @@ _ldxi_ui(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) else { reg = jit_get_reg(jit_class_gpr); movi(rn(reg), i0); +# if __X64_32 + ldxr_i(r0, r1, rn(reg)); +# else ldxr_ui(r0, r1, rn(reg)); +# endif jit_unget_reg(reg); } } @@ -2764,7 +3744,27 @@ static void _sti_c(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0) { jit_int32_t reg; - if (can_sign_extend_int_p(i0)) { +#if CAN_RIP_ADDRESS + jit_word_t rel = i0 - _jit->pc.w; + rel = rel < 0 ? 
rel - 16 : rel + 16; + if (can_sign_extend_int_p(rel)) { + if (reg8_p(r0)) { + rex(0, 0, r0, _NOREG, _NOREG); + ic(0x88); + rx(r0, i0 - (_jit->pc.w + 5), _NOREG, _NOREG, _SCL8); + } + else { + reg = jit_get_reg(jit_class_gpr|jit_class_rg8); + movr(rn(reg), r0); + rex(0, 0, rn(reg), _NOREG, _NOREG); + ic(0x88); + rx(rn(reg), i0 - (_jit->pc.w + 5), _NOREG, _NOREG, _SCL8); + jit_unget_reg(reg); + } + } + else +#endif + if (address_p(i0)) { if (reg8_p(r0)) { rex(0, 0, r0, _NOREG, _NOREG); ic(0x88); @@ -2800,7 +3800,18 @@ static void _sti_s(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0) { jit_int32_t reg; - if (can_sign_extend_int_p(i0)) { +#if CAN_RIP_ADDRESS + jit_word_t rel = i0 - _jit->pc.w; + rel = rel < 0 ? rel - 8 : rel + 8; + if (can_sign_extend_int_p(rel)) { + ic(0x66); + rex(0, 0, r0, _NOREG, _NOREG); + ic(0x89); + rx(r0, i0 - (_jit->pc.w + 5), _NOREG, _NOREG, _SCL8); + } + else +#endif + if (address_p(i0)) { ic(0x66); rex(0, 0, r0, _NOREG, _NOREG); ic(0x89); @@ -2826,7 +3837,17 @@ static void _sti_i(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0) { jit_int32_t reg; - if (can_sign_extend_int_p(i0)) { +#if CAN_RIP_ADDRESS + jit_word_t rel = i0 - _jit->pc.w; + rel = rel < 0 ? rel - 8 : rel + 8; + if (can_sign_extend_int_p(rel)) { + rex(0, 0, r0, _NOREG, _NOREG); + ic(0x89); + rx(r0, i0 - (_jit->pc.w + 5), _NOREG, _NOREG, _SCL8); + } + else +#endif + if (address_p(i0)) { rex(0, 0, r0, _NOREG, _NOREG); ic(0x89); rx(r0, i0, _NOREG, _NOREG, _SCL1); @@ -2852,8 +3873,18 @@ static void _sti_l(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0) { jit_int32_t reg; +#if CAN_RIP_ADDRESS + jit_word_t rel = i0 - _jit->pc.w; + rel = rel < 0 ? 
rel - 8 : rel + 8; + if (can_sign_extend_int_p(rel)) { + rex(0, WIDE, r0, _NOREG, _NOREG); + ic(0x89); + rx(r0, i0 - (_jit->pc.w + 5), _NOREG, _NOREG, _SCL8); + } + else +#endif if (can_sign_extend_int_p(i0)) { - rex(0, 1, r0, _NOREG, _NOREG); + rex(0, WIDE, r0, _NOREG, _NOREG); ic(0x89); rx(r0, i0, _NOREG, _NOREG, _SCL1); } @@ -3014,208 +4045,221 @@ _stxi_l(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) } #endif -static void +static jit_word_t _jccs(jit_state_t *_jit, jit_int32_t code, jit_word_t i0) { + jit_word_t d; jit_word_t w; + w = _jit->pc.w; + d = i0 - (w + 2); ic(0x70 | code); - w = i0 - (_jit->pc.w + 1); - ic(w); + ic(d); + return (w); } -static void +static jit_word_t _jcc(jit_state_t *_jit, jit_int32_t code, jit_word_t i0) { + jit_word_t d; jit_word_t w; + w = _jit->pc.w; ic(0x0f); + d = i0 - (w + 6); ic(0x80 | code); - w = i0 - (_jit->pc.w + 4); - ii(w); + ii(d); + return (w); } -static void +static jit_word_t _jcr(jit_state_t *_jit, jit_int32_t code, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { alur(X86_CMP, r0, r1); - jcc(code, i0); + return (jcc(code, i0)); } -static void +static jit_word_t _jci(jit_state_t *_jit, jit_int32_t code, jit_word_t i0, jit_int32_t r0, jit_word_t i1) { alui(X86_CMP, r0, i1); - jcc(code, i0); + return (jcc(code, i0)); } -static void +static jit_word_t _jci0(jit_state_t *_jit, jit_int32_t code, jit_word_t i0, jit_int32_t r0) { testr(r0, r0); - jcc(code, i0); + return (jcc(code, i0)); } static jit_word_t _bltr(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { - jcr(X86_CC_L, i0, r0, r1); - return (_jit->pc.w); + return (jcr(X86_CC_L, i0, r0, r1)); } static jit_word_t _blti(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) { - if (i1) jci (X86_CC_L, i0, r0, i1); - else jci0(X86_CC_S, i0, r0); - return (_jit->pc.w); + jit_word_t w; + if (i1) w = jci (X86_CC_L, i0, r0, i1); + else w = jci0(X86_CC_S, i0, r0); + return (w); } static jit_word_t _bltr_u(jit_state_t *_jit, 
jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { - jcr(X86_CC_B, i0, r0, r1); - return (_jit->pc.w); + return (jcr(X86_CC_B, i0, r0, r1)); } static jit_word_t _blti_u(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) { - if (i1) jci (X86_CC_B, i0, r0, i1); - else jci0(X86_CC_B, i0, r0); - return (_jit->pc.w); + jit_word_t w; + if (i1) w = jci (X86_CC_B, i0, r0, i1); + else w = jci0(X86_CC_B, i0, r0); + return (w); } static jit_word_t _bler(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { - if (r0 == r1) jmpi(i0); - else jcr (X86_CC_LE, i0, r0, r1); - return (_jit->pc.w); + jit_word_t w; + if (r0 == r1) w = jmpi(i0); + else w = jcr (X86_CC_LE, i0, r0, r1); + return (w); } static jit_word_t _blei(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) { - if (i1) jci (X86_CC_LE, i0, r0, i1); - else jci0(X86_CC_LE, i0, r0); - return (_jit->pc.w); + jit_word_t w; + if (i1) w = jci (X86_CC_LE, i0, r0, i1); + else w = jci0(X86_CC_LE, i0, r0); + return (w); } static jit_word_t _bler_u(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { - if (r0 == r1) jmpi(i0); - else jcr (X86_CC_BE, i0, r0, r1); - return (_jit->pc.w); + jit_word_t w; + if (r0 == r1) w = jmpi(i0); + else w = jcr (X86_CC_BE, i0, r0, r1); + return (w); } static jit_word_t _blei_u(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) { - if (i1) jci (X86_CC_BE, i0, r0, i1); - else jci0(X86_CC_BE, i0, r0); - return (_jit->pc.w); + jit_word_t w; + if (i1) w = jci (X86_CC_BE, i0, r0, i1); + else w = jci0(X86_CC_BE, i0, r0); + return (w); } static jit_word_t _beqr(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { - if (r0 == r1) jmpi(i0); - else jcr (X86_CC_E, i0, r0, r1); - return (_jit->pc.w); + jit_word_t w; + if (r0 == r1) w = jmpi(i0); + else w = jcr (X86_CC_E, i0, r0, r1); + return (w); } static jit_word_t _beqi(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) { - if (i1) jci (X86_CC_E, i0, r0, i1); - 
else jci0(X86_CC_E, i0, r0); - return (_jit->pc.w); + jit_word_t w; + if (i1) w = jci (X86_CC_E, i0, r0, i1); + else w = jci0(X86_CC_E, i0, r0); + return (w); } static jit_word_t _bger(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { - if (r0 == r1) jmpi(i0); - else jcr (X86_CC_GE, i0, r0, r1); - return (_jit->pc.w); + jit_word_t w; + if (r0 == r1) w = jmpi(i0); + else w = jcr (X86_CC_GE, i0, r0, r1); + return (w); } static jit_word_t _bgei(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) { - if (i1) jci (X86_CC_GE, i0, r0, i1); - else jci0(X86_CC_NS, i0, r0); - return (_jit->pc.w); + jit_word_t w; + if (i1) w = jci (X86_CC_GE, i0, r0, i1); + else w = jci0(X86_CC_NS, i0, r0); + return (w); } static jit_word_t _bger_u(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { - if (r0 == r1) jmpi(i0); - else jcr (X86_CC_AE, i0, r0, r1); - return (_jit->pc.w); + jit_word_t w; + if (r0 == r1) w = jmpi(i0); + else w = jcr (X86_CC_AE, i0, r0, r1); + return (w); } static jit_word_t _bgei_u(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) { - if (i1) jci (X86_CC_AE, i0, r0, i1); - else jmpi(i0); - return (_jit->pc.w); + jit_word_t w; + if (i1) w = jci (X86_CC_AE, i0, r0, i1); + else w = jmpi(i0); + return (w); } static jit_word_t _bgtr(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { - jcr(X86_CC_G, i0, r0, r1); - return (_jit->pc.w); + return (jcr(X86_CC_G, i0, r0, r1)); } static jit_word_t _bgti(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) { - jci(X86_CC_G, i0, r0, i1); - return (_jit->pc.w); + return (jci(X86_CC_G, i0, r0, i1)); } static jit_word_t _bgtr_u(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { - jcr(X86_CC_A, i0, r0, r1); - return (_jit->pc.w); + return (jcr(X86_CC_A, i0, r0, r1)); } static jit_word_t _bgti_u(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) { - if (i1) jci (X86_CC_A, i0, r0, i1); - else jci0(X86_CC_NE, i0, 
r0); - return (_jit->pc.w); + jit_word_t w; + if (i1) w = jci (X86_CC_A, i0, r0, i1); + else w = jci0(X86_CC_NE, i0, r0); + return (w); } static jit_word_t _bner(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { - jcr(X86_CC_NE, i0, r0, r1); - return (_jit->pc.w); + return (jcr(X86_CC_NE, i0, r0, r1)); } static jit_word_t _bnei(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) { - if (i1) jci (X86_CC_NE, i0, r0, i1); - else jci0(X86_CC_NE, i0, r0); - return (_jit->pc.w); + jit_word_t w; + if (i1) w = jci (X86_CC_NE, i0, r0, i1); + else w = jci0(X86_CC_NE, i0, r0); + return (w); } static jit_word_t _bmsr(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { testr(r0, r1); - jnz(i0); - return (_jit->pc.w); + return (jnz(i0)); } static jit_word_t @@ -3230,16 +4274,14 @@ _bmsi(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) testr(r0, rn(reg)); jit_unget_reg(reg); } - jnz(i0); - return (_jit->pc.w); + return (jnz(i0)); } static jit_word_t _bmcr(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { testr(r0, r1); - jz(i0); - return (_jit->pc.w); + return (jz(i0)); } static jit_word_t @@ -3254,16 +4296,14 @@ _bmci(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) testr(r0, rn(reg)); jit_unget_reg(reg); } - jz(i0); - return (_jit->pc.w); + return (jz(i0)); } static jit_word_t _boaddr(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { iaddr(r0, r1); - jo(i0); - return (_jit->pc.w); + return (jo(i0)); } static jit_word_t @@ -3272,8 +4312,7 @@ _boaddi(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) jit_int32_t reg; if (can_sign_extend_int_p(i1)) { iaddi(r0, i1); - jo(i0); - return (_jit->pc.w); + return (jo(i0)); } reg = jit_get_reg(jit_class_gpr|jit_class_nospill); movi(rn(reg), i1); @@ -3285,8 +4324,7 @@ static jit_word_t _boaddr_u(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { iaddr(r0, r1); - jc(i0); - return (_jit->pc.w); + 
return (jc(i0)); } static jit_word_t @@ -3295,8 +4333,7 @@ _boaddi_u(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) jit_int32_t reg; if (can_sign_extend_int_p(i1)) { iaddi(r0, i1); - jc(i0); - return (_jit->pc.w); + return (jc(i0)); } reg = jit_get_reg(jit_class_gpr|jit_class_nospill); movi(rn(reg), i1); @@ -3308,8 +4345,7 @@ static jit_word_t _bxaddr(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { iaddr(r0, r1); - jno(i0); - return (_jit->pc.w); + return (jno(i0)); } static jit_word_t @@ -3318,8 +4354,7 @@ _bxaddi(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) jit_int32_t reg; if (can_sign_extend_int_p(i1)) { iaddi(r0, i1); - jno(i0); - return (_jit->pc.w); + return (jno(i0)); } reg = jit_get_reg(jit_class_gpr|jit_class_nospill); movi(rn(reg), i1); @@ -3331,8 +4366,7 @@ static jit_word_t _bxaddr_u(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { iaddr(r0, r1); - jnc(i0); - return (_jit->pc.w); + return (jnc(i0)); } static jit_word_t @@ -3341,8 +4375,7 @@ _bxaddi_u(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) jit_int32_t reg; if (can_sign_extend_int_p(i1)) { iaddi(r0, i1); - jnc(i0); - return (_jit->pc.w); + return (jnc(i0)); } reg = jit_get_reg(jit_class_gpr|jit_class_nospill); movi(rn(reg), i1); @@ -3354,8 +4387,7 @@ static jit_word_t _bosubr(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { isubr(r0, r1); - jo(i0); - return (_jit->pc.w); + return (jo(i0)); } static jit_word_t @@ -3364,8 +4396,7 @@ _bosubi(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) jit_int32_t reg; if (can_sign_extend_int_p(i1)) { isubi(r0, i1); - jo(i0); - return (_jit->pc.w); + return (jo(i0)); } reg = jit_get_reg(jit_class_gpr|jit_class_nospill); movi(rn(reg), i1); @@ -3377,8 +4408,7 @@ static jit_word_t _bosubr_u(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { isubr(r0, r1); - jc(i0); - return (_jit->pc.w); + return (jc(i0)); } static 
jit_word_t @@ -3387,8 +4417,7 @@ _bosubi_u(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) jit_int32_t reg; if (can_sign_extend_int_p(i1)) { isubi(r0, i1); - jc(i0); - return (_jit->pc.w); + return (jc(i0)); } reg = jit_get_reg(jit_class_gpr|jit_class_nospill); movi(rn(reg), i1); @@ -3400,8 +4429,7 @@ static jit_word_t _bxsubr(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { isubr(r0, r1); - jno(i0); - return (_jit->pc.w); + return (jno(i0)); } static jit_word_t @@ -3410,8 +4438,7 @@ _bxsubi(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) jit_int32_t reg; if (can_sign_extend_int_p(i1)) { isubi(r0, i1); - jno(i0); - return (_jit->pc.w); + return (jno(i0)); } reg = jit_get_reg(jit_class_gpr|jit_class_nospill); movi(rn(reg), i1); @@ -3423,8 +4450,7 @@ static jit_word_t _bxsubr_u(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { isubr(r0, r1); - jnc(i0); - return (_jit->pc.w); + return (jnc(i0)); } static jit_word_t @@ -3433,8 +4459,7 @@ _bxsubi_u(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) jit_int32_t reg; if (can_sign_extend_int_p(i1)) { isubi(r0, i1); - jnc(i0); - return (_jit->pc.w); + return (jnc(i0)); } reg = jit_get_reg(jit_class_gpr|jit_class_nospill); movi(rn(reg), i1); @@ -3453,35 +4478,39 @@ _callr(jit_state_t *_jit, jit_int32_t r0) static jit_word_t _calli(jit_state_t *_jit, jit_word_t i0) { - jit_word_t word; jit_word_t w; + jit_word_t d; + jit_word_t l = _jit->pc.w + 5; + d = i0 - l; #if __X64 - w = i0 - (_jit->pc.w + 5); - if ((jit_int32_t)w == w) { + if ( +# if __X64_32 + !((d < 0) ^ (l < 0)) && +# endif + (jit_int32_t)d == d) { #endif + w = _jit->pc.w; ic(0xe8); - w = i0 - (_jit->pc.w + 4); - ii(w); - word = _jit->pc.w; + ii(d); #if __X64 } else - word = calli_p(i0); + w = calli_p(i0); #endif - return (word); + return (w); } #if __X64 static jit_word_t _calli_p(jit_state_t *_jit, jit_word_t i0) { - jit_word_t word; + jit_word_t w; jit_int32_t reg; reg = 
jit_get_reg(jit_class_gpr); - word = movi_p(rn(reg), i0); + w = movi_p(rn(reg), i0); callr(rn(reg)); jit_unget_reg(reg); - return (word); + return (w); } #endif @@ -3496,51 +4525,61 @@ _jmpr(jit_state_t *_jit, jit_int32_t r0) static jit_word_t _jmpi(jit_state_t *_jit, jit_word_t i0) { - jit_word_t word; jit_word_t w; + jit_word_t d; + jit_word_t l = _jit->pc.w + 5; + d = i0 - l; #if __X64 - w = i0 - (_jit->pc.w + 5); - if ((jit_int32_t)w == w) { + if ( +# if __X64_32 + !((d < 0) ^ (l < 0)) && +# endif + (jit_int32_t)d == d) { #endif + w = _jit->pc.w; ic(0xe9); - w = i0 - (_jit->pc.w + 4); - ii(w); - word = _jit->pc.w; + ii(d); #if __X64 } else - word = jmpi_p(i0); + w = jmpi_p(i0); #endif - return (word); + return (w); } #if __X64 static jit_word_t _jmpi_p(jit_state_t *_jit, jit_word_t i0) { - jit_word_t word; + jit_word_t w; jit_int32_t reg; reg = jit_get_reg(jit_class_gpr|jit_class_nospill); - word = movi_p(rn(reg), i0); + w = movi_p(rn(reg), i0); jmpr(rn(reg)); jit_unget_reg(reg); - return (word); + return (w); } #endif -static void +static jit_word_t _jmpsi(jit_state_t *_jit, jit_uint8_t i0) { + jit_word_t w = _jit->pc.w; ic(0xeb); ic(i0); + return (w); } +#undef clear +#undef allocr +#undef savset static void _prolog(jit_state_t *_jit, jit_node_t *node) { - jit_int32_t reg; + jit_int32_t reg, offs; if (_jitc->function->define_frame || _jitc->function->assume_frame) { jit_int32_t frame = -_jitc->function->frame; + jit_check_frame(); assert(_jitc->function->self.aoff >= frame); if (_jitc->function->assume_frame) return; @@ -3553,76 +4592,51 @@ _prolog(jit_state_t *_jit, jit_node_t *node) (_jitc->function->self.alen > 32 ? 
_jitc->function->self.alen : 32) - /* align stack at 16 bytes */ - _jitc->function->self.aoff) + 15) & -16) + - stack_adjust; + _jitc->function->self.aoff) + 15) & -16); #else _jitc->function->stack = (((_jitc->function->self.alen - - _jitc->function->self.aoff) + 15) & -16) + - stack_adjust; + _jitc->function->self.aoff) + 15) & -16); #endif - subi(_RSP_REGNO, _RSP_REGNO, stack_framesize - REAL_WORDSIZE); + + if (_jitc->function->stack) + _jitc->function->need_stack = 1; + + if (!_jitc->function->need_frame && !_jitc->function->need_stack) { + /* check if any callee save register needs to be saved */ + for (reg = 0; reg < _jitc->reglen; ++reg) + if (jit_regset_tstbit(&_jitc->function->regset, reg) && + (_rvs[reg].spec & jit_class_sav)) { + _jitc->function->need_stack = 1; + break; + } + } + + if (_jitc->function->need_frame || _jitc->function->need_stack) + subi(_RSP_REGNO, _RSP_REGNO, jit_framesize()); /* callee save registers */ -#if __X32 - if (jit_regset_tstbit(&_jitc->function->regset, _RDI)) - stxi(12, _RSP_REGNO, _RDI_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _RSI)) - stxi( 8, _RSP_REGNO, _RSI_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _RBX)) - stxi( 4, _RSP_REGNO, _RBX_REGNO); -#else -# if __CYGWIN__ || _WIN32 - if (jit_regset_tstbit(&_jitc->function->regset, _XMM15)) - sse_stxi_d(136, _RSP_REGNO, _XMM15_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _XMM14)) - sse_stxi_d(128, _RSP_REGNO, _XMM14_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _XMM13)) - sse_stxi_d(120, _RSP_REGNO, _XMM13_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _XMM12)) - sse_stxi_d(112, _RSP_REGNO, _XMM12_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _XMM11)) - sse_stxi_d(104, _RSP_REGNO, _XMM11_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _XMM10)) - sse_stxi_d(96, _RSP_REGNO, _XMM10_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _XMM9)) - sse_stxi_d(88, _RSP_REGNO, _XMM9_REGNO); - if 
(jit_regset_tstbit(&_jitc->function->regset, _XMM8)) - sse_stxi_d(80, _RSP_REGNO, _XMM8_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _XMM7)) - sse_stxi_d(72, _RSP_REGNO, _XMM7_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _XMM6)) - sse_stxi_d(64, _RSP_REGNO, _XMM6_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _R15)) - stxi(56, _RSP_REGNO, _R15_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _R14)) - stxi(48, _RSP_REGNO, _R14_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _R13)) - stxi(40, _RSP_REGNO, _R13_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _R12)) - stxi(32, _RSP_REGNO, _R12_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _RSI)) - stxi(24, _RSP_REGNO, _RSI_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _RDI)) - stxi(16, _RSP_REGNO, _RDI_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _RBX)) - stxi( 8, _RSP_REGNO, _RBX_REGNO); -# else - if (jit_regset_tstbit(&_jitc->function->regset, _RBX)) - stxi(40, _RSP_REGNO, _RBX_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _R12)) - stxi(32, _RSP_REGNO, _R12_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _R13)) - stxi(24, _RSP_REGNO, _R13_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _R14)) - stxi(16, _RSP_REGNO, _R14_REGNO); - if (jit_regset_tstbit(&_jitc->function->regset, _R15)) - stxi( 8, _RSP_REGNO, _R15_REGNO); -# endif + for (reg = 0, offs = REAL_WORDSIZE; reg < jit_size(iregs); reg++) { + if (jit_regset_tstbit(&_jitc->function->regset, iregs[reg])) { + stxi(offs, _RSP_REGNO, rn(iregs[reg])); + offs += REAL_WORDSIZE; + } + } +#if __X64 && (__CYGWIN__ || _WIN32) + for (reg = 0; reg < jit_size(fregs); reg++) { + if (jit_regset_tstbit(&_jitc->function->regset, fregs[reg])) { + sse_stxi_d(offs, _RSP_REGNO, rn(fregs[reg])); + offs += sizeof(jit_float64_t); + } + } #endif - stxi(0, _RSP_REGNO, _RBP_REGNO); - movr(_RBP_REGNO, _RSP_REGNO); + + if (_jitc->function->need_frame) { + 
stxi(0, _RSP_REGNO, _RBP_REGNO); + movr(_RBP_REGNO, _RSP_REGNO); + } /* alloca */ - subi(_RSP_REGNO, _RSP_REGNO, _jitc->function->stack); + if (_jitc->function->stack) + subi(_RSP_REGNO, _RSP_REGNO, _jitc->function->stack); if (_jitc->function->allocar) { reg = jit_get_reg(jit_class_gpr); movi(rn(reg), _jitc->function->self.aoff); @@ -3646,8 +4660,7 @@ _prolog(jit_state_t *_jit, jit_node_t *node) /* test %al, %al */ ic(0x84); ic(0xc0); - jes(0); - nofp_code = _jit->pc.w; + nofp_code = jes(0); /* Save fp registers in the save area, if any is a vararg */ /* Note that the full 16 byte xmm is not saved, because @@ -3658,7 +4671,7 @@ _prolog(jit_state_t *_jit, jit_node_t *node) sse_stxi_d(_jitc->function->vaoff + first_fp_offset + reg * va_fp_increment, _RBP_REGNO, rn(_XMM0 - reg)); - patch_rel_char(nofp_code, _jit->pc.w); + patch_at(nofp_code, _jit->pc.w); } } #endif @@ -3667,68 +4680,38 @@ _prolog(jit_state_t *_jit, jit_node_t *node) static void _epilog(jit_state_t *_jit, jit_node_t *node) { + jit_int32_t reg, offs; if (_jitc->function->assume_frame) return; + if (_jitc->function->need_frame) + movr(_RSP_REGNO, _RBP_REGNO); + /* callee save registers */ - movr(_RSP_REGNO, _RBP_REGNO); -#if __X32 - if (jit_regset_tstbit(&_jitc->function->regset, _RDI)) - ldxi(_RDI_REGNO, _RSP_REGNO, 12); - if (jit_regset_tstbit(&_jitc->function->regset, _RSI)) - ldxi(_RSI_REGNO, _RSP_REGNO, 8); - if (jit_regset_tstbit(&_jitc->function->regset, _RBX)) - ldxi(_RBX_REGNO, _RSP_REGNO, 4); -#else -# if __CYGWIN__ || _WIN32 - if (jit_regset_tstbit(&_jitc->function->regset, _XMM15)) - sse_ldxi_d(_XMM15_REGNO, _RSP_REGNO, 136); - if (jit_regset_tstbit(&_jitc->function->regset, _XMM14)) - sse_ldxi_d(_XMM14_REGNO, _RSP_REGNO, 128); - if (jit_regset_tstbit(&_jitc->function->regset, _XMM13)) - sse_ldxi_d(_XMM13_REGNO, _RSP_REGNO, 120); - if (jit_regset_tstbit(&_jitc->function->regset, _XMM12)) - sse_ldxi_d(_XMM12_REGNO, _RSP_REGNO, 112); - if (jit_regset_tstbit(&_jitc->function->regset, _XMM11)) - 
sse_ldxi_d(_XMM11_REGNO, _RSP_REGNO, 104); - if (jit_regset_tstbit(&_jitc->function->regset, _XMM10)) - sse_ldxi_d(_XMM10_REGNO, _RSP_REGNO, 96); - if (jit_regset_tstbit(&_jitc->function->regset, _XMM9)) - sse_ldxi_d(_XMM9_REGNO, _RSP_REGNO, 88); - if (jit_regset_tstbit(&_jitc->function->regset, _XMM8)) - sse_ldxi_d(_XMM8_REGNO, _RSP_REGNO, 80); - if (jit_regset_tstbit(&_jitc->function->regset, _XMM7)) - sse_ldxi_d(_XMM7_REGNO, _RSP_REGNO, 72); - if (jit_regset_tstbit(&_jitc->function->regset, _XMM6)) - sse_ldxi_d(_XMM6_REGNO, _RSP_REGNO, 64); - if (jit_regset_tstbit(&_jitc->function->regset, _R15)) - ldxi(_R15_REGNO, _RSP_REGNO, 56); - if (jit_regset_tstbit(&_jitc->function->regset, _R14)) - ldxi(_R14_REGNO, _RSP_REGNO, 48); - if (jit_regset_tstbit(&_jitc->function->regset, _R13)) - ldxi(_R13_REGNO, _RSP_REGNO, 40); - if (jit_regset_tstbit(&_jitc->function->regset, _R12)) - ldxi(_R12_REGNO, _RSP_REGNO, 32); - if (jit_regset_tstbit(&_jitc->function->regset, _RSI)) - ldxi(_RSI_REGNO, _RSP_REGNO, 24); - if (jit_regset_tstbit(&_jitc->function->regset, _RDI)) - ldxi(_RDI_REGNO, _RSP_REGNO, 16); - if (jit_regset_tstbit(&_jitc->function->regset, _RBX)) - ldxi(_RBX_REGNO, _RSP_REGNO, 8); -# else - if (jit_regset_tstbit(&_jitc->function->regset, _RBX)) - ldxi(_RBX_REGNO, _RSP_REGNO, 40); - if (jit_regset_tstbit(&_jitc->function->regset, _R12)) - ldxi(_R12_REGNO, _RSP_REGNO, 32); - if (jit_regset_tstbit(&_jitc->function->regset, _R13)) - ldxi(_R13_REGNO, _RSP_REGNO, 24); - if (jit_regset_tstbit(&_jitc->function->regset, _R14)) - ldxi(_R14_REGNO, _RSP_REGNO, 16); - if (jit_regset_tstbit(&_jitc->function->regset, _R15)) - ldxi(_R15_REGNO, _RSP_REGNO, 8); -# endif + for (reg = 0, offs = REAL_WORDSIZE; reg < jit_size(iregs); reg++) { + if (jit_regset_tstbit(&_jitc->function->regset, iregs[reg])) { + ldxi(rn(iregs[reg]), _RSP_REGNO, offs); + offs += REAL_WORDSIZE; + } + } +#if __X64 && (__CYGWIN__ || _WIN32) + for (reg = 0; reg < jit_size(fregs); reg++) { + if 
(jit_regset_tstbit(&_jitc->function->regset, fregs[reg])) { + sse_ldxi_d(rn(fregs[reg]), _RSP_REGNO, offs); + offs += sizeof(jit_float64_t); + } + } #endif - ldxi(_RBP_REGNO, _RSP_REGNO, 0); - addi(_RSP_REGNO, _RSP_REGNO, stack_framesize - REAL_WORDSIZE); + + if (_jitc->function->need_frame) { + ldxi(_RBP_REGNO, _RSP_REGNO, 0); + addi(_RSP_REGNO, _RSP_REGNO, jit_framesize()); + } + /* This condition does not happen as much as expected because + * it is not safe to not create a frame pointer if any function + * is called, even jit functions, as those might call external + * functions. */ + else if (_jitc->function->need_stack) + addi(_RSP_REGNO, _RSP_REGNO, jit_framesize()); ic(0xc3); } @@ -3738,7 +4721,7 @@ _vastart(jit_state_t *_jit, jit_int32_t r0) { #if __X32 || __CYGWIN__ || _WIN32 assert(_jitc->function->self.call & jit_call_varargs); - addi(r0, _RBP_REGNO, _jitc->function->self.size); + addi(r0, _RBP_REGNO, jit_selfsize()); #else jit_int32_t reg; @@ -3757,7 +4740,7 @@ _vastart(jit_state_t *_jit, jit_int32_t r0) stxi_i(offsetof(jit_va_list_t, fpoff), r0, rn(reg)); /* Initialize overflow pointer to the first stack argument. */ - addi(rn(reg), _RBP_REGNO, _jitc->function->self.size); + addi(rn(reg), _RBP_REGNO, jit_selfsize()); stxi(offsetof(jit_va_list_t, over), r0, rn(reg)); /* Initialize register save area pointer. */ @@ -3791,8 +4774,7 @@ _vaarg(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) /* Jump over if there are no remaining arguments in the save area. */ icmpi(rn(rg0), va_gp_max_offset); - jaes(0); - ge_code = _jit->pc.w; + ge_code = jaes(0); /* Load the save area pointer in the second temporary. */ ldxi(rn(rg1), r1, offsetof(jit_va_list_t, save)); @@ -3808,11 +4790,10 @@ _vaarg(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) jit_unget_reg(rg1); /* Jump over overflow code. */ - jmpsi(0); - lt_code = _jit->pc.w; + lt_code = jmpsi(0); /* Where to land if argument is in overflow area. 
*/ - patch_rel_char(ge_code, _jit->pc.w); + patch_at(ge_code, _jit->pc.w); /* Load overflow pointer. */ ldxi(rn(rg0), r1, offsetof(jit_va_list_t, over)); @@ -3825,7 +4806,7 @@ _vaarg(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) stxi(offsetof(jit_va_list_t, over), r1, rn(rg0)); /* Where to land if argument is in save area. */ - patch_rel_char(lt_code, _jit->pc.w); + patch_at(lt_code, _jit->pc.w); jit_unget_reg(rg0); #endif @@ -3859,8 +4840,7 @@ _vaarg_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_bool_t x87) /* Jump over if there are no remaining arguments in the save area. */ icmpi(rn(rg0), va_fp_max_offset); - jaes(0); - ge_code = _jit->pc.w; + ge_code = jaes(0); /* Load the save area pointer in the second temporary. */ ldxi(rn(rg1), r1, offsetof(jit_va_list_t, save)); @@ -3879,11 +4859,10 @@ _vaarg_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_bool_t x87) jit_unget_reg(rg1); /* Jump over overflow code. */ - jmpsi(0); - lt_code = _jit->pc.w; + lt_code = jmpsi(0); /* Where to land if argument is in overflow area. */ - patch_rel_char(ge_code, _jit->pc.w); + patch_at(ge_code, _jit->pc.w); /* Load overflow pointer. */ ldxi(rn(rg0), r1, offsetof(jit_va_list_t, over)); @@ -3899,27 +4878,57 @@ _vaarg_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_bool_t x87) stxi(offsetof(jit_va_list_t, over), r1, rn(rg0)); /* Where to land if argument is in save area. */ - patch_rel_char(lt_code, _jit->pc.w); + patch_at(lt_code, _jit->pc.w); jit_unget_reg(rg0); #endif } static void -_patch_at(jit_state_t *_jit, jit_node_t *node, - jit_word_t instr, jit_word_t label) +_patch_at(jit_state_t *_jit, jit_word_t instr, jit_word_t label) { - switch (node->code) { -# if __X64 - case jit_code_calli: - case jit_code_jmpi: -# endif - case jit_code_movi: - patch_abs(instr, label); + jit_word_t disp; + jit_uint8_t *code = (jit_uint8_t *)instr; + ++instr; + switch (code[0]) { + /* movi_p */ + case 0xb8 ... 
0xbf: + *(jit_word_t *)instr = label; break; - default: - patch_rel(instr, label); + /* forward pc relative address known to be in range */ +#if CAN_RIP_ADDRESS + /* movi */ + case 0x8d: + ++instr; + goto apply; +#endif + /* jcc */ + case 0x0f: + ++instr; + if (code[1] < 0x80 || code[1] > 0x8f) + goto fail; + /* calli */ + case 0xe8: + /* jmpi */ + case 0xe9: +#if CAN_RIP_ADDRESS + apply: +#endif + disp = label - (instr + 4); + assert((jit_int32_t)disp == disp); + *(jit_int32_t *)instr = disp; break; + /* jccs */ + case 0x70 ... 0x7f: + /* jmpsi */ + case 0xeb: + disp = label - (instr + 1); + assert((jit_int8_t)disp == disp); + *(jit_int8_t *)instr = disp; + break; + default: + fail: + abort(); } } #endif