X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=deps%2Flightning%2Flib%2Fjit_x86-sse.c;fp=deps%2Flightning%2Flib%2Fjit_x86-sse.c;h=930efedb95b798be2c051fed59114b84ea3dfc3f;hb=ba86ff938a6b17c171dd68ebdf897ca3e30550f8;hp=c3ac895ee15fca4f90d215d6c45c3bde6856866f;hpb=56e500f3428614e677ba5e9719f002046e87d980;p=pcsx_rearmed.git diff --git a/deps/lightning/lib/jit_x86-sse.c b/deps/lightning/lib/jit_x86-sse.c index c3ac895e..930efedb 100644 --- a/deps/lightning/lib/jit_x86-sse.c +++ b/deps/lightning/lib/jit_x86-sse.c @@ -63,7 +63,8 @@ # define sser(c,r0,r1) _sser(_jit,c,r0,r1) static void _sser(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t); # define ssexr(p,c,r0,r1) _ssexr(_jit,p,c,r0,r1) -static void _ssexr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); +static void _ssexr(jit_state_t*, + jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); # define ssexi(c,r0,m,i) _ssexi(_jit,c,r0,m,i) static void _ssexi(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); # define addssr(r0, r1) ssexr(0xf3, X86_SSE_ADD, r0, r1) @@ -93,13 +94,15 @@ static void _ssexi(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t) # define ucomisdr(r0,r1) ssexr(0x66,X86_SSE_UCOMI,r0,r1) # define xorpsr(r0,r1) sser(X86_SSE_XOR,r0,r1) # define xorpdr(r0,r1) ssexr(0x66,X86_SSE_XOR,r0,r1) -# define movdlxr(r0,r1) ssexr(0x66, X86_SSE_X2G,r0,r1) +# define movdxr(r0,r1) ssexr(0x66, X86_SSE_X2G,r0,r1) +# define movdrx(r0,r1) ssexr(0x66, X86_SSE_G2X,r0,r1) +# define movqxr(r0,r1) sselxr(0x66, X86_SSE_X2G,r0,r1) +# define movqrx(r0,r1) sselxr(0x66, X86_SSE_G2X,r0,r1) # define pcmpeqlr(r0, r1) ssexr(0x66, X86_SSE_EQD, r0, r1) # define psrl(r0, i0) ssexi(0x72, r0, 0x02, i0) # define psrq(r0, i0) ssexi(0x73, r0, 0x02, i0) # define psll(r0, i0) ssexi(0x72, r0, 0x06, i0) # define pslq(r0, i0) ssexi(0x73, r0, 0x06, i0) -# define movdqxr(r0,r1) sselxr(0x66,X86_SSE_X2G,r0,r1) # if __X64 && !__X64_32 # define sselxr(p,c,r0,r1) _sselxr(_jit,p,c,r0,r1) static 
void @@ -163,6 +166,30 @@ static void _sse_negr_f(jit_state_t*,jit_int32_t,jit_int32_t); static void _sse_negr_d(jit_state_t*,jit_int32_t,jit_int32_t); # define sse_sqrtr_f(r0, r1) ssexr(0xf3, X86_SSE_SQRT, r0, r1) # define sse_sqrtr_d(r0, r1) ssexr(0xf2, X86_SSE_SQRT, r0, r1) +# define sse_fmar_f(r0, r1, r2, r3) _sse_fmar_f(_jit, r0, r1, r2, r3) +static void _sse_fmar_f(jit_state_t*, + jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); +# define sse_fmar_d(r0, r1, r2, r3) _sse_fmar_d(_jit, r0, r1, r2, r3) +static void _sse_fmar_d(jit_state_t*, + jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); +# define sse_fmsr_f(r0, r1, r2, r3) _sse_fmsr_f(_jit, r0, r1, r2, r3) +static void _sse_fmsr_f(jit_state_t*, + jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); +# define sse_fmsr_d(r0, r1, r2, r3) _sse_fmsr_d(_jit, r0, r1, r2, r3) +static void _sse_fmsr_d(jit_state_t*, + jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); +# define sse_fnmar_f(r0, r1, r2, r3) _sse_fnmar_f(_jit, r0, r1, r2, r3) +static void _sse_fnmar_f(jit_state_t*, + jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); +# define sse_fnmar_d(r0, r1, r2, r3) _sse_fnmar_d(_jit, r0, r1, r2, r3) +static void _sse_fnmar_d(jit_state_t*, + jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); +# define sse_fnmsr_f(r0, r1, r2, r3) _sse_fnmsr_f(_jit, r0, r1, r2, r3) +static void _sse_fnmsr_f(jit_state_t*, + jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); +# define sse_fnmsr_d(r0, r1, r2, r3) _sse_fnmsr_d(_jit, r0, r1, r2, r3) +static void _sse_fnmsr_d(jit_state_t*, + jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); # define ssecmpf(code, r0, r1, r2) _ssecmp(_jit, 0, code, r0, r1, r2) # define ssecmpd(code, r0, r1, r2) _ssecmp(_jit, 1, code, r0, r1, r2) static void @@ -172,6 +199,10 @@ _ssecmp(jit_state_t*, jit_bool_t, jit_int32_t, static void _sse_movr_f(jit_state_t*, jit_int32_t, jit_int32_t); #define sse_movi_f(r0,i0) _sse_movi_f(_jit,r0,i0) static void _sse_movi_f(jit_state_t*, jit_int32_t, jit_float32_t*); +# define 
sse_movr_w_f(r0,r1) movdxr(r0, r1) +# define sse_movr_f_w(r0,r1) movdrx(r1, r0) +#define sse_movi_w_f(r0, i0) _sse_movi_w_f(_jit, r0, i0) +static void _sse_movi_w_f(jit_state_t*, jit_int32_t, jit_word_t); # define sse_lti_f(r0, r1, i0) _sse_lti_f(_jit, r0, r1, i0) static void _sse_lti_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*); # define sse_ltr_f(r0, r1, r2) ssecmpf(X86_CC_A, r0, r1, r2) @@ -227,6 +258,10 @@ static void _sse_ldi_f(jit_state_t*, jit_int32_t, jit_word_t); static void _sse_ldxr_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t); # define sse_ldxi_f(r0, r1, i0) _sse_ldxi_f(_jit, r0, r1, i0) static void _sse_ldxi_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t); +# define sse_unldr_x(r0, r1, i0) _sse_unldr_x(_jit, r0, r1, i0) +static void _sse_unldr_x(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t); +# define sse_unldi_x(r0, i0, i1) _sse_unldi_x(_jit, r0, i0, i1) +static void _sse_unldi_x(jit_state_t*, jit_int32_t, jit_word_t, jit_word_t); # define sse_str_f(r0, r1) movssrm(r1, 0, r0, _NOREG, _SCL1) # define sse_sti_f(i0, r0) _sse_sti_f(_jit, i0, r0) static void _sse_sti_f(jit_state_t*, jit_word_t,jit_int32_t); @@ -234,6 +269,10 @@ static void _sse_sti_f(jit_state_t*, jit_word_t,jit_int32_t); static void _sse_stxr_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t); # define sse_stxi_f(i0, r0, r1) _sse_stxi_f(_jit, i0, r0, r1) static void _sse_stxi_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t); +#define sse_unstr_x(r0, r1, i0) _sse_unstr_x(_jit, r0, r1, i0) +static void _sse_unstr_x(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t); +#define sse_unsti_x(i0, r0, i1) _sse_unsti_x(_jit, i0, r0, i1) +static void _sse_unsti_x(jit_state_t*, jit_word_t, jit_int32_t, jit_word_t); # define sse_bltr_f(i0, r0, r1) _sse_bltr_f(_jit, i0, r0, r1) static jit_word_t _sse_bltr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t); # define sse_blti_f(i0, r0, i1) _sse_blti_f(_jit, i0, r0, i1) @@ -308,6 +347,19 @@ _sse_bunordi_f(jit_state_t*, 
jit_word_t, jit_int32_t, jit_float32_t*); static void _sse_movr_d(jit_state_t*, jit_int32_t, jit_int32_t); #define sse_movi_d(r0,i0) _sse_movi_d(_jit,r0,i0) static void _sse_movi_d(jit_state_t*, jit_int32_t, jit_float64_t*); +# if __X32 || __X64_32 +# define sse_movr_ww_d(r0, r1, r2) _sse_movr_ww_d(_jit, r0, r1, r2) +static void _sse_movr_ww_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t); +# define sse_movr_d_ww(r0, r1, r2) _sse_movr_d_ww(_jit, r0, r1, r2) +static void _sse_movr_d_ww(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t); +# define sse_movi_ww_d(r0, i0, i1) _sse_movi_ww_d(_jit, r0, i0, i1) +static void _sse_movi_ww_d(jit_state_t*, jit_int32_t, jit_word_t, jit_word_t); +# else +# define sse_movr_w_d(r0, r1) movqxr(r0, r1) +# define sse_movr_d_w(r0, r1) movqrx(r1, r0) +# define sse_movi_w_d(r0, i0) _sse_movi_w_d(_jit, r0, i0) +static void _sse_movi_w_d(jit_state_t*, jit_int32_t, jit_word_t); +# endif # define sse_ltr_d(r0, r1, r2) ssecmpd(X86_CC_A, r0, r1, r2) # define sse_lti_d(r0, r1, i0) _sse_lti_d(_jit, r0, r1, i0) static void _sse_lti_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*); @@ -722,12 +774,12 @@ _sse_negr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) imovi(rn(ireg), 0x80000000); if (r0 == r1) { freg = jit_get_reg(jit_class_fpr|jit_class_xpr); - movdlxr(rn(freg), rn(ireg)); + movdxr(rn(freg), rn(ireg)); xorpsr(r0, rn(freg)); jit_unget_reg(freg); } else { - movdlxr(r0, rn(ireg)); + movdxr(r0, rn(ireg)); xorpsr(r0, r1); } jit_unget_reg(ireg); @@ -741,19 +793,333 @@ _sse_negr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) imovi(rn(ireg), 0x80000000); if (r0 == r1) { freg = jit_get_reg(jit_class_fpr|jit_class_xpr); - movdlxr(rn(freg), rn(ireg)); + movdxr(rn(freg), rn(ireg)); pslq(rn(freg), 32); xorpdr(r0, rn(freg)); jit_unget_reg(freg); } else { - movdlxr(r0, rn(ireg)); + movdxr(r0, rn(ireg)); pslq(r0, 32); xorpdr(r0, r1); } jit_unget_reg(ireg); } +/* r1 = (r1 * r3) + r2 */ +#define vfmadd132ss(r1, r2, r3) 
_vfmadd132sx(_jit, 0, r1, r2, r3) +#define vfmadd132sd(r1, r2, r3) _vfmadd132sx(_jit, 1, r1, r2, r3) +static void +_vfmadd132sx(jit_state_t *_jit, jit_bool_t dbl, + jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + /* VFMADD132SD */ + vex(r1, _NOREG, r3, 2, !!dbl, r2, 0, 1); + ic(0x99); + mrm(0x03, r7(r1), r7(r3)); +} + +/* r1 = (r1 * r3) - r2 */ +#define vfmsub132ss(r1, r2, r3) _vfmsub132sx(_jit, 0, r1, r2, r3) +#define vfmsub132sd(r1, r2, r3) _vfmsub132sx(_jit, 1, r1, r2, r3) +static void +_vfmsub132sx(jit_state_t *_jit, jit_bool_t dbl, + jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + /* VFMSUB132SD */ + vex(r1, _NOREG, r3, 2, !!dbl, r2, 0, 1); + ic(0x9b); + mrm(0x03, r7(r1), r7(r3)); +} + +/* r1 = (r1 * r2) + r3 */ +#define vfmadd213ss(r1, r2, r3) _vfmadd213sx(_jit, 0, r1, r2, r3) +#define vfmadd213sd(r1, r2, r3) _vfmadd213sx(_jit, 1, r1, r2, r3) +static void +_vfmadd213sx(jit_state_t *_jit, jit_bool_t dbl, + jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + /* VFMADD213SD */ + vex(r1, _NOREG, r3, 2, !!dbl, r2, 0, 1); + ic(0xa9); + mrm(0x03, r7(r1), r7(r3)); +} + +/* r1 = (r1 * r2) - r3 */ +#define vfmsub213ss(r1, r2, r3) _vfmsub213sx(_jit, 0, r1, r2, r3) +#define vfmsub213sd(r1, r2, r3) _vfmsub213sx(_jit, 1, r1, r2, r3) +static void +_vfmsub213sx(jit_state_t *_jit, jit_bool_t dbl, + jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + /* VFMSUB213SD */ + vex(r1, _NOREG, r3, 2, !!dbl, r2, 0, 1); + ic(0xab); + mrm(0x03, r7(r1), r7(r3)); +} + +/* r1 = (r2 * r3) + r1 */ +#define vfmadd231ss(r1, r2, r3) _vfmadd231sx(_jit, 0, r1, r2, r3) +#define vfmadd231sd(r1, r2, r3) _vfmadd231sx(_jit, 1, r1, r2, r3) +static void +_vfmadd231sx(jit_state_t *_jit, jit_bool_t dbl, + jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + /* VFMADD231SD */ + vex(r1, _NOREG, r3, 2, !!dbl, r2, 0, 1); + ic(0xb9); + mrm(0x03, r7(r1), r7(r3)); +} + +/* r1 = (r2 * r3) - r1 */ +#define vfmsub231ss(r1, r2, r3) _vfmsub231sx(_jit, 0, r1, r2, r3) +#define vfmsub231sd(r1, r2, r3) 
_vfmsub231sx(_jit, 1, r1, r2, r3) +static void +_vfmsub231sx(jit_state_t *_jit, jit_bool_t dbl, + jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + /* VFMSUB231SD */ + vex(r1, _NOREG, r3, 2, !!dbl, r2, 0, 1); + ic(0xbb); + mrm(0x03, r7(r1), r7(r3)); +} + +static void +_sse_fmar_f(jit_state_t *_jit, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + jit_int32_t t0; + if (jit_cpu.fma) { + if (r0 != r2 && r0 != r3) { + sse_movr_f(r0, r1); + vfmadd213ss(r0, r2, r3); + } + else { + t0 = jit_get_reg(jit_class_fpr|jit_class_xpr); + sse_movr_f(rn(t0), r1); + vfmadd213ss(rn(t0), r2, r3); + sse_movr_f(r0, rn(t0)); + jit_unget_reg(t0); + } + } + else { + if (r0 != r3) { + sse_mulr_f(r0, r1, r2); + sse_addr_f(r0, r0, r3); + } + else { + t0 = jit_get_reg(jit_class_fpr|jit_class_xpr); + sse_mulr_f(rn(t0), r1, r2); + sse_addr_f(r0, rn(t0), r3); + jit_unget_reg(t0); + } + } +} + +static void +_sse_fmar_d(jit_state_t *_jit, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + jit_int32_t t0; + if (jit_cpu.fma) { + if (r0 != r2 && r0 != r3) { + sse_movr_d(r0, r1); + vfmadd213sd(r0, r2, r3); + } + else { + t0 = jit_get_reg(jit_class_fpr|jit_class_xpr); + sse_movr_d(rn(t0), r1); + vfmadd213sd(rn(t0), r2, r3); + sse_movr_d(r0, rn(t0)); + jit_unget_reg(t0); + } + } + else { + if (r0 != r3) { + sse_mulr_d(r0, r1, r2); + sse_addr_d(r0, r0, r3); + } + else { + t0 = jit_get_reg(jit_class_fpr|jit_class_xpr); + sse_mulr_d(rn(t0), r1, r2); + sse_addr_d(r0, rn(t0), r3); + jit_unget_reg(t0); + } + } +} + +static void +_sse_fmsr_f(jit_state_t *_jit, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + jit_int32_t t0; + if (jit_cpu.fma) { + if (r0 != r2 && r0 != r3) { + sse_movr_f(r0, r1); + vfmsub213ss(r0, r2, r3); + } + else { + t0 = jit_get_reg(jit_class_fpr|jit_class_xpr); + sse_movr_f(rn(t0), r1); + vfmsub213ss(rn(t0), r2, r3); + sse_movr_f(r0, rn(t0)); + jit_unget_reg(t0); + } + } + else { + if (r0 != r3) { + sse_mulr_f(r0, r1, r2); 
+ sse_subr_f(r0, r0, r3); + } + else { + t0 = jit_get_reg(jit_class_fpr|jit_class_xpr); + sse_mulr_f(rn(t0), r1, r2); + sse_subr_f(r0, rn(t0), r3); + jit_unget_reg(t0); + } + } +} + +static void +_sse_fmsr_d(jit_state_t *_jit, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + jit_int32_t t0; + if (jit_cpu.fma) { + if (r0 != r2 && r0 != r3) { + sse_movr_d(r0, r1); + vfmsub213sd(r0, r2, r3); + } + else { + t0 = jit_get_reg(jit_class_fpr|jit_class_xpr); + sse_movr_d(rn(t0), r1); + vfmsub213sd(rn(t0), r2, r3); + sse_movr_d(r0, rn(t0)); + jit_unget_reg(t0); + } + } + else { + if (r0 != r3) { + sse_mulr_d(r0, r1, r2); + sse_subr_d(r0, r0, r3); + } + else { + t0 = jit_get_reg(jit_class_fpr|jit_class_xpr); + sse_mulr_d(rn(t0), r1, r2); + sse_subr_d(r0, rn(t0), r3); + jit_unget_reg(t0); + } + } +} + +static void +_sse_fnmar_f(jit_state_t *_jit, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + jit_int32_t t0; + if (jit_cpu.fma) { + if (r0 != r2 && r0 != r3) { + sse_negr_f(r0, r1); + vfmsub213ss(r0, r2, r3); + } + else { + t0 = jit_get_reg(jit_class_fpr|jit_class_xpr); + sse_negr_f(rn(t0), r1); + vfmsub213ss(rn(t0), r2, r3); + sse_movr_f(r0, rn(t0)); + jit_unget_reg(t0); + } + } + else { + t0 = jit_get_reg(jit_class_fpr|jit_class_xpr); + sse_negr_f(rn(t0), r1); + sse_mulr_f(rn(t0), rn(t0), r2); + sse_subr_f(r0, rn(t0), r3); + jit_unget_reg(t0); + } +} + +static void +_sse_fnmar_d(jit_state_t *_jit, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + jit_int32_t t0; + if (jit_cpu.fma) { + if (r0 != r2 && r0 != r3) { + sse_negr_d(r0, r1); + vfmsub213sd(r0, r2, r3); + } + else { + t0 = jit_get_reg(jit_class_fpr|jit_class_xpr); + sse_negr_d(rn(t0), r1); + vfmsub213sd(rn(t0), r2, r3); + sse_movr_d(r0, rn(t0)); + jit_unget_reg(t0); + } + } + else { + t0 = jit_get_reg(jit_class_fpr|jit_class_xpr); + sse_negr_d(rn(t0), r1); + sse_mulr_d(rn(t0), rn(t0), r2); + sse_subr_d(r0, rn(t0), r3); + jit_unget_reg(t0); + } +} + 
+static void +_sse_fnmsr_f(jit_state_t *_jit, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + jit_int32_t t0; + if (jit_cpu.fma) { + if (r0 != r2 && r0 != r3) { + sse_negr_f(r0, r1); + vfmadd213ss(r0, r2, r3); + } + else { + t0 = jit_get_reg(jit_class_fpr|jit_class_xpr); + sse_negr_f(rn(t0), r1); + vfmadd213ss(rn(t0), r2, r3); + sse_movr_f(r0, rn(t0)); + jit_unget_reg(t0); + } + } + else { + t0 = jit_get_reg(jit_class_fpr|jit_class_xpr); + sse_negr_f(rn(t0), r1); + sse_mulr_f(rn(t0), rn(t0), r2); + sse_addr_f(r0, rn(t0), r3); + jit_unget_reg(t0); + } +} + +static void +_sse_fnmsr_d(jit_state_t *_jit, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + jit_int32_t t0; + if (jit_cpu.fma) { + if (r0 != r2 && r0 != r3) { + sse_negr_d(r0, r1); + vfmadd213sd(r0, r2, r3); + } + else { + t0 = jit_get_reg(jit_class_fpr|jit_class_xpr); + sse_negr_d(rn(t0), r1); + vfmadd213sd(rn(t0), r2, r3); + sse_movr_d(r0, rn(t0)); + jit_unget_reg(t0); + } + } + else { + t0 = jit_get_reg(jit_class_fpr|jit_class_xpr); + sse_negr_d(rn(t0), r1); + sse_mulr_d(rn(t0), rn(t0), r2); + sse_addr_d(r0, rn(t0), r3); + jit_unget_reg(t0); + } +} + static void _ssecmp(jit_state_t *_jit, jit_bool_t d, jit_int32_t code, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) @@ -817,12 +1183,22 @@ _sse_movi_f(jit_state_t *_jit, jit_int32_t r0, jit_float32_t *i0) else { reg = jit_get_reg(jit_class_gpr); movi(rn(reg), data.i); - movdlxr(r0, rn(reg)); + movdxr(r0, rn(reg)); jit_unget_reg(reg); } } } +static void +_sse_movi_w_f(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0) +{ + jit_int32_t reg; + reg = jit_get_reg(jit_class_gpr); + movi(rn(reg), i0); + movdxr(r0, rn(reg)); + jit_unget_reg(reg); +} + fopi(lt) fopi(le) @@ -975,6 +1351,26 @@ _sse_ldxi_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) } } +static void +_sse_unldr_x(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) +{ + assert(i0 == 4 || i0 == 8); + if (i0 == 4) + 
sse_ldr_f(r0, r1); + else + sse_ldr_d(r0, r1); +} + +static void +_sse_unldi_x(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0, jit_word_t i1) +{ + assert(i1 == 4 || i1 == 8); + if (i1 == 4) + sse_ldi_f(r0, i0); + else + sse_ldi_d(r0, i0); +} + static void _sse_sti_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0) { @@ -1028,6 +1424,26 @@ _sse_stxi_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) } } +static void +_sse_unstr_x(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) +{ + assert(i0 == 4 || i0 == 8); + if (i0 == 4) + sse_str_f(r0, r1); + else + sse_str_d(r0, r1); +} + +static void +_sse_unsti_x(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1) +{ + assert(i1 == 4 || i1 == 8); + if (i1 == 4) + sse_sti_f(i0, r0); + else + sse_sti_d(i0, r0); +} + static jit_word_t _sse_bltr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) { @@ -1313,7 +1729,7 @@ _sse_movi_d(jit_state_t *_jit, jit_int32_t r0, jit_float64_t *i0) reg = jit_get_reg(jit_class_gpr); #if __X64 && !__X64_32 movi(rn(reg), data.w); - movdqxr(r0, rn(reg)); + movqxr(r0, rn(reg)); jit_unget_reg(reg); #else CHECK_CVT_OFFSET(); @@ -1328,6 +1744,52 @@ _sse_movi_d(jit_state_t *_jit, jit_int32_t r0, jit_float64_t *i0) } } +#if __X32 || __X64_32 +static void +_sse_movr_ww_d(jit_state_t *_jit, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) +{ + CHECK_CVT_OFFSET(); + stxi_i(CVT_OFFSET, _RBP_REGNO, r1); + stxi_i(CVT_OFFSET + 4, _RBP_REGNO, r2); + sse_ldxi_d(r0, _RBP_REGNO, CVT_OFFSET); +} + +static void +_sse_movr_d_ww(jit_state_t *_jit, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) +{ + CHECK_CVT_OFFSET(); + sse_stxi_d(CVT_OFFSET, _RBP_REGNO, r2); + ldxi_i(r0, _RBP_REGNO, CVT_OFFSET); + ldxi_i(r1, _RBP_REGNO, CVT_OFFSET + 4); +} + +static void +_sse_movi_ww_d(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0, jit_word_t i1) +{ + jit_int32_t reg; + CHECK_CVT_OFFSET(); + reg = jit_get_reg(jit_class_gpr); + movi(rn(reg), i0); + 
stxi_i(CVT_OFFSET, _RBP_REGNO, rn(reg)); + movi(rn(reg), i1); + stxi_i(CVT_OFFSET + 4, _RBP_REGNO, rn(reg)); + sse_ldxi_d(r0, _RBP_REGNO, CVT_OFFSET); + jit_unget_reg(reg); +} +#else +static void +_sse_movi_w_d(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0) +{ + jit_int32_t reg; + reg = jit_get_reg(jit_class_gpr); + movi(rn(reg), i0); + movqxr(r0, rn(reg)); + jit_unget_reg(reg); +} +#endif + static void _sse_ldi_d(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0) {