git subrepo pull (merge) --force deps/lightning
[pcsx_rearmed.git] / deps / lightning / lib / jit_x86-sse.c
1 /*
2  * Copyright (C) 2012-2023  Free Software Foundation, Inc.
3  *
4  * This file is part of GNU lightning.
5  *
6  * GNU lightning is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU Lesser General Public License as published
8  * by the Free Software Foundation; either version 3, or (at your option)
9  * any later version.
10  *
11  * GNU lightning is distributed in the hope that it will be useful, but
12  * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13  * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
14  * License for more details.
15  *
16  * Authors:
17  *      Paulo Cesar Pereira de Andrade
18  */
19
20 #if PROTO
/* Register numbers for the registers named XMM6 through XMM15.  Each
 * macro's value equals the numeric suffix in its name. */
21 #  define _XMM6_REGNO                   6
22 #  define _XMM7_REGNO                   7
23 #  define _XMM8_REGNO                   8
24 #  define _XMM9_REGNO                   9
25 #  define _XMM10_REGNO                  10
26 #  define _XMM11_REGNO                  11
27 #  define _XMM12_REGNO                  12
28 #  define _XMM13_REGNO                  13
29 #  define _XMM14_REGNO                  14
30 #  define _XMM15_REGNO                  15
/* Byte-sized instruction codes.  The instruction macros in this file pass
 * them as the `c` argument of sser()/ssexr()/sselxr()/ssexrx().
 * NOTE(review): the names suggest these are the SSE opcode bytes; the
 * actual encoding is done by the emitters, which are not defined in this
 * section -- confirm there before changing any value. */
31 #define X86_SSE_MOV                     0x10
32 #define X86_SSE_MOV1                    0x11
33 #define X86_SSE_MOVLP                   0x12
34 #define X86_SSE_MOVHP                   0x16
35 #define X86_SSE_MOVA                    0x28
36 #define X86_SSE_CVTIS                   0x2a
37 #define X86_SSE_CVTTSI                  0x2c
38 #define X86_SSE_CVTSI                   0x2d
39 #define X86_SSE_UCOMI                   0x2e
40 #define X86_SSE_COMI                    0x2f
41 #define X86_SSE_ROUND                   0x3a
42 #define X86_SSE_SQRT                    0x51
43 #define X86_SSE_RSQRT                   0x52
44 #define X86_SSE_RCP                     0x53
45 #define X86_SSE_AND                     0x54
46 #define X86_SSE_ANDN                    0x55
47 #define X86_SSE_OR                      0x56
48 #define X86_SSE_XOR                     0x57
49 #define X86_SSE_ADD                     0x58
50 #define X86_SSE_MUL                     0x59
51 #define X86_SSE_CVTSD                   0x5a
52 #define X86_SSE_CVTDT                   0x5b
53 #define X86_SSE_SUB                     0x5c
54 #define X86_SSE_MIN                     0x5d
55 #define X86_SSE_DIV                     0x5e
56 #define X86_SSE_MAX                     0x5f
57 #define X86_SSE_X2G                     0x6e
58 #define X86_SSE_EQB                     0x74
59 #define X86_SSE_EQW                     0x75
60 #define X86_SSE_EQD                     0x76
61 #define X86_SSE_G2X                     0x7e
62 #define X86_SSE_MOV2                    0xd6
/* Low-level register/register and register/immediate emitters.  Each
 * wrapper macro forwards its arguments to the underscore-prefixed function
 * and supplies `_jit` implicitly, so a variable named `_jit` must be in
 * scope wherever the macro is used. */
/* sser(c, r0, r1): code `c` plus two register numbers, no prefix argument. */
63 #  define sser(c,r0,r1)                 _sser(_jit,c,r0,r1)
64 static void _sser(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
/* ssexr(p, c, r0, r1): like sser() with a leading `p` argument; the macros
 * in this file pass 0xf3, 0xf2 or 0x66 for it.
 * NOTE(review): presumably the instruction prefix byte -- confirm in the
 * function body. */
65 #  define ssexr(p,c,r0,r1)              _ssexr(_jit,p,c,r0,r1)
66 static void _ssexr(jit_state_t*,
67                    jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
/* ssexi(c, r0, m, i): used by the psrl/psrq/psll/pslq macros, which pass
 * 0x72 or 0x73 as `c`, 0x02 or 0x06 as `m`, and the shift count as `i`. */
68 #  define ssexi(c,r0,m,i)               _ssexi(_jit,c,r0,m,i)
69 static void _ssexi(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
/* Instruction-level macros built on sser()/ssexr()/sselxr()/ssexi().
 * Naming pattern visible below: the "ss" forms pass prefix 0xf3, the "sd"
 * forms pass 0xf2, the "ps" forms go through sser() (no prefix argument)
 * and the "pd" forms pass 0x66. */
70 #  define addssr(r0, r1)                ssexr(0xf3, X86_SSE_ADD, r0, r1)
71 #  define addsdr(r0, r1)                ssexr(0xf2, X86_SSE_ADD, r0, r1)
72 #  define subssr(r0, r1)                ssexr(0xf3, X86_SSE_SUB, r0, r1)
73 #  define subsdr(r0, r1)                ssexr(0xf2, X86_SSE_SUB, r0, r1)
74 #  define mulssr(r0, r1)                ssexr(0xf3, X86_SSE_MUL, r0, r1)
75 #  define mulsdr(r0, r1)                ssexr(0xf2, X86_SSE_MUL, r0, r1)
76 #  define divssr(r0, r1)                ssexr(0xf3, X86_SSE_DIV, r0, r1)
77 #  define divsdr(r0, r1)                ssexr(0xf2, X86_SSE_DIV, r0, r1)
78 #  define andpsr(r0, r1)                sser(       X86_SSE_AND, r0, r1)
79 #  define andpdr(r0, r1)                ssexr(0x66, X86_SSE_AND, r0, r1)
/* Truncation (X86_SSE_CVTTSI) and extension (X86_SSE_CVTIS).  The `_l`
 * truncations exist only when __X64 is set; in that case sse_extr_f/_d
 * also go through sselxr() instead of ssexr().  sselxr() is defined
 * further down in this section, which is fine because macros are expanded
 * at their point of use. */
80 #  define sse_truncr_f_i(r0, r1)        ssexr(0xf3, X86_SSE_CVTTSI, r0, r1)
81 #  define sse_truncr_d_i(r0, r1)        ssexr(0xf2, X86_SSE_CVTTSI, r0, r1)
82 #  if __X64
83 #    define sse_truncr_f_l(r0, r1)      sselxr(0xf3, X86_SSE_CVTTSI, r0, r1)
84 #    define sse_truncr_d_l(r0, r1)      sselxr(0xf2, X86_SSE_CVTTSI, r0, r1)
85 #    define sse_extr_f(r0, r1)          sselxr(0xf3, X86_SSE_CVTIS, r0, r1)
86 #    define sse_extr_d(r0, r1)          sselxr(0xf2, X86_SSE_CVTIS, r0, r1)
87 #  else
88 #    define sse_extr_f(r0, r1)          ssexr(0xf3, X86_SSE_CVTIS, r0, r1)
89 #    define sse_extr_d(r0, r1)          ssexr(0xf2, X86_SSE_CVTIS, r0, r1)
90 #  endif
/* Both float<->double conversions use X86_SSE_CVTSD; only the prefix
 * argument differs. */
91 #  define sse_extr_f_d(r0, r1)          ssexr(0xf3, X86_SSE_CVTSD, r0, r1)
92 #  define sse_extr_d_f(r0, r1)          ssexr(0xf2, X86_SSE_CVTSD, r0, r1)
93 #  define ucomissr(r0,r1)               sser(X86_SSE_UCOMI,r0,r1)
94 #  define ucomisdr(r0,r1)               ssexr(0x66,X86_SSE_UCOMI,r0,r1)
95 #  define xorpsr(r0,r1)                 sser(X86_SSE_XOR,r0,r1)
96 #  define xorpdr(r0,r1)                 ssexr(0x66,X86_SSE_XOR,r0,r1)
/* movd*/movq* pairs: the "xr" forms use X86_SSE_X2G and the "rx" forms use
 * X86_SSE_G2X; the movq variants are routed through sselxr(). */
97 #  define movdxr(r0,r1)                 ssexr(0x66, X86_SSE_X2G,r0,r1)
98 #  define movdrx(r0,r1)                 ssexr(0x66, X86_SSE_G2X,r0,r1)
99 #  define movqxr(r0,r1)                 sselxr(0x66, X86_SSE_X2G,r0,r1)
100 #  define movqrx(r0,r1)                 sselxr(0x66, X86_SSE_G2X,r0,r1)
101 #  define pcmpeqlr(r0, r1)              ssexr(0x66, X86_SSE_EQD, r0, r1)
/* Shifts by immediate i0: the "l" forms pass code 0x72 and the "q" forms
 * 0x73; psr* pass m=0x02 and psl* pass m=0x06. */
102 #  define psrl(r0, i0)                  ssexi(0x72, r0, 0x02, i0)
103 #  define psrq(r0, i0)                  ssexi(0x73, r0, 0x02, i0)
104 #  define psll(r0, i0)                  ssexi(0x72, r0, 0x06, i0)
105 #  define pslq(r0, i0)                  ssexi(0x73, r0, 0x06, i0)
/* sselxr(p, c, r0, r1): a dedicated emitter (_sselxr) only when
 * `__X64 && !__X64_32`; in every other configuration it is an alias for
 * ssexr(). */
106 #  if __X64 && !__X64_32
107 #    define sselxr(p,c,r0,r1)           _sselxr(_jit,p,c,r0,r1)
108 static void
109 _sselxr(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t, jit_int32_t);
110 #  else
111 #    define sselxr(p,c,r0,r1)           ssexr(p,c,r0,r1)
112 #  endif
/* ssexrx(p, c, md, rb, ri, ms, rd): emitter taking a prefix argument, a
 * code, four memory-operand arguments (md, rb, ri, ms) and a register rd.
 * NOTE(review): md/rb/ri/ms presumably mean displacement, base, index and
 * scale -- confirm in the function body. */
113 #  define ssexrx(p,c,md,rb,ri,ms,rd)    _ssexrx(_jit,p,c,md,rb,ri,ms,rd)
/* The "mr" forms use X86_SSE_MOV and are what sse_ldr_f/sse_ldr_d expand
 * to; the "rm" forms use X86_SSE_MOV1 and are what sse_str_f/sse_str_d
 * expand to.  The "rm" macros take the register first (rs) and move it to
 * the last position when calling ssexrx(). */
114 #  define movssmr(md,rb,ri,ms,rd)       ssexrx(0xf3,X86_SSE_MOV,md,rb,ri,ms,rd)
115 #  define movsdmr(md,rb,ri,ms,rd)       ssexrx(0xf2,X86_SSE_MOV,md,rb,ri,ms,rd)
116 #  define movssrm(rs,md,mb,mi,ms)       ssexrx(0xf3,X86_SSE_MOV1,md,mb,mi,ms,rs)
117 #  define movsdrm(rs,md,mb,mi,ms)       ssexrx(0xf2,X86_SSE_MOV1,md,mb,mi,ms,rs)
118 static void
119 _ssexrx(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t,
120         jit_int32_t, jit_int32_t, jit_int32_t, jit_int32_t);
/* Arithmetic operations.  Each wrapper macro supplies `_jit` and forwards
 * to the underscore-prefixed function.  Suffix `_f` pairs with
 * jit_float32_t and `_d` with jit_float64_t.  The "r" forms take three
 * register numbers; the "i" forms take two register numbers and a pointer
 * to the immediate value.
 * NOTE(review): r0 is presumably the destination register -- confirm in
 * the CODE section. */
121 #  define sse_addr_f(r0, r1, r2)        _sse_addr_f(_jit, r0, r1, r2)
122 static void _sse_addr_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
123 #  define sse_addi_f(r0, r1, i0)        _sse_addi_f(_jit, r0, r1, i0)
124 static void _sse_addi_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
125 #  define sse_addr_d(r0, r1, r2)        _sse_addr_d(_jit, r0, r1, r2)
126 static void _sse_addr_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
127 #  define sse_addi_d(r0, r1, i0)        _sse_addi_d(_jit, r0, r1, i0)
128 static void _sse_addi_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
129 #  define sse_subr_f(r0, r1, r2)        _sse_subr_f(_jit, r0, r1, r2)
130 static void _sse_subr_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
131 #  define sse_subi_f(r0, r1, i0)        _sse_subi_f(_jit, r0, r1, i0)
132 static void _sse_subi_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
133 #  define sse_subr_d(r0, r1, r2)        _sse_subr_d(_jit, r0, r1, r2)
134 static void _sse_subr_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
135 #  define sse_subi_d(r0, r1, i0)        _sse_subi_d(_jit, r0, r1, i0)
136 static void _sse_subi_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
/* Reverse subtract: the register forms are sse_subr_* with the two source
 * operands swapped; the immediate forms have their own functions. */
137 #  define sse_rsbr_f(r0, r1, r2)        sse_subr_f(r0, r2, r1)
138 #  define sse_rsbi_f(r0, r1, i0)        _sse_rsbi_f(_jit, r0, r1, i0)
139 static void _sse_rsbi_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
140 #  define sse_rsbr_d(r0, r1, r2)        sse_subr_d(r0, r2, r1)
141 #  define sse_rsbi_d(r0, r1, i0)        _sse_rsbi_d(_jit, r0, r1, i0)
142 static void _sse_rsbi_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
143 #  define sse_mulr_f(r0, r1, r2)        _sse_mulr_f(_jit, r0, r1, r2)
144 static void _sse_mulr_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
145 #  define sse_muli_f(r0, r1, i0)        _sse_muli_f(_jit, r0, r1, i0)
146 static void _sse_muli_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
147 #  define sse_mulr_d(r0, r1, r2)        _sse_mulr_d(_jit, r0, r1, r2)
148 static void _sse_mulr_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
149 #  define sse_muli_d(r0, r1, i0)        _sse_muli_d(_jit, r0, r1, i0)
150 static void _sse_muli_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
151 #  define sse_divr_f(r0, r1, r2)        _sse_divr_f(_jit, r0, r1, r2)
152 static void _sse_divr_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
153 #  define sse_divi_f(r0, r1, i0)        _sse_divi_f(_jit, r0, r1, i0)
154 static void _sse_divi_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
155 #  define sse_divr_d(r0, r1, r2)        _sse_divr_d(_jit, r0, r1, r2)
156 static void _sse_divr_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
157 #  define sse_divi_d(r0, r1, i0)        _sse_divi_d(_jit, r0, r1, i0)
158 static void _sse_divi_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
/* Unary operations: two register numbers each. */
159 #  define sse_absr_f(r0, r1)            _sse_absr_f(_jit, r0, r1)
160 static void _sse_absr_f(jit_state_t*,jit_int32_t,jit_int32_t);
161 #  define sse_absr_d(r0, r1)            _sse_absr_d(_jit, r0, r1)
162 static void _sse_absr_d(jit_state_t*,jit_int32_t,jit_int32_t);
163 #  define sse_negr_f(r0, r1)            _sse_negr_f(_jit, r0, r1)
164 static void _sse_negr_f(jit_state_t*,jit_int32_t,jit_int32_t);
165 #  define sse_negr_d(r0, r1)            _sse_negr_d(_jit, r0, r1)
166 static void _sse_negr_d(jit_state_t*,jit_int32_t,jit_int32_t);
/* Square root has no helper function: it expands directly to ssexr() with
 * X86_SSE_SQRT. */
167 #  define sse_sqrtr_f(r0, r1)           ssexr(0xf3, X86_SSE_SQRT, r0, r1)
168 #  define sse_sqrtr_d(r0, r1)           ssexr(0xf2, X86_SSE_SQRT, r0, r1)
/* Four-register operations fmar/fmsr/fnmar/fnmsr.
 * NOTE(review): presumably fused multiply-add variants; the exact formula
 * each one computes is not visible in this section -- confirm in the CODE
 * section. */
169 #  define sse_fmar_f(r0, r1, r2, r3)    _sse_fmar_f(_jit, r0, r1, r2, r3)
170 static void _sse_fmar_f(jit_state_t*,
171                         jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
172 #  define sse_fmar_d(r0, r1, r2, r3)    _sse_fmar_d(_jit, r0, r1, r2, r3)
173 static void _sse_fmar_d(jit_state_t*,
174                         jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
175 #  define sse_fmsr_f(r0, r1, r2, r3)    _sse_fmsr_f(_jit, r0, r1, r2, r3)
176 static void _sse_fmsr_f(jit_state_t*,
177                         jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
178 #  define sse_fmsr_d(r0, r1, r2, r3)    _sse_fmsr_d(_jit, r0, r1, r2, r3)
179 static void _sse_fmsr_d(jit_state_t*,
180                         jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
181 #  define sse_fnmar_f(r0, r1, r2, r3)   _sse_fnmar_f(_jit, r0, r1, r2, r3)
182 static void _sse_fnmar_f(jit_state_t*,
183                          jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
184 #  define sse_fnmar_d(r0, r1, r2, r3)   _sse_fnmar_d(_jit, r0, r1, r2, r3)
185 static void _sse_fnmar_d(jit_state_t*,
186                          jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
187 #  define sse_fnmsr_f(r0, r1, r2, r3)   _sse_fnmsr_f(_jit, r0, r1, r2, r3)
188 static void _sse_fnmsr_f(jit_state_t*,
189                          jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
190 #  define sse_fnmsr_d(r0, r1, r2, r3)   _sse_fnmsr_d(_jit, r0, r1, r2, r3)
191 static void _sse_fnmsr_d(jit_state_t*,
192                          jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
/* Shared comparison emitter.  ssecmpf() and ssecmpd() differ only in the
 * jit_bool_t flag handed to _ssecmp(): 0 for the float form, 1 for the
 * double form.  `code` is one of the X86_CC_* values, which are defined
 * outside this section. */
193 #  define ssecmpf(code, r0, r1, r2)     _ssecmp(_jit, 0, code, r0, r1, r2)
194 #  define ssecmpd(code, r0, r1, r2)     _ssecmp(_jit, 1, code, r0, r1, r2)
195 static void
196 _ssecmp(jit_state_t*, jit_bool_t, jit_int32_t,
197         jit_int32_t, jit_int32_t, jit_int32_t);
/* Float moves: register to register, and from an immediate passed by
 * pointer. */
198 #define sse_movr_f(r0,r1)               _sse_movr_f(_jit,r0,r1)
199 static void _sse_movr_f(jit_state_t*, jit_int32_t, jit_int32_t);
200 #define sse_movi_f(r0,i0)               _sse_movi_f(_jit,r0,i0)
201 static void _sse_movi_f(jit_state_t*, jit_int32_t, jit_float32_t*);
/* Word <-> float register moves map to movdxr/movdrx.  Note that
 * sse_movr_f_w swaps its operands before calling movdrx. */
202 #  define sse_movr_w_f(r0,r1)           movdxr(r0, r1)
203 #  define sse_movr_f_w(r0,r1)           movdrx(r1, r0)
204 #define sse_movi_w_f(r0, i0)            _sse_movi_w_f(_jit, r0, i0)
205 static void _sse_movi_w_f(jit_state_t*, jit_int32_t, jit_word_t);
/* Single-precision comparisons.  The "i" forms take a pointer to a
 * jit_float32_t immediate and each has its own function.  The "r" forms
 * either expand to ssecmpf() with an X86_CC_* condition code (defined
 * outside this section) or, for eq/ne/unle/uneq/unge/ltgt, call a
 * dedicated function.
 *
 * Operand order in the ssecmpf() expansions is significant: lt/gt share
 * X86_CC_A, le/ge share X86_CC_AE and unlt/ungt share X86_CC_NAE, and the
 * two members of each pair differ only in whether (r1, r2) or (r2, r1) is
 * passed.  Do not "normalize" the order. */
206 #  define sse_lti_f(r0, r1, i0)         _sse_lti_f(_jit, r0, r1, i0)
207 static void _sse_lti_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
208 #  define sse_ltr_f(r0, r1, r2)         ssecmpf(X86_CC_A, r0, r1, r2)
209 #  define sse_lei_f(r0, r1, i0)         _sse_lei_f(_jit, r0, r1, i0)
210 static void _sse_lei_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
211 #  define sse_ler_f(r0, r1, r2)         ssecmpf(X86_CC_AE, r0, r1, r2)
212 #  define sse_eqi_f(r0, r1, i0)         _sse_eqi_f(_jit, r0, r1, i0)
213 static void _sse_eqi_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
214 #  define sse_eqr_f(r0, r1, r2)         _sse_eqr_f(_jit, r0, r1, r2)
215 static void _sse_eqr_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
216 #  define sse_gei_f(r0, r1, i0)         _sse_gei_f(_jit, r0, r1, i0)
217 static void _sse_gei_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
218 #  define sse_ger_f(r0, r1, r2)         ssecmpf(X86_CC_AE, r0, r2, r1)
219 #  define sse_gti_f(r0, r1, i0)         _sse_gti_f(_jit, r0, r1, i0)
220 static void _sse_gti_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
221 #  define sse_gtr_f(r0, r1, r2)         ssecmpf(X86_CC_A, r0, r2, r1)
222 #  define sse_nei_f(r0, r1, i0)         _sse_nei_f(_jit, r0, r1, i0)
223 static void _sse_nei_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
224 #  define sse_ner_f(r0, r1, r2)         _sse_ner_f(_jit, r0, r1, r2)
225 static void _sse_ner_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
226 #  define sse_unlti_f(r0, r1, i0)       _sse_unlti_f(_jit, r0, r1, i0)
227 static void _sse_unlti_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
228 #  define sse_unltr_f(r0, r1, r2)       ssecmpf(X86_CC_NAE, r0, r2, r1)
229 #  define sse_unlei_f(r0, r1, i0)       _sse_unlei_f(_jit, r0, r1, i0)
230 static void _sse_unlei_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
/* The prototype for _sse_unler_f is not adjacent to this macro; it follows
 * the sse_uneqi_f declarations below. */
231 #  define sse_unler_f(r0, r1, r2)       _sse_unler_f(_jit, r0, r1, r2)
232 #  define sse_uneqi_f(r0, r1, i0)       _sse_uneqi_f(_jit, r0, r1, i0)
233 static void _sse_uneqi_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
234 static void _sse_unler_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
235 #  define sse_uneqr_f(r0, r1, r2)       _sse_uneqr_f(_jit, r0, r1, r2)
236 static void _sse_uneqr_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
237 #  define sse_ungei_f(r0, r1, i0)       _sse_ungei_f(_jit, r0, r1, i0)
238 static void _sse_ungei_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
239 #  define sse_unger_f(r0, r1, r2)       _sse_unger_f(_jit, r0, r1, r2)
240 static void _sse_unger_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
241 #  define sse_ungti_f(r0, r1, i0)       _sse_ungti_f(_jit, r0, r1, i0)
242 static void _sse_ungti_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
243 #  define sse_ungtr_f(r0, r1, r2)       ssecmpf(X86_CC_NAE, r0, r1, r2)
244 #  define sse_ltgti_f(r0, r1, i0)       _sse_ltgti_f(_jit, r0, r1, i0)
245 static void _sse_ltgti_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
246 #  define sse_ltgtr_f(r0, r1, r2)       _sse_ltgtr_f(_jit, r0, r1, r2)
247 static void _sse_ltgtr_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
/* Ordered/unordered tests use X86_CC_NP and X86_CC_P respectively, both
 * with (r2, r1) operand order. */
248 #  define sse_ordi_f(r0, r1, i0)        _sse_ordi_f(_jit, r0, r1, i0)
249 static void _sse_ordi_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
250 #  define sse_ordr_f(r0, r1, r2)        ssecmpf(X86_CC_NP, r0, r2, r1)
251 #  define sse_unordi_f(r0, r1, i0)      _sse_unordi_f(_jit, r0, r1, i0)
252 static void _sse_unordi_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
253 #  define sse_unordr_f(r0, r1, r2)      ssecmpf(X86_CC_P, r0, r2, r1)
/* Single-precision loads.  sse_ldr_f expands directly to movssmr() with
 * md=0, rb=r1, ri=_NOREG, ms=_SCL1 and rd=r0; the other forms call helper
 * functions.  _NOREG and _SCL1 are defined outside this section. */
254 #  define sse_ldr_f(r0, r1)             movssmr(0, r1, _NOREG, _SCL1, r0)
255 #  define sse_ldi_f(r0, i0)             _sse_ldi_f(_jit, r0, i0)
256 static void _sse_ldi_f(jit_state_t*, jit_int32_t, jit_word_t);
257 #  define sse_ldxr_f(r0, r1, r2)        _sse_ldxr_f(_jit, r0, r1, r2)
258 static void _sse_ldxr_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
259 #  define sse_ldxi_f(r0, r1, i0)        _sse_ldxi_f(_jit, r0, r1, i0)
260 static void _sse_ldxi_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t);
/* NOTE(review): the `unld*_x` / `unst*_x` helpers take an extra jit_word_t
 * argument; presumably these are unaligned accesses with a size operand --
 * confirm in the CODE section. */
261 #  define sse_unldr_x(r0, r1, i0)       _sse_unldr_x(_jit, r0, r1, i0)
262 static void _sse_unldr_x(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t);
263 #  define sse_unldi_x(r0, i0, i1)       _sse_unldi_x(_jit, r0, i0, i1)
264 static void _sse_unldi_x(jit_state_t*, jit_int32_t, jit_word_t, jit_word_t);
/* Single-precision stores.  sse_str_f expands to movssrm() with the value
 * register r1 first, then md=0, base r0, _NOREG and _SCL1. */
265 #  define sse_str_f(r0, r1)             movssrm(r1, 0, r0, _NOREG, _SCL1)
266 #  define sse_sti_f(i0, r0)             _sse_sti_f(_jit, i0, r0)
267 static void _sse_sti_f(jit_state_t*, jit_word_t,jit_int32_t);
268 #  define sse_stxr_f(r0, r1, r2)        _sse_stxr_f(_jit, r0, r1, r2)
269 static void _sse_stxr_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
270 #  define sse_stxi_f(i0, r0, r1)        _sse_stxi_f(_jit, i0, r0, r1)
271 static void _sse_stxi_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
272 #define sse_unstr_x(r0, r1, i0)         _sse_unstr_x(_jit, r0, r1, i0)
273 static void _sse_unstr_x(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t);
274 #define sse_unsti_x(i0, r0, i1)         _sse_unsti_x(_jit, i0, r0, i1)
275 static void _sse_unsti_x(jit_state_t*, jit_word_t, jit_int32_t, jit_word_t);
/* Single-precision compare-and-branch helpers.  Every function returns a
 * jit_word_t and takes a jit_word_t (i0) followed by a register number;
 * the "r" forms then take a second register number and the "i" forms a
 * pointer to a jit_float32_t immediate.
 * NOTE(review): i0 is presumably the branch target and the return value
 * the address of the emitted jump (for later patching) -- confirm in the
 * CODE section. */
276 #  define sse_bltr_f(i0, r0, r1)        _sse_bltr_f(_jit, i0, r0, r1)
277 static jit_word_t _sse_bltr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
278 #  define sse_blti_f(i0, r0, i1)        _sse_blti_f(_jit, i0, r0, i1)
279 static jit_word_t
280 _sse_blti_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
281 #  define sse_bler_f(i0, r0, r1)        _sse_bler_f(_jit, i0, r0, r1)
282 static jit_word_t _sse_bler_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
283 #  define sse_blei_f(i0, r0, i1)        _sse_blei_f(_jit, i0, r0, i1)
284 static jit_word_t
285 _sse_blei_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
286 #  define sse_beqr_f(i0, r0, r1)        _sse_beqr_f(_jit, i0, r0, r1)
287 static jit_word_t _sse_beqr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
288 #  define sse_beqi_f(i0, r0, i1)        _sse_beqi_f(_jit, i0, r0, i1)
289 static jit_word_t
290 _sse_beqi_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
291 #  define sse_bger_f(i0, r0, r1)        _sse_bger_f(_jit, i0, r0, r1)
292 static jit_word_t _sse_bger_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
293 #  define sse_bgei_f(i0, r0, i1)        _sse_bgei_f(_jit, i0, r0, i1)
294 static jit_word_t
295 _sse_bgei_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
296 #  define sse_bgtr_f(i0, r0, r1)        _sse_bgtr_f(_jit, i0, r0, r1)
297 static jit_word_t _sse_bgtr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
298 #  define sse_bgti_f(i0, r0, i1)        _sse_bgti_f(_jit, i0, r0, i1)
299 static jit_word_t
300 _sse_bgti_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
301 #  define sse_bner_f(i0, r0, r1)        _sse_bner_f(_jit, i0, r0, r1)
302 static jit_word_t _sse_bner_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
303 #  define sse_bnei_f(i0, r0, i1)        _sse_bnei_f(_jit, i0, r0, i1)
304 static jit_word_t
305 _sse_bnei_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
/* "un"-prefixed conditions, followed by ltgt, ord and unord. */
306 #  define sse_bunltr_f(i0, r0, r1)      _sse_bunltr_f(_jit, i0, r0, r1)
307 static jit_word_t _sse_bunltr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
308 #  define sse_bunlti_f(i0, r0, i1)      _sse_bunlti_f(_jit, i0, r0, i1)
309 static jit_word_t
310 _sse_bunlti_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
311 #  define sse_bunler_f(i0, r0, r1)      _sse_bunler_f(_jit, i0, r0, r1)
312 static jit_word_t _sse_bunler_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
313 #  define sse_bunlei_f(i0, r0, i1)      _sse_bunlei_f(_jit, i0, r0, i1)
314 static jit_word_t
315 _sse_bunlei_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
316 #  define sse_buneqr_f(i0, r0, r1)      _sse_buneqr_f(_jit, i0, r0, r1)
317 static jit_word_t _sse_buneqr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
318 #  define sse_buneqi_f(i0, r0, i1)      _sse_buneqi_f(_jit, i0, r0, i1)
319 static jit_word_t
320 _sse_buneqi_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
321 #  define sse_bunger_f(i0, r0, r1)      _sse_bunger_f(_jit, i0, r0, r1)
322 static jit_word_t _sse_bunger_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
323 #  define sse_bungei_f(i0, r0, i1)      _sse_bungei_f(_jit, i0, r0, i1)
324 static jit_word_t
325 _sse_bungei_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
326 #  define sse_bungtr_f(i0, r0, r1)      _sse_bungtr_f(_jit, i0, r0, r1)
327 static jit_word_t _sse_bungtr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
328 #  define sse_bungti_f(i0, r0, i1)      _sse_bungti_f(_jit, i0, r0, i1)
329 static jit_word_t
330 _sse_bungti_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
331 #  define sse_bltgtr_f(i0, r0, r1)      _sse_bltgtr_f(_jit, i0, r0, r1)
332 static jit_word_t _sse_bltgtr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
333 #  define sse_bltgti_f(i0, r0, i1)      _sse_bltgti_f(_jit, i0, r0, i1)
334 static jit_word_t
335 _sse_bltgti_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
336 #  define sse_bordr_f(i0, r0, r1)       _sse_bordr_f(_jit, i0, r0, r1)
337 static jit_word_t _sse_bordr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
338 #  define sse_bordi_f(i0, r0, i1)       _sse_bordi_f(_jit, i0, r0, i1)
339 static jit_word_t
340 _sse_bordi_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
341 #  define sse_bunordr_f(i0, r0, r1)     _sse_bunordr_f(_jit, i0, r0, r1)
342 static jit_word_t _sse_bunordr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
343 #  define sse_bunordi_f(i0, r0, i1)     _sse_bunordi_f(_jit, i0, r0, i1)
344 static jit_word_t
345 _sse_bunordi_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
/* Double-precision moves: register to register, and from an immediate
 * passed by pointer. */
346 #define sse_movr_d(r0,r1)               _sse_movr_d(_jit,r0,r1)
347 static void _sse_movr_d(jit_state_t*, jit_int32_t, jit_int32_t);
348 #define sse_movi_d(r0,i0)               _sse_movi_d(_jit,r0,i0)
349 static void _sse_movi_d(jit_state_t*, jit_int32_t, jit_float64_t*);
/* Integer <-> double moves.  When __X32 or __X64_32 is set, the "ww"
 * helpers take two word-sized operands per double.  Otherwise a single
 * word is used: sse_movr_w_d maps to movqxr, and sse_movr_d_w maps to
 * movqrx with its operands swapped. */
350 #  if __X32 || __X64_32
351 #    define sse_movr_ww_d(r0, r1, r2)   _sse_movr_ww_d(_jit, r0, r1, r2)
352 static void _sse_movr_ww_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
353 #    define sse_movr_d_ww(r0, r1, r2)   _sse_movr_d_ww(_jit, r0, r1, r2)
354 static void _sse_movr_d_ww(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
355 #    define sse_movi_ww_d(r0, i0, i1)   _sse_movi_ww_d(_jit, r0, i0, i1)
356 static void _sse_movi_ww_d(jit_state_t*, jit_int32_t, jit_word_t, jit_word_t);
357 #  else
358 #    define sse_movr_w_d(r0, r1)        movqxr(r0, r1)
359 #    define sse_movr_d_w(r0, r1)        movqrx(r1, r0)
360 #    define sse_movi_w_d(r0, i0)        _sse_movi_w_d(_jit, r0, i0)
361 static void _sse_movi_w_d(jit_state_t*, jit_int32_t, jit_word_t);
362 #  endif
/* Double-precision comparisons; same layout as the single-precision set
 * but built on ssecmpd() and jit_float64_t immediates.  The "r" forms for
 * lt/le/ge/gt/unlt/ungt/ord/unord expand to ssecmpd() with an X86_CC_*
 * code (defined outside this section); eq/ne/unle/uneq/unge/ltgt call
 * dedicated functions.
 *
 * Operand order in the ssecmpd() expansions is significant: lt/gt share
 * X86_CC_A, le/ge share X86_CC_AE and unlt/ungt share X86_CC_NAE, and the
 * two members of each pair differ only in whether (r1, r2) or (r2, r1) is
 * passed. */
363 #  define sse_ltr_d(r0, r1, r2)         ssecmpd(X86_CC_A, r0, r1, r2)
364 #  define sse_lti_d(r0, r1, i0)         _sse_lti_d(_jit, r0, r1, i0)
365 static void _sse_lti_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
366 #  define sse_ler_d(r0, r1, r2)         ssecmpd(X86_CC_AE, r0, r1, r2)
367 #  define sse_lei_d(r0, r1, i0)         _sse_lei_d(_jit, r0, r1, i0)
368 static void _sse_lei_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
369 #  define sse_eqr_d(r0, r1, r2)         _sse_eqr_d(_jit, r0, r1, r2)
370 static void _sse_eqr_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
371 #  define sse_eqi_d(r0, r1, i0)         _sse_eqi_d(_jit, r0, r1, i0)
372 static void _sse_eqi_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
373 #  define sse_ger_d(r0, r1, r2)         ssecmpd(X86_CC_AE, r0, r2, r1)
374 #  define sse_gei_d(r0, r1, i0)         _sse_gei_d(_jit, r0, r1, i0)
375 static void _sse_gei_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
376 #  define sse_gtr_d(r0, r1, r2)         ssecmpd(X86_CC_A, r0, r2, r1)
377 #  define sse_gti_d(r0, r1, i0)         _sse_gti_d(_jit, r0, r1, i0)
378 static void _sse_gti_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
379 #  define sse_ner_d(r0, r1, r2)         _sse_ner_d(_jit, r0, r1, r2)
380 static void _sse_ner_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
381 #  define sse_nei_d(r0, r1, i0)         _sse_nei_d(_jit, r0, r1, i0)
382 static void _sse_nei_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
383 #  define sse_unltr_d(r0, r1, r2)       ssecmpd(X86_CC_NAE, r0, r2, r1)
384 #  define sse_unlti_d(r0, r1, i0)       _sse_unlti_d(_jit, r0, r1, i0)
385 static void _sse_unlti_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
386 #  define sse_unler_d(r0, r1, r2)       _sse_unler_d(_jit, r0, r1, r2)
387 static void _sse_unler_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
388 #  define sse_unlei_d(r0, r1, i0)       _sse_unlei_d(_jit, r0, r1, i0)
389 static void _sse_unlei_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
390 #  define sse_uneqr_d(r0, r1, r2)       _sse_uneqr_d(_jit, r0, r1, r2)
391 static void _sse_uneqr_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
392 #  define sse_uneqi_d(r0, r1, i0)       _sse_uneqi_d(_jit, r0, r1, i0)
393 static void _sse_uneqi_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
394 #  define sse_unger_d(r0, r1, r2)       _sse_unger_d(_jit, r0, r1, r2)
395 static void _sse_unger_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
396 #  define sse_ungei_d(r0, r1, i0)       _sse_ungei_d(_jit, r0, r1, i0)
397 static void _sse_ungei_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
398 #  define sse_ungtr_d(r0, r1, r2)       ssecmpd(X86_CC_NAE, r0, r1, r2)
399 #  define sse_ungti_d(r0, r1, i0)       _sse_ungti_d(_jit, r0, r1, i0)
400 static void _sse_ungti_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
401 #  define sse_ltgtr_d(r0, r1, r2)       _sse_ltgtr_d(_jit, r0, r1, r2)
402 static void _sse_ltgtr_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
403 #  define sse_ltgti_d(r0, r1, i0)       _sse_ltgti_d(_jit, r0, r1, i0)
404 static void _sse_ltgti_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
/* Ordered/unordered tests use X86_CC_NP and X86_CC_P respectively, both
 * with (r2, r1) operand order. */
405 #  define sse_ordr_d(r0, r1, r2)        ssecmpd(X86_CC_NP, r0, r2, r1)
406 #  define sse_ordi_d(r0, r1, i0)        _sse_ordi_d(_jit, r0, r1, i0)
407 static void _sse_ordi_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
408 #  define sse_unordr_d(r0, r1, r2)      ssecmpd(X86_CC_P, r0, r2, r1)
409 #  define sse_unordi_d(r0, r1, i0)      _sse_unordi_d(_jit, r0, r1, i0)
410 static void _sse_unordi_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
/* Double-precision loads.  sse_ldr_d expands directly to movsdmr() with
 * md=0, rb=r1, ri=_NOREG, ms=_SCL1 and rd=r0; the other forms call helper
 * functions.  _NOREG and _SCL1 are defined outside this section. */
411 #  define sse_ldr_d(r0, r1)             movsdmr(0, r1, _NOREG, _SCL1, r0)
412 #  define sse_ldi_d(r0, i0)             _sse_ldi_d(_jit, r0, i0)
413 static void _sse_ldi_d(jit_state_t*, jit_int32_t, jit_word_t);
414 #  define sse_ldxr_d(r0, r1, r2)        _sse_ldxr_d(_jit, r0, r1, r2)
415 static void _sse_ldxr_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
416 #  define sse_ldxi_d(r0, r1, i0)        _sse_ldxi_d(_jit, r0, r1, i0)
417 static void _sse_ldxi_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t);
/* Branch macro sse_bltr_d sits here, ahead of the store declarations; its
 * prototype (_sse_bltr_d) is the last line of this group. */
418 #  define sse_bltr_d(i0, r0, r1)        _sse_bltr_d(_jit, i0, r0, r1)
/* Double-precision stores.  sse_str_d expands to movsdrm() with the value
 * register r1 first, then md=0, base r0, _NOREG and _SCL1. */
419 #  define sse_str_d(r0, r1)             movsdrm(r1, 0, r0, _NOREG, _SCL1)
420 #  define sse_sti_d(i0, r0)             _sse_sti_d(_jit, i0, r0)
421 static void _sse_sti_d(jit_state_t*, jit_word_t,jit_int32_t);
422 #  define sse_stxr_d(r0, r1, r2)        _sse_stxr_d(_jit, r0, r1, r2)
423 static void _sse_stxr_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
424 #  define sse_stxi_d(i0, r0, r1)        _sse_stxi_d(_jit, i0, r0, r1)
425 static void _sse_stxi_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
/* Prototype for the sse_bltr_d macro defined above the store macros. */
426 static jit_word_t _sse_bltr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
/* Double-precision compare-and-branch helpers (sse_bltr_d is declared
 * just before this group).  Every function returns a jit_word_t and takes
 * a jit_word_t (i0) followed by a register number; the "r" forms then take
 * a second register number and the "i" forms a pointer to a jit_float64_t
 * immediate.
 * NOTE(review): i0 is presumably the branch target and the return value
 * the address of the emitted jump (for later patching) -- confirm in the
 * CODE section. */
427 #  define sse_blti_d(i0, r0, i1)        _sse_blti_d(_jit, i0, r0, i1)
428 static jit_word_t
429 _sse_blti_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
430 #  define sse_bler_d(i0, r0, r1)        _sse_bler_d(_jit, i0, r0, r1)
431 static jit_word_t _sse_bler_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
432 #  define sse_blei_d(i0, r0, i1)        _sse_blei_d(_jit, i0, r0, i1)
433 static jit_word_t
434 _sse_blei_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
435 #  define sse_beqr_d(i0, r0, r1)        _sse_beqr_d(_jit, i0, r0, r1)
436 static jit_word_t _sse_beqr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
437 #  define sse_beqi_d(i0, r0, i1)        _sse_beqi_d(_jit, i0, r0, i1)
438 static jit_word_t
439 _sse_beqi_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
440 #  define sse_bger_d(i0, r0, r1)        _sse_bger_d(_jit, i0, r0, r1)
441 static jit_word_t _sse_bger_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
442 #  define sse_bgei_d(i0, r0, i1)        _sse_bgei_d(_jit, i0, r0, i1)
443 static jit_word_t
444 _sse_bgei_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
445 #  define sse_bgtr_d(i0, r0, r1)        _sse_bgtr_d(_jit, i0, r0, r1)
446 static jit_word_t _sse_bgtr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
447 #  define sse_bgti_d(i0, r0, i1)        _sse_bgti_d(_jit, i0, r0, i1)
448 static jit_word_t
449 _sse_bgti_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
450 #  define sse_bner_d(i0, r0, r1)        _sse_bner_d(_jit, i0, r0, r1)
451 static jit_word_t _sse_bner_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
452 #  define sse_bnei_d(i0, r0, i1)        _sse_bnei_d(_jit, i0, r0, i1)
453 static jit_word_t
454 _sse_bnei_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
/* "un"-prefixed conditions, followed by ltgt, ord and unord. */
455 #  define sse_bunltr_d(i0, r0, r1)      _sse_bunltr_d(_jit, i0, r0, r1)
456 static jit_word_t _sse_bunltr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
457 #  define sse_bunlti_d(i0, r0, i1)      _sse_bunlti_d(_jit, i0, r0, i1)
458 static jit_word_t
459 _sse_bunlti_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
460 #  define sse_bunler_d(i0, r0, r1)      _sse_bunler_d(_jit, i0, r0, r1)
461 static jit_word_t _sse_bunler_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
462 #  define sse_bunlei_d(i0, r0, i1)      _sse_bunlei_d(_jit, i0, r0, i1)
463 static jit_word_t
464 _sse_bunlei_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
465 #  define sse_buneqr_d(i0, r0, r1)      _sse_buneqr_d(_jit, i0, r0, r1)
466 static jit_word_t _sse_buneqr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
467 #  define sse_buneqi_d(i0, r0, i1)      _sse_buneqi_d(_jit, i0, r0, i1)
468 static jit_word_t
469 _sse_buneqi_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
470 #  define sse_bunger_d(i0, r0, r1)      _sse_bunger_d(_jit, i0, r0, r1)
471 static jit_word_t _sse_bunger_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
472 #  define sse_bungei_d(i0, r0, i1)      _sse_bungei_d(_jit, i0, r0, i1)
473 static jit_word_t
474 _sse_bungei_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
475 #  define sse_bungtr_d(i0, r0, r1)      _sse_bungtr_d(_jit, i0, r0, r1)
476 static jit_word_t _sse_bungtr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
477 #  define sse_bungti_d(i0, r0, i1)      _sse_bungti_d(_jit, i0, r0, i1)
478 static jit_word_t
479 _sse_bungti_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
480 #  define sse_bltgtr_d(i0, r0, r1)      _sse_bltgtr_d(_jit, i0, r0, r1)
481 static jit_word_t _sse_bltgtr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
482 #  define sse_bltgti_d(i0, r0, i1)      _sse_bltgti_d(_jit, i0, r0, i1)
483 static jit_word_t
484 _sse_bltgti_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
485 #  define sse_bordr_d(i0, r0, r1)       _sse_bordr_d(_jit, i0, r0, r1)
486 static jit_word_t _sse_bordr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
487 #  define sse_bordi_d(i0, r0, i1)       _sse_bordi_d(_jit, i0, r0, i1)
488 static jit_word_t
489 _sse_bordi_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
490 #  define sse_bunordr_d(i0, r0, r1)     _sse_bunordr_d(_jit, i0, r0, r1)
491 static jit_word_t _sse_bunordr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
492 #  define sse_bunordi_d(i0, r0, i1)     _sse_bunordi_d(_jit, i0, r0, i1)
493 static jit_word_t
494 _sse_bunordi_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
495 #endif
496
497 #if CODE
/*
 * fpr_opi(name, type, size) expands to the definition of
 * _sse_<name>i_<type>(): the immediate-operand variant of a three-operand
 * float operation.  The generated function grabs a scratch SSE register,
 * loads the immediate *i0 into it with sse_movi_<type>, delegates to the
 * register-register form sse_<name>r_<type>(r0, r1, scratch) and then
 * releases the scratch register.
 */
#  define fpr_opi(name, type, size)                                     \
static void                                                             \
_sse_##name##i_##type(jit_state_t *_jit,                                \
                      jit_int32_t r0, jit_int32_t r1,                   \
                      jit_float##size##_t *i0)                          \
{                                                                       \
    jit_int32_t         scratch;                                        \
    scratch = jit_get_reg(jit_class_fpr|jit_class_xpr);                 \
    assert(jit_sse_reg_p(scratch));                                     \
    sse_movi_##type(rn(scratch), i0);                                   \
    sse_##name##r_##type(r0, r1, rn(scratch));                          \
    jit_unget_reg(scratch);                                             \
}
/*
 * fpr_bopi(name, type, size) expands to the definition of
 * _sse_b<name>i_<type>(): the immediate-operand variant of a float
 * compare-and-branch.  The immediate *i1 is loaded into a scratch SSE
 * register requested with jit_class_nospill, the register-register branch
 * emitter is invoked, and its jit_word_t result is returned unchanged.
 */
#  define fpr_bopi(name, type, size)                                    \
static jit_word_t                                                       \
_sse_b##name##i_##type(jit_state_t *_jit,                               \
                       jit_word_t i0, jit_int32_t r0,                   \
                       jit_float##size##_t *i1)                         \
{                                                                       \
    jit_word_t          jump;                                           \
    jit_int32_t         scratch;                                        \
    scratch = jit_get_reg(jit_class_fpr|jit_class_xpr|                  \
                          jit_class_nospill);                           \
    assert(jit_sse_reg_p(scratch));                                     \
    sse_movi_##type(rn(scratch), i1);                                   \
    jump = sse_b##name##r_##type(i0, r0, rn(scratch));                  \
    jit_unget_reg(scratch);                                             \
    return (jump);                                                      \
}
/* Shorthands selecting the float (f, 32) or double (d, 64) flavour. */
#  define fopi(name)                    fpr_opi(name, f, 32)
#  define fbopi(name)                   fpr_bopi(name, f, 32)
#  define dopi(name)                    fpr_opi(name, d, 64)
#  define dbopi(name)                   fpr_bopi(name, d, 64)
529 static void
530 _sser(jit_state_t *_jit, jit_int32_t c, jit_int32_t r0, jit_int32_t r1)
531 {
532     rex(0, 0, r0, 0, r1);
533     ic(0x0f);
534     ic(c);
535     mrm(0x03, r7(r0), r7(r1));
536 }
537
538 static void
539 _ssexr(jit_state_t *_jit, jit_int32_t p, jit_int32_t c,
540        jit_int32_t r0, jit_int32_t r1)
541 {
542     ic(p);
543     rex(0, 0, r0, 0, r1);
544     ic(0x0f);
545     ic(c);
546     mrm(0x03, r7(r0), r7(r1));
547 }
548
549 static void
550 _ssexi(jit_state_t *_jit, jit_int32_t c, jit_int32_t r0,
551        jit_int32_t m, jit_int32_t i)
552 {
553     ic(0x66);
554     rex(0, 0, 0, 0, r0);
555     ic(0x0f);
556     ic(c);
557     mrm(0x03, r7(m), r7(r0));
558     ic(i);
559 }
560
#if __X64
/*
 * x86-64 only variant of _ssexr(): identical byte sequence except that
 * rex() is called with its second argument set to 1.
 * NOTE(review): that argument is presumably REX.W (64-bit operand) --
 * confirm against the rex() definition.
 */
static void
_sselxr(jit_state_t *_jit, jit_int32_t p, jit_int32_t c,
        jit_int32_t r0, jit_int32_t r1)
{
    jit_int32_t         reg_bits, rm_bits;

    ic(p);
    rex(0, 1, r0, 0, r1);
    ic(0x0f);
    ic(c);
    reg_bits = r7(r0);
    rm_bits = r7(r1);
    mrm(0x03, reg_bits, rm_bits);
}
#endif
573
574 static void
575 _ssexrx(jit_state_t *_jit, jit_int32_t px, jit_int32_t code, jit_int32_t md,
576        jit_int32_t rb, jit_int32_t ri, jit_int32_t ms, jit_int32_t rd)
577 {
578     ic(px);
579     rex(0, 0, rd, ri, rb);
580     ic(0x0f);
581     ic(code);
582     rx(rd, md, rb, ri, ms);
583 }
584
585 static void
586 _sse_addr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
587 {
588     if (r0 == r1)
589         addssr(r0, r2);
590     else if (r0 == r2)
591         addssr(r0, r1);
592     else {
593         sse_movr_f(r0, r1);
594         addssr(r0, r2);
595     }
596 }
597
598 fopi(add)
599
600 static void
601 _sse_addr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
602 {
603     if (r0 == r1)
604         addsdr(r0, r2);
605     else if (r0 == r2)
606         addsdr(r0, r1);
607     else {
608         sse_movr_d(r0, r1);
609         addsdr(r0, r2);
610     }
611 }
612
613 dopi(add)
614
615 static void
616 _sse_subr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
617 {
618     jit_int32_t         reg;
619     if (r0 == r1)
620         subssr(r0, r2);
621     else if (r0 == r2) {
622         reg = jit_get_reg(jit_class_fpr|jit_class_xpr);
623         sse_movr_f(rn(reg), r0);
624         sse_movr_f(r0, r1);
625         subssr(r0, rn(reg));
626         jit_unget_reg(reg);
627     }
628     else {
629         sse_movr_f(r0, r1);
630         subssr(r0, r2);
631     }
632 }
633
634 fopi(sub)
635
636 static void
637 _sse_subr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
638 {
639     jit_int32_t         reg;
640     if (r0 == r1)
641         subsdr(r0, r2);
642     else if (r0 == r2) {
643         reg = jit_get_reg(jit_class_fpr|jit_class_xpr);
644         sse_movr_d(rn(reg), r0);
645         sse_movr_d(r0, r1);
646         subsdr(r0, rn(reg));
647         jit_unget_reg(reg);
648     }
649     else {
650         sse_movr_d(r0, r1);
651         subsdr(r0, r2);
652     }
653 }
654
655 dopi(sub)
656
657 fopi(rsb)
658
659 dopi(rsb)
660
661 static void
662 _sse_mulr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
663 {
664     if (r0 == r1)
665         mulssr(r0, r2);
666     else if (r0 == r2)
667         mulssr(r0, r1);
668     else {
669         sse_movr_f(r0, r1);
670         mulssr(r0, r2);
671     }
672 }
673
674 fopi(mul)
675
676 static void
677 _sse_mulr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
678 {
679     if (r0 == r1)
680         mulsdr(r0, r2);
681     else if (r0 == r2)
682         mulsdr(r0, r1);
683     else {
684         sse_movr_d(r0, r1);
685         mulsdr(r0, r2);
686     }
687 }
688
689 dopi(mul)
690
691 static void
692 _sse_divr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
693 {
694     jit_int32_t         reg;
695     if (r0 == r1)
696         divssr(r0, r2);
697     else if (r0 == r2) {
698         reg = jit_get_reg(jit_class_fpr|jit_class_xpr);
699         sse_movr_f(rn(reg), r0);
700         sse_movr_f(r0, r1);
701         divssr(r0, rn(reg));
702         jit_unget_reg(reg);
703     }
704     else {
705         sse_movr_f(r0, r1);
706         divssr(r0, r2);
707     }
708 }
709
710 fopi(div)
711
712 static void
713 _sse_divr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
714 {
715     jit_int32_t         reg;
716     if (r0 == r1)
717         divsdr(r0, r2);
718     else if (r0 == r2) {
719         reg = jit_get_reg(jit_class_fpr|jit_class_xpr);
720         sse_movr_d(rn(reg), r0);
721         sse_movr_d(r0, r1);
722         divsdr(r0, rn(reg));
723         jit_unget_reg(reg);
724     }
725     else {
726         sse_movr_d(r0, r1);
727         divsdr(r0, r2);
728     }
729 }
730
731 dopi(div)
732
733 static void
734 _sse_absr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
735 {
736     jit_int32_t         reg;
737     if (r0 == r1) {
738         reg = jit_get_reg(jit_class_fpr|jit_class_xpr);
739         pcmpeqlr(rn(reg), rn(reg));
740         psrl(rn(reg), 1);
741         andpsr(r0, rn(reg));
742         jit_unget_reg(reg);
743     }
744     else {
745         pcmpeqlr(r0, r0);
746         psrl(r0, 1);
747         andpsr(r0, r1);
748     }
749 }
750
751 static void
752 _sse_absr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
753 {
754     jit_int32_t         reg;
755     if (r0 == r1) {
756         reg = jit_get_reg(jit_class_fpr|jit_class_xpr);
757         pcmpeqlr(rn(reg), rn(reg));
758         psrq(rn(reg), 1);
759         andpdr(r0, rn(reg));
760         jit_unget_reg(reg);
761     }
762     else {
763         pcmpeqlr(r0, r0);
764         psrq(r0, 1);
765         andpdr(r0, r1);
766     }
767 }
768
769 static void
770 _sse_negr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
771 {
772     jit_int32_t         freg, ireg;
773     ireg = jit_get_reg(jit_class_gpr);
774     imovi(rn(ireg), 0x80000000);
775     if (r0 == r1) {
776         freg = jit_get_reg(jit_class_fpr|jit_class_xpr);
777         movdxr(rn(freg), rn(ireg));
778         xorpsr(r0, rn(freg));
779         jit_unget_reg(freg);
780     }
781     else {
782         movdxr(r0, rn(ireg));
783         xorpsr(r0, r1);
784     }
785     jit_unget_reg(ireg);
786 }
787
788 static void
789 _sse_negr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
790 {
791     jit_int32_t         freg, ireg;
792     ireg = jit_get_reg(jit_class_gpr);
793     imovi(rn(ireg), 0x80000000);
794     if (r0 == r1) {
795         freg = jit_get_reg(jit_class_fpr|jit_class_xpr);
796         movdxr(rn(freg), rn(ireg));
797         pslq(rn(freg), 32);
798         xorpdr(r0, rn(freg));
799         jit_unget_reg(freg);
800     }
801     else {
802         movdxr(r0, rn(ireg));
803         pslq(r0, 32);
804         xorpdr(r0, r1);
805     }
806     jit_unget_reg(ireg);
807 }
808
809 /* r1 = (r1 * r3) + r2 */
810 #define vfmadd132ss(r1, r2, r3)         _vfmadd132sx(_jit, 0, r1, r2, r3)
811 #define vfmadd132sd(r1, r2, r3)         _vfmadd132sx(_jit, 1, r1, r2, r3)
812 static void
813 _vfmadd132sx(jit_state_t *_jit, jit_bool_t dbl,
814              jit_int32_t r1, jit_int32_t r2, jit_int32_t r3)
815 {
816     /* VFMADD132SD */
817     vex(r1, _NOREG, r3, 2, !!dbl, r2, 0, 1);
818     ic(0x99);
819     mrm(0x03, r7(r1), r7(r3));
820 }
821
822 /* r1 = (r1 * r3) - r2 */
823 #define vfmsub132ss(r1, r2, r3)         _vfmsub132sx(_jit, 0, r1, r2, r3)
824 #define vfmsub132sd(r1, r2, r3)         _vfmsub132sx(_jit, 1, r1, r2, r3)
825 static void
826 _vfmsub132sx(jit_state_t *_jit, jit_bool_t dbl,
827              jit_int32_t r1, jit_int32_t r2, jit_int32_t r3)
828 {
829     /* VFMSUB132SD */
830     vex(r1, _NOREG, r3, 2, !!dbl, r2, 0, 1);
831     ic(0x9b);
832     mrm(0x03, r7(r1), r7(r3));
833 }
834
835 /* r1 = (r1 * r2) + r3 */
836 #define vfmadd213ss(r1, r2, r3)         _vfmadd213sx(_jit, 0, r1, r2, r3)
837 #define vfmadd213sd(r1, r2, r3)         _vfmadd213sx(_jit, 1, r1, r2, r3)
838 static void
839 _vfmadd213sx(jit_state_t *_jit, jit_bool_t dbl,
840              jit_int32_t r1, jit_int32_t r2, jit_int32_t r3)
841 {
842     /* VFMADD132SD */
843     vex(r1, _NOREG, r3, 2, !!dbl, r2, 0, 1);
844     ic(0xa9);
845     mrm(0x03, r7(r1), r7(r3));
846 }
847
848 /* r1 = (r1 * r2) - r3 */
849 #define vfmsub213ss(r1, r2, r3)         _vfmsub213sx(_jit, 0, r1, r2, r3)
850 #define vfmsub213sd(r1, r2, r3)         _vfmsub213sx(_jit, 1, r1, r2, r3)
851 static void
852 _vfmsub213sx(jit_state_t *_jit, jit_bool_t dbl,
853              jit_int32_t r1, jit_int32_t r2, jit_int32_t r3)
854 {
855     /* VFMSUB132SD */
856     vex(r1, _NOREG, r3, 2, !!dbl, r2, 0, 1);
857     ic(0xab);
858     mrm(0x03, r7(r1), r7(r3));
859 }
860
861 /* r1 = (r2 * r3) + r1 */
862 #define vfmadd231ss(r1, r2, r3)         _vfmadd231sx(_jit, 0, r1, r2, r3)
863 #define vfmadd231sd(r1, r2, r3)         _vfmadd231sx(_jit, 1, r1, r2, r3)
864 static void
865 _vfmadd231sx(jit_state_t *_jit, jit_bool_t dbl,
866              jit_int32_t r1, jit_int32_t r2, jit_int32_t r3)
867 {
868     /* VFMADD231SD */
869     vex(r1, _NOREG, r3, 2, !!dbl, r2, 0, 1);
870     ic(0xb9);
871     mrm(0x03, r7(r1), r7(r3));
872 }
873
874 /* r1 = (r2 * r3) - r1 */
875 #define vfmsub231ss(r1, r2, r3)         _vfmsub231sx(_jit, 0, r1, r2, r3)
876 #define vfmsub231sd(r1, r2, r3)         _vfmsub231sx(_jit, 1, r1, r2, r3)
877 static void
878 _vfmsub231sx(jit_state_t *_jit, jit_bool_t dbl,
879              jit_int32_t r1, jit_int32_t r2, jit_int32_t r3)
880 {
881     /* VFMSUB231SD */
882     vex(r1, _NOREG, r3, 2, !!dbl, r2, 0, 1);
883     ic(0xbb);
884     mrm(0x03, r7(r1), r7(r3));
885 }
886
887 static void
888 _sse_fmar_f(jit_state_t *_jit,
889             jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3)
890 {
891     jit_int32_t         t0;
892     if (jit_cpu.fma) {
893         if (r0 != r2 && r0 != r3) {
894             sse_movr_f(r0, r1);
895             vfmadd213ss(r0, r2, r3);
896         }
897         else {
898             t0 = jit_get_reg(jit_class_fpr|jit_class_xpr);
899             sse_movr_f(rn(t0), r1);
900             vfmadd213ss(rn(t0), r2, r3);
901             sse_movr_f(r0, rn(t0));
902             jit_unget_reg(t0);
903         }
904     }
905     else {
906         if (r0 != r3) {
907             sse_mulr_f(r0, r1, r2);
908             sse_addr_f(r0, r0, r3);
909         }
910         else {
911             t0 = jit_get_reg(jit_class_fpr|jit_class_xpr);
912             sse_mulr_f(rn(t0), r1, r2);
913             sse_addr_f(r0, rn(t0), r3);
914             jit_unget_reg(t0);
915         }
916     }
917 }
918
919 static void
920 _sse_fmar_d(jit_state_t *_jit,
921             jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3)
922 {
923     jit_int32_t         t0;
924     if (jit_cpu.fma) {
925         if (r0 != r2 && r0 != r3) {
926             sse_movr_d(r0, r1);
927             vfmadd213sd(r0, r2, r3);
928         }
929         else {
930             t0 = jit_get_reg(jit_class_fpr|jit_class_xpr);
931             sse_movr_d(rn(t0), r1);
932             vfmadd213sd(rn(t0), r2, r3);
933             sse_movr_d(r0, rn(t0));
934             jit_unget_reg(t0);
935         }
936     }
937     else {
938         if (r0 != r3) {
939             sse_mulr_d(r0, r1, r2);
940             sse_addr_d(r0, r0, r3);
941         }
942         else {
943             t0 = jit_get_reg(jit_class_fpr|jit_class_xpr);
944             sse_mulr_d(rn(t0), r1, r2);
945             sse_addr_d(r0, rn(t0), r3);
946             jit_unget_reg(t0);
947         }
948     }
949 }
950
951 static void
952 _sse_fmsr_f(jit_state_t *_jit,
953             jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3)
954 {
955     jit_int32_t         t0;
956     if (jit_cpu.fma) {
957         if (r0 != r2 && r0 != r3) {
958             sse_movr_f(r0, r1);
959             vfmsub213ss(r0, r2, r3);
960         }
961         else {
962             t0 = jit_get_reg(jit_class_fpr|jit_class_xpr);
963             sse_movr_f(rn(t0), r1);
964             vfmsub213ss(rn(t0), r2, r3);
965             sse_movr_f(r0, rn(t0));
966             jit_unget_reg(t0);
967         }
968     }
969     else {
970         if (r0 != r3) {
971             sse_mulr_f(r0, r1, r2);
972             sse_subr_f(r0, r0, r3);
973         }
974         else {
975             t0 = jit_get_reg(jit_class_fpr|jit_class_xpr);
976             sse_mulr_f(rn(t0), r1, r2);
977             sse_subr_f(r0, rn(t0), r3);
978             jit_unget_reg(t0);
979         }
980     }
981 }
982
983 static void
984 _sse_fmsr_d(jit_state_t *_jit,
985             jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3)
986 {
987     jit_int32_t         t0;
988     if (jit_cpu.fma) {
989         if (r0 != r2 && r0 != r3) {
990             sse_movr_d(r0, r1);
991             vfmsub213sd(r0, r2, r3);
992         }
993         else {
994             t0 = jit_get_reg(jit_class_fpr|jit_class_xpr);
995             sse_movr_d(rn(t0), r1);
996             vfmsub213sd(rn(t0), r2, r3);
997             sse_movr_d(r0, rn(t0));
998             jit_unget_reg(t0);
999         }
1000     }
1001     else {
1002         if (r0 != r3) {
1003             sse_mulr_d(r0, r1, r2);
1004             sse_subr_d(r0, r0, r3);
1005         }
1006         else {
1007             t0 = jit_get_reg(jit_class_fpr|jit_class_xpr);
1008             sse_mulr_d(rn(t0), r1, r2);
1009             sse_subr_d(r0, rn(t0), r3);
1010             jit_unget_reg(t0);
1011         }
1012     }
1013 }
1014
1015 static void
1016 _sse_fnmar_f(jit_state_t *_jit,
1017              jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3)
1018 {
1019     jit_int32_t         t0;
1020     if (jit_cpu.fma) {
1021         if (r0 != r2 && r0 != r3) {
1022             sse_negr_f(r0, r1);
1023             vfmsub213ss(r0, r2, r3);
1024         }
1025         else {
1026             t0 = jit_get_reg(jit_class_fpr|jit_class_xpr);
1027             sse_negr_f(rn(t0), r1);
1028             vfmsub213ss(rn(t0), r2, r3);
1029             sse_movr_f(r0, rn(t0));
1030             jit_unget_reg(t0);
1031         }
1032     }
1033     else {
1034         t0 = jit_get_reg(jit_class_fpr|jit_class_xpr);
1035         sse_negr_f(rn(t0), r1);
1036         sse_mulr_f(rn(t0), rn(t0), r2);
1037         sse_subr_f(r0, rn(t0), r3);
1038         jit_unget_reg(t0);
1039     }
1040 }
1041
1042 static void
1043 _sse_fnmar_d(jit_state_t *_jit,
1044              jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3)
1045 {
1046     jit_int32_t         t0;
1047     if (jit_cpu.fma) {
1048         if (r0 != r2 && r0 != r3) {
1049             sse_negr_d(r0, r1);
1050             vfmsub213sd(r0, r2, r3);
1051         }
1052         else {
1053             t0 = jit_get_reg(jit_class_fpr|jit_class_xpr);
1054             sse_negr_d(rn(t0), r1);
1055             vfmsub213sd(rn(t0), r2, r3);
1056             sse_movr_d(r0, rn(t0));
1057             jit_unget_reg(t0);
1058         }
1059     }
1060     else {
1061         t0 = jit_get_reg(jit_class_fpr|jit_class_xpr);
1062         sse_negr_d(rn(t0), r1);
1063         sse_mulr_d(rn(t0), rn(t0), r2);
1064         sse_subr_d(r0, rn(t0), r3);
1065         jit_unget_reg(t0);
1066     }
1067 }
1068
1069 static void
1070 _sse_fnmsr_f(jit_state_t *_jit,
1071              jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3)
1072 {
1073     jit_int32_t         t0;
1074     if (jit_cpu.fma) {
1075         if (r0 != r2 && r0 != r3) {
1076             sse_negr_f(r0, r1);
1077             vfmadd213ss(r0, r2, r3);
1078         }
1079         else {
1080             t0 = jit_get_reg(jit_class_fpr|jit_class_xpr);
1081             sse_negr_f(rn(t0), r1);
1082             vfmadd213ss(rn(t0), r2, r3);
1083             sse_movr_f(r0, rn(t0));
1084             jit_unget_reg(t0);
1085         }
1086     }
1087     else {
1088         t0 = jit_get_reg(jit_class_fpr|jit_class_xpr);
1089         sse_negr_f(rn(t0), r1);
1090         sse_mulr_f(rn(t0), rn(t0), r2);
1091         sse_addr_f(r0, rn(t0), r3);
1092         jit_unget_reg(t0);
1093     }
1094 }
1095
1096 static void
1097 _sse_fnmsr_d(jit_state_t *_jit,
1098              jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3)
1099 {
1100     jit_int32_t         t0;
1101     if (jit_cpu.fma) {
1102         if (r0 != r2 && r0 != r3) {
1103             sse_negr_d(r0, r1);
1104             vfmadd213sd(r0, r2, r3);
1105         }
1106         else {
1107             t0 = jit_get_reg(jit_class_fpr|jit_class_xpr);
1108             sse_negr_d(rn(t0), r1);
1109             vfmadd213sd(rn(t0), r2, r3);
1110             sse_movr_d(r0, rn(t0));
1111             jit_unget_reg(t0);
1112         }
1113     }
1114     else {
1115         t0 = jit_get_reg(jit_class_fpr|jit_class_xpr);
1116         sse_negr_d(rn(t0), r1);
1117         sse_mulr_d(rn(t0), rn(t0), r2);
1118         sse_addr_d(r0, rn(t0), r3);
1119         jit_unget_reg(t0);
1120     }
1121 }
1122
1123 static void
1124 _ssecmp(jit_state_t *_jit, jit_bool_t d, jit_int32_t code,
1125         jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1126 {
1127     jit_bool_t          rc;
1128     jit_int32_t         reg;
1129     if ((rc = reg8_p(r0)))
1130         reg = r0;
1131     else {
1132         reg = _RAX_REGNO;
1133         movr(r0, reg);
1134     }
1135     ixorr(reg, reg);
1136     if (d)
1137         ucomisdr(r2, r1);
1138     else
1139         ucomissr(r2, r1);
1140     cc(code, reg);
1141     if (!rc)
1142         xchgr(r0, reg);
1143 }
1144
1145 static void
1146 _sse_movr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
1147 {
1148     if (r0 != r1)
1149         ssexr(0xf3, X86_SSE_MOV, r0, r1);
1150 }
1151
1152 static void
1153 _sse_movi_f(jit_state_t *_jit, jit_int32_t r0, jit_float32_t *i0)
1154 {
1155     union {
1156         jit_int32_t      i;
1157         jit_float32_t    f;
1158     } data;
1159     jit_int32_t          reg;
1160     jit_bool_t           ldi;
1161
1162     data.f = *i0;
1163     if (data.f == 0.0 && !(data.i & 0x80000000))
1164         xorpsr(r0, r0);
1165     else {
1166         ldi = !_jitc->no_data;
1167 #if __X64
1168         /* if will allocate a register for offset, just use immediate */
1169 #  if CAN_RIP_ADDRESS
1170         if (ldi) {
1171             jit_word_t  rel = (jit_word_t)i0 - (_jit->pc.w + 8 + !!(r0 & 8));
1172             ldi = can_sign_extend_int_p(rel);
1173             if (!ldi && address_p(i0))
1174                 ldi = 1;
1175         }
1176 #  else
1177         if (ldi && !address_p(i0))
1178             ldi = 0;
1179 #  endif
1180 #endif
1181         if (ldi)
1182             sse_ldi_f(r0, (jit_word_t)i0);
1183         else {
1184             reg = jit_get_reg(jit_class_gpr);
1185             movi(rn(reg), data.i);
1186             movdxr(r0, rn(reg));
1187             jit_unget_reg(reg);
1188         }
1189     }
1190 }
1191
1192 static void
1193 _sse_movi_w_f(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0)
1194 {
1195     jit_int32_t         reg;
1196     reg = jit_get_reg(jit_class_gpr);
1197     movi(rn(reg), i0);
1198     movdxr(r0, rn(reg));
1199     jit_unget_reg(reg);
1200 }
1201
1202 fopi(lt)
1203 fopi(le)
1204
1205 static void
1206 _sse_eqr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1207 {
1208     jit_bool_t          rc;
1209     jit_int32_t         reg;
1210     jit_word_t          jp_code;
1211     if ((rc = reg8_p(r0)))
1212         reg = r0;
1213     else {
1214         reg = _RAX_REGNO;
1215         movr(r0, _RAX_REGNO);
1216     }
1217     ixorr(reg, reg);
1218     ucomissr(r2, r1);
1219     jp_code = jpes(0);
1220     cc(X86_CC_E, reg);
1221     patch_at(jp_code, _jit->pc.w);
1222     if (!rc)
1223         xchgr(r0, reg);
1224 }
1225
1226 fopi(eq)
1227 fopi(ge)
1228 fopi(gt)
1229
1230 static void
1231 _sse_ner_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1232 {
1233     jit_bool_t          rc;
1234     jit_int32_t         reg;
1235     jit_word_t          jp_code;
1236     if ((rc = reg8_p(r0)))
1237         reg = r0;
1238     else {
1239         reg = _RAX_REGNO;
1240         movr(r0, _RAX_REGNO);
1241     }
1242     imovi(reg, 1);
1243     ucomissr(r2, r1);
1244     jp_code = jpes(0);
1245     cc(X86_CC_NE, reg);
1246     patch_at(jp_code, _jit->pc.w);
1247     if (!rc)
1248         xchgr(r0, reg);
1249 }
1250
1251 fopi(ne)
1252 fopi(unlt)
1253
1254 static void
1255 _sse_unler_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1256 {
1257     if (r1 == r2)
1258         movi(r0, 1);
1259     else
1260         ssecmpf(X86_CC_NA, r0, r2, r1);
1261 }
1262
1263 fopi(unle)
1264
1265 static void
1266 _sse_uneqr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1267 {
1268     if (r1 == r2)
1269         movi(r0, 1);
1270     else
1271         ssecmpf(X86_CC_E, r0, r1, r2);
1272 }
1273
1274 fopi(uneq)
1275
1276 static void
1277 _sse_unger_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1278 {
1279     if (r1 == r2)
1280         movi(r0, 1);
1281     else
1282         ssecmpf(X86_CC_NA, r0, r1, r2);
1283 }
1284
1285 fopi(unge)
1286 fopi(ungt)
1287
1288 static void
1289 _sse_ltgtr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1290 {
1291     if (r1 == r2)
1292         ixorr(r0, r0);
1293     else
1294         ssecmpf(X86_CC_NE, r0, r1, r2);
1295 }
1296
1297 fopi(ltgt)
1298 fopi(ord)
1299 fopi(unord)
1300
1301 static void
1302 _sse_ldi_f(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0)
1303 {
1304     jit_int32_t         reg;
1305 #if CAN_RIP_ADDRESS
1306     jit_word_t          rel = i0 - (_jit->pc.w + 8 + !!(r0 & 8));
1307     if (can_sign_extend_int_p(rel))
1308         movssmr(rel, _NOREG, _NOREG, _SCL8, r0);
1309     else
1310 #endif
1311     if (address_p(i0))
1312         movssmr(i0, _NOREG, _NOREG, _SCL1, r0);
1313     else {
1314         reg = jit_get_reg(jit_class_gpr);
1315         movi(rn(reg), i0);
1316         sse_ldr_f(r0, rn(reg));
1317         jit_unget_reg(reg);
1318     }
1319 }
1320
1321 static void
1322 _sse_ldxr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1323 {
1324 #if __X64_32
1325     jit_int32_t         reg;
1326     reg = jit_get_reg(jit_class_gpr);
1327     addr(rn(reg), r1, r2);
1328     sse_ldr_f(r0, rn(reg));
1329     jit_unget_reg(reg);
1330 #else
1331     movssmr(0, r1, r2, _SCL1, r0);
1332 #endif
1333 }
1334
1335 static void
1336 _sse_ldxi_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
1337 {
1338     jit_int32_t         reg;
1339     if (can_sign_extend_int_p(i0))
1340         movssmr(i0, r1, _NOREG, _SCL1, r0);
1341     else {
1342         reg = jit_get_reg(jit_class_gpr);
1343 #if __X64_32
1344         addi(rn(reg), r1, i0);
1345         sse_ldr_f(r0, rn(reg));
1346 #else
1347         movi(rn(reg), i0);
1348         sse_ldxr_f(r0, r1, rn(reg));
1349 #endif
1350         jit_unget_reg(reg);
1351     }
1352 }
1353
1354 static void
1355 _sse_unldr_x(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
1356 {
1357     assert(i0 == 4 || i0 == 8);
1358     if (i0 == 4)
1359         sse_ldr_f(r0, r1);
1360     else
1361         sse_ldr_d(r0, r1);
1362 }
1363
1364 static void
1365 _sse_unldi_x(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0, jit_word_t i1)
1366 {
1367     assert(i1 == 4 || i1 == 8);
1368     if (i1 == 4)
1369         sse_ldi_f(r0, i0);
1370     else
1371         sse_ldi_d(r0, i0);
1372 }
1373
1374 static void
1375 _sse_sti_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0)
1376 {
1377     jit_int32_t         reg;
1378 #if CAN_RIP_ADDRESS
1379     jit_word_t          rel = i0 - (_jit->pc.w + 8 + !!(r0 & 8));
1380     if (can_sign_extend_int_p(rel))
1381         movssrm(r0, rel, _NOREG, _NOREG, _SCL8);
1382     else
1383 #endif
1384     if (address_p(i0))
1385         movssrm(r0, i0, _NOREG, _NOREG, _SCL1);
1386     else {
1387         reg = jit_get_reg(jit_class_gpr);
1388         movi(rn(reg), i0);
1389         sse_str_f(rn(reg), r0);
1390         jit_unget_reg(reg);
1391     }
1392 }
1393
1394 static void
1395 _sse_stxr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1396 {
1397 #if __X64_32
1398     jit_int32_t         reg;
1399     reg = jit_get_reg(jit_class_gpr);
1400     addr(rn(reg), r0, r1);
1401     sse_str_f(rn(reg), r2);
1402     jit_unget_reg(reg);
1403 #else
1404     movssrm(r2, 0, r0, r1, _SCL1);
1405 #endif
1406 }
1407
1408 static void
1409 _sse_stxi_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1410 {
1411     jit_int32_t         reg;
1412     if (can_sign_extend_int_p(i0))
1413         movssrm(r1, i0, r0, _NOREG, _SCL1);
1414     else {
1415         reg = jit_get_reg(jit_class_gpr);
1416 #if __X64_32
1417         addi(rn(reg), r0, i0);
1418         sse_str_f(rn(reg), r1);
1419 #else
1420         movi(rn(reg), i0);
1421         sse_stxr_f(rn(reg), r0, r1);
1422 #endif
1423         jit_unget_reg(reg);
1424     }
1425 }
1426
1427 static void
1428 _sse_unstr_x(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
1429 {
1430     assert(i0 == 4 || i0 == 8);
1431     if (i0 == 4)
1432         sse_str_f(r0, r1);
1433     else
1434         sse_str_d(r0, r1);
1435 }
1436
1437 static void
1438 _sse_unsti_x(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1)
1439 {
1440     assert(i1 == 4 || i1 == 8);
1441     if (i1 == 4)
1442         sse_sti_f(i0, r0);
1443     else
1444         sse_sti_d(i0, r0);
1445 }
1446
1447 static jit_word_t
1448 _sse_bltr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1449 {
1450     ucomissr(r1, r0);
1451     return (ja(i0));
1452 }
1453 fbopi(lt)
1454
1455 static jit_word_t
1456 _sse_bler_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1457 {
1458     ucomissr(r1, r0);
1459     return (jae(i0));
1460 }
1461 fbopi(le)
1462
1463 static jit_word_t
1464 _sse_beqr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1465 {
1466     jit_word_t          w;
1467     jit_word_t          jp_code;
1468     ucomissr(r0, r1);
1469     jp_code = jps(0);
1470     w = je(i0);
1471     patch_at(jp_code, _jit->pc.w);
1472     return (w);
1473 }
1474 fbopi(eq)
1475
1476 static jit_word_t
1477 _sse_bger_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1478 {
1479     ucomissr(r0, r1);
1480     return (jae(i0));
1481 }
1482 fbopi(ge)
1483
1484 static jit_word_t
1485 _sse_bgtr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1486 {
1487     ucomissr(r0, r1);
1488     return (ja(i0));
1489 }
1490 fbopi(gt)
1491
1492 static jit_word_t
1493 _sse_bner_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1494 {
1495     jit_word_t          w;
1496     jit_word_t          jp_code;
1497     jit_word_t          jz_code;
1498     ucomissr(r0, r1);
1499     jp_code = jps(0);
1500     jz_code = jzs(0);
1501     patch_at(jp_code, _jit->pc.w);
1502     w = jmpi(i0);
1503     patch_at(jz_code, _jit->pc.w);
1504     return (w);
1505 }
1506 fbopi(ne)
1507
1508 static jit_word_t
1509 _sse_bunltr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1510 {
1511     ucomissr(r0, r1);
1512     return (jnae(i0));
1513 }
1514 fbopi(unlt)
1515
1516 static jit_word_t
1517 _sse_bunler_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1518 {
1519     jit_word_t          w;
1520     if (r0 == r1)
1521         w = jmpi(i0);
1522     else {
1523         ucomissr(r0, r1);
1524         w = jna(i0);
1525     }
1526     return (w);
1527 }
1528 fbopi(unle)
1529
1530 static jit_word_t
1531 _sse_buneqr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1532 {
1533     jit_word_t          w;
1534     if (r0 == r1)
1535         w = jmpi(i0);
1536     else {
1537         ucomissr(r0, r1);
1538         w = je(i0);
1539     }
1540     return (w);
1541 }
1542 fbopi(uneq)
1543
1544 static jit_word_t
1545 _sse_bunger_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1546 {
1547     jit_word_t          w;
1548     if (r0 == r1)
1549         w = jmpi(i0);
1550     else {
1551         ucomissr(r1, r0);
1552         w = jna(i0);
1553     }
1554     return (w);
1555 }
1556 fbopi(unge)
1557
1558 static jit_word_t
1559 _sse_bungtr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1560 {
1561     ucomissr(r1, r0);
1562     return (jnae(i0));
1563 }
1564 fbopi(ungt)
1565
1566 static jit_word_t
1567 _sse_bltgtr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1568 {
1569     ucomissr(r0, r1);
1570     return (jne(i0));
1571 }
1572 fbopi(ltgt)
1573
1574 static jit_word_t
1575 _sse_bordr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1576 {
1577     ucomissr(r0, r1);
1578     return (jnp(i0));
1579 }
1580 fbopi(ord)
1581
1582 static jit_word_t
1583 _sse_bunordr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1584 {
1585     ucomissr(r0, r1);
1586     return (jp(i0));
1587 }
1588 fbopi(unord)
1589
1590 dopi(lt)
1591 dopi(le)
1592
1593 static void
1594 _sse_eqr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1595 {
1596     jit_bool_t          rc;
1597     jit_int32_t         reg;
1598     jit_word_t          jp_code;
1599     if ((rc = reg8_p(r0)))
1600         reg = r0;
1601     else {
1602         reg = _RAX_REGNO;
1603         movr(r0, _RAX_REGNO);
1604     }
1605     ixorr(reg, reg);
1606     ucomisdr(r2, r1);
1607     jp_code = jpes(0);
1608     cc(X86_CC_E, reg);
1609     patch_at(jp_code, _jit->pc.w);
1610     if (!rc)
1611         xchgr(r0, reg);
1612 }
1613
1614 dopi(eq)
1615 dopi(ge)
1616 dopi(gt)
1617
1618 static void
1619 _sse_ner_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1620 {
1621     jit_bool_t          rc;
1622     jit_int32_t         reg;
1623     jit_word_t          jp_code;
1624     if ((rc = reg8_p(r0)))
1625         reg = r0;
1626     else {
1627         reg = _RAX_REGNO;
1628         movr(r0, _RAX_REGNO);
1629     }
1630     imovi(reg, 1);
1631     ucomisdr(r2, r1);
1632     jp_code = jpes(0);
1633     cc(X86_CC_NE, reg);
1634     patch_at(jp_code, _jit->pc.w);
1635     if (!rc)
1636         xchgr(r0, reg);
1637 }
1638
1639 dopi(ne)
1640 dopi(unlt)
1641
1642 static void
1643 _sse_unler_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1644 {
1645     if (r1 == r2)
1646         movi(r0, 1);
1647     else
1648         ssecmpd(X86_CC_NA, r0, r2, r1);
1649 }
1650
1651 dopi(unle)
1652
1653 static void
1654 _sse_uneqr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1655 {
1656     if (r1 == r2)
1657         movi(r0, 1);
1658     else
1659         ssecmpd(X86_CC_E, r0, r1, r2);
1660 }
1661
1662 dopi(uneq)
1663
1664 static void
1665 _sse_unger_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1666 {
1667     if (r1 == r2)
1668         movi(r0, 1);
1669     else
1670         ssecmpd(X86_CC_NA, r0, r1, r2);
1671 }
1672
1673 dopi(unge)
1674 dopi(ungt)
1675
1676 static void
1677 _sse_ltgtr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1678 {
1679     if (r1 == r2)
1680         ixorr(r0, r0);
1681     else
1682         ssecmpd(X86_CC_NE, r0, r1, r2);
1683 }
1684
1685 dopi(ltgt)
1686 dopi(ord)
1687 dopi(unord)
1688
1689 static void
1690 _sse_movr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
1691 {
1692     if (r0 != r1)
1693         ssexr(0xf2, X86_SSE_MOV, r0, r1);
1694 }
1695
1696 static void
1697 _sse_movi_d(jit_state_t *_jit, jit_int32_t r0, jit_float64_t *i0)
1698 {
1699     union {
1700         jit_int32_t      ii[2];
1701         jit_word_t       w;
1702         jit_float64_t    d;
1703     } data;
1704     jit_int32_t          reg;
1705     jit_bool_t           ldi;
1706
1707     data.d = *i0;
1708     if (data.d == 0.0 && !(data.ii[1] & 0x80000000))
1709         xorpdr(r0, r0);
1710     else {
1711         ldi = !_jitc->no_data;
1712 #if __X64
1713         /* if will allocate a register for offset, just use immediate */
1714 #  if CAN_RIP_ADDRESS
1715         if (ldi) {
1716             jit_word_t  rel = (jit_word_t)i0 - (_jit->pc.w + 8 + !!(r0 & 8));
1717             ldi = can_sign_extend_int_p(rel);
1718             if (!ldi && address_p(i0))
1719                 ldi = 1;
1720         }
1721 #  else
1722         if (ldi && !address_p(i0))
1723             ldi = 0;
1724 #  endif
1725 #endif
1726         if (ldi)
1727             sse_ldi_d(r0, (jit_word_t)i0);
1728         else {
1729             reg = jit_get_reg(jit_class_gpr);
1730 #if __X64 && !__X64_32
1731             movi(rn(reg), data.w);
1732             movqxr(r0, rn(reg));
1733             jit_unget_reg(reg);
1734 #else
1735             CHECK_CVT_OFFSET();
1736             movi(rn(reg), data.ii[0]);
1737             stxi_i(CVT_OFFSET, _RBP_REGNO, rn(reg));
1738             movi(rn(reg), data.ii[1]);
1739             stxi_i(CVT_OFFSET + 4, _RBP_REGNO, rn(reg));
1740             jit_unget_reg(reg);
1741             sse_ldxi_d(r0, _RBP_REGNO, CVT_OFFSET);
1742 #endif
1743         }
1744     }
1745 }
1746
1747 #if __X32 || __X64_32
1748 static void
1749 _sse_movr_ww_d(jit_state_t *_jit,
1750                jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1751 {
1752     CHECK_CVT_OFFSET();
1753     stxi_i(CVT_OFFSET, _RBP_REGNO, r1);
1754     stxi_i(CVT_OFFSET + 4, _RBP_REGNO, r2);
1755     sse_ldxi_d(r0, _RBP_REGNO, CVT_OFFSET);
1756 }
1757
1758 static void
1759 _sse_movr_d_ww(jit_state_t *_jit,
1760                jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1761 {
1762     CHECK_CVT_OFFSET();
1763     sse_stxi_d(CVT_OFFSET, _RBP_REGNO, r2);
1764     ldxi_i(r0, _RBP_REGNO, CVT_OFFSET);
1765     ldxi_i(r1, _RBP_REGNO, CVT_OFFSET + 4);
1766 }
1767
1768 static void
1769 _sse_movi_ww_d(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0, jit_word_t i1)
1770 {
1771     jit_int32_t         reg;
1772     CHECK_CVT_OFFSET();
1773     reg = jit_get_reg(jit_class_gpr);
1774     movi(rn(reg), i0);
1775     stxi_i(CVT_OFFSET, _RBP_REGNO, rn(reg));
1776     movi(rn(reg), i1);
1777     stxi_i(CVT_OFFSET + 4, _RBP_REGNO, rn(reg));
1778     sse_ldxi_d(r0, _RBP_REGNO, CVT_OFFSET);
1779     jit_unget_reg(reg);
1780 }
1781 #else
1782 static void
1783 _sse_movi_w_d(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0)
1784 {
1785     jit_int32_t         reg;
1786     reg = jit_get_reg(jit_class_gpr);
1787     movi(rn(reg), i0);
1788     movqxr(r0, rn(reg));
1789     jit_unget_reg(reg);
1790 }
1791 #endif
1792
1793 static void
1794 _sse_ldi_d(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0)
1795 {
1796     jit_int32_t         reg;
1797 #if CAN_RIP_ADDRESS
1798     jit_word_t          rel = i0 - (_jit->pc.w + 8 + !!(r0 & 8));
1799     if (can_sign_extend_int_p(rel))
1800         movsdmr(rel, _NOREG, _NOREG, _SCL8, r0);
1801     else
1802 #endif
1803     if (address_p(i0))
1804         movsdmr(i0, _NOREG, _NOREG, _SCL1, r0);
1805     else {
1806         reg = jit_get_reg(jit_class_gpr);
1807         movi(rn(reg), i0);
1808         sse_ldr_d(r0, rn(reg));
1809         jit_unget_reg(reg);
1810     }
1811 }
1812
1813 static void
1814 _sse_ldxr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1815 {
1816 #if __X64_32
1817     jit_int32_t         reg;
1818     reg = jit_get_reg(jit_class_gpr);
1819     addr(rn(reg), r1, r2);
1820     sse_ldr_d(r0, rn(reg));
1821     jit_unget_reg(reg);
1822 #else
1823     movsdmr(0, r1, r2, _SCL1, r0);
1824 #endif
1825 }
1826
1827 static void
1828 _sse_ldxi_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
1829 {
1830     jit_int32_t         reg;
1831     if (can_sign_extend_int_p(i0))
1832         movsdmr(i0, r1, _NOREG, _SCL1, r0);
1833     else {
1834         reg = jit_get_reg(jit_class_gpr);
1835 #if __X64_32
1836         addi(rn(reg), r1, i0);
1837         sse_ldr_d(r0, rn(reg));
1838 #else
1839         movi(rn(reg), i0);
1840         sse_ldxr_d(r0, r1, rn(reg));
1841 #endif
1842         jit_unget_reg(reg);
1843     }
1844 }
1845
1846 static void
1847 _sse_sti_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0)
1848 {
1849     jit_int32_t         reg;
1850 #if CAN_RIP_ADDRESS
1851     jit_word_t          rel = i0 - (_jit->pc.w + 8 + !!(r0 & 8));
1852     if (can_sign_extend_int_p(rel))
1853         movsdrm(r0, rel, _NOREG, _NOREG, _SCL8);
1854     else
1855 #endif
1856     if (address_p(i0))
1857         movsdrm(r0, i0, _NOREG, _NOREG, _SCL1);
1858     else {
1859         reg = jit_get_reg(jit_class_gpr);
1860         movi(rn(reg), i0);
1861         sse_str_d(rn(reg), r0);
1862         jit_unget_reg(reg);
1863     }
1864 }
1865
1866 static void
1867 _sse_stxr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1868 {
1869 #if __X64_32
1870     jit_int32_t         reg;
1871     reg = jit_get_reg(jit_class_gpr);
1872     addr(rn(reg), r0, r1);
1873     sse_str_d(rn(reg), r2);
1874     jit_unget_reg(reg);
1875 #else
1876     movsdrm(r2, 0, r0, r1, _SCL1);
1877 #endif
1878 }
1879
1880 static void
1881 _sse_stxi_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1882 {
1883     jit_int32_t         reg;
1884     if (can_sign_extend_int_p(i0))
1885         movsdrm(r1, i0, r0, _NOREG, _SCL1);
1886     else {
1887         reg = jit_get_reg(jit_class_gpr);
1888 #if __X64_32
1889         addi(rn(reg), r0, i0);
1890         sse_str_d(rn(reg), r1);
1891 #else
1892         movi(rn(reg), i0);
1893         sse_stxr_f(rn(reg), r0, r1);
1894 #endif
1895         jit_unget_reg(reg);
1896     }
1897 }
1898
1899 static jit_word_t
1900 _sse_bltr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1901 {
1902     ucomisdr(r1, r0);
1903     return (ja(i0));
1904 }
1905 dbopi(lt)
1906
1907 static jit_word_t
1908 _sse_bler_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1909 {
1910     ucomisdr(r1, r0);
1911     return (jae(i0));
1912 }
1913 dbopi(le)
1914
1915 static jit_word_t
1916 _sse_beqr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1917 {
1918     jit_word_t          w;
1919     jit_word_t          jp_code;
1920     ucomisdr(r0, r1);
1921     jp_code = jps(0);
1922     w = je(i0);
1923     patch_at(jp_code, _jit->pc.w);
1924     return (w);
1925 }
1926 dbopi(eq)
1927
1928 static jit_word_t
1929 _sse_bger_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1930 {
1931     ucomisdr(r0, r1);
1932     return (jae(i0));
1933 }
1934 dbopi(ge)
1935
1936 static jit_word_t
1937 _sse_bgtr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1938 {
1939     ucomisdr(r0, r1);
1940     return (ja(i0));
1941 }
1942 dbopi(gt)
1943
1944 static jit_word_t
1945 _sse_bner_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1946 {
1947     jit_word_t          w;
1948     jit_word_t          jp_code;
1949     jit_word_t          jz_code;
1950     ucomisdr(r0, r1);
1951     jp_code = jps(0);
1952     jz_code = jzs(0);
1953     patch_at(jp_code, _jit->pc.w);
1954     w = jmpi(i0);
1955     patch_at(jz_code, _jit->pc.w);
1956     return (w);
1957 }
1958 dbopi(ne)
1959
1960 static jit_word_t
1961 _sse_bunltr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1962 {
1963     ucomisdr(r0, r1);
1964     return (jnae(i0));
1965 }
1966 dbopi(unlt)
1967
1968 static jit_word_t
1969 _sse_bunler_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1970 {
1971     jit_word_t          w;
1972     if (r0 == r1)
1973         w = jmpi(i0);
1974     else {
1975         ucomisdr(r0, r1);
1976         w = jna(i0);
1977     }
1978     return (w);
1979 }
1980 dbopi(unle)
1981
1982 static jit_word_t
1983 _sse_buneqr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1984 {
1985     jit_word_t          w;
1986     if (r0 == r1)
1987         w = jmpi(i0);
1988     else {
1989         ucomisdr(r0, r1);
1990         w = je(i0);
1991     }
1992     return (w);
1993 }
1994 dbopi(uneq)
1995
1996 static jit_word_t
1997 _sse_bunger_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1998 {
1999     jit_word_t          w;
2000     if (r0 == r1)
2001         w = jmpi(i0);
2002     else {
2003         ucomisdr(r1, r0);
2004         w = jna(i0);
2005     }
2006     return (w);
2007 }
2008 dbopi(unge)
2009
2010 static jit_word_t
2011 _sse_bungtr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
2012 {
2013     ucomisdr(r1, r0);
2014     return (jnae(i0));
2015 }
2016 dbopi(ungt)
2017
2018 static jit_word_t
2019 _sse_bltgtr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
2020 {
2021     ucomisdr(r0, r1);
2022     return (jne(i0));
2023 }
2024 dbopi(ltgt)
2025
2026 static jit_word_t
2027 _sse_bordr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
2028 {
2029     ucomisdr(r0, r1);
2030     return (jnp(i0));
2031 }
2032 dbopi(ord)
2033
2034 static jit_word_t
2035 _sse_bunordr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
2036 {
2037     ucomisdr(r0, r1);
2038     return (jp(i0));
2039 }
2040 dbopi(unord)
2041 #  undef fopi
2042 #  undef fbopi
2043 #  undef bopi
2044 #  undef dbopi
2045 #  undef fpr_bopi
2046 #  undef fpr_opi
2047 #endif