git subrepo pull --force deps/lightning
[pcsx_rearmed.git] / deps / lightning / lib / jit_x86-sse.c
1 /*
2  * Copyright (C) 2012-2022  Free Software Foundation, Inc.
3  *
4  * This file is part of GNU lightning.
5  *
6  * GNU lightning is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU Lesser General Public License as published
8  * by the Free Software Foundation; either version 3, or (at your option)
9  * any later version.
10  *
11  * GNU lightning is distributed in the hope that it will be useful, but
12  * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13  * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
14  * License for more details.
15  *
16  * Authors:
17  *      Paulo Cesar Pereira de Andrade
18  */
19
/* Declarations section: compiled only when PROTO is non-zero. */
20 #if PROTO
/*
 * sse_address_p(i0) is chosen per target configuration: constant 1 when
 * __X32 is set, a non-negative test on (jit_word_t)i0 when __X64_32 is
 * set, and can_sign_extend_int_p(i0) otherwise.
 */
21 #  if __X32
22 #    define sse_address_p(i0)           1
23 #  else
24 #    if __X64_32
25 #      define sse_address_p(i0)         ((jit_word_t)(i0) >= 0)
26 #    else
27 #      define sse_address_p(i0)         can_sign_extend_int_p(i0)
28 #    endif
29 #  endif
/* Numeric register indices for XMM6 through XMM15. */
30 #  define _XMM6_REGNO                   6
31 #  define _XMM7_REGNO                   7
32 #  define _XMM8_REGNO                   8
33 #  define _XMM9_REGNO                   9
34 #  define _XMM10_REGNO                  10
35 #  define _XMM11_REGNO                  11
36 #  define _XMM12_REGNO                  12
37 #  define _XMM13_REGNO                  13
38 #  define _XMM14_REGNO                  14
39 #  define _XMM15_REGNO                  15
/*
 * Opcode byte constants.  The emitters in this file write the selected
 * value immediately after a 0x0f byte.  Not every constant is referenced
 * by the macros visible in this part of the file.
 */
40 #define X86_SSE_MOV                     0x10
41 #define X86_SSE_MOV1                    0x11
42 #define X86_SSE_MOVLP                   0x12
43 #define X86_SSE_MOVHP                   0x16
44 #define X86_SSE_MOVA                    0x28
45 #define X86_SSE_CVTIS                   0x2a
46 #define X86_SSE_CVTTSI                  0x2c
47 #define X86_SSE_CVTSI                   0x2d
48 #define X86_SSE_UCOMI                   0x2e
49 #define X86_SSE_COMI                    0x2f
50 #define X86_SSE_ROUND                   0x3a
51 #define X86_SSE_SQRT                    0x51
52 #define X86_SSE_RSQRT                   0x52
53 #define X86_SSE_RCP                     0x53
54 #define X86_SSE_AND                     0x54
55 #define X86_SSE_ANDN                    0x55
56 #define X86_SSE_OR                      0x56
57 #define X86_SSE_XOR                     0x57
58 #define X86_SSE_ADD                     0x58
59 #define X86_SSE_MUL                     0x59
60 #define X86_SSE_CVTSD                   0x5a
61 #define X86_SSE_CVTDT                   0x5b
62 #define X86_SSE_SUB                     0x5c
63 #define X86_SSE_MIN                     0x5d
64 #define X86_SSE_DIV                     0x5e
65 #define X86_SSE_MAX                     0x5f
66 #define X86_SSE_X2G                     0x6e
67 #define X86_SSE_EQB                     0x74
68 #define X86_SSE_EQW                     0x75
69 #define X86_SSE_EQD                     0x76
70 #define X86_SSE_G2X                     0x7e
71 #define X86_SSE_MOV2                    0xd6
/*
 * Low-level emitters.  Each macro forwards to the underscore-prefixed
 * function with _jit as implicit first argument:
 *   sser(c, r0, r1)       opcode and two registers, no prefix byte
 *   ssexr(p, c, r0, r1)   same, with a leading prefix byte p
 *   ssexi(c, r0, m, i)    opcode, register, m value and immediate i
 */
72 #  define sser(c,r0,r1)                 _sser(_jit,c,r0,r1)
73 static void _sser(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
74 #  define ssexr(p,c,r0,r1)              _ssexr(_jit,p,c,r0,r1)
75 static void _ssexr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
76 #  define ssexi(c,r0,m,i)               _ssexi(_jit,c,r0,m,i)
77 static void _ssexi(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
/*
 * Instruction shorthands built on the emitters.  The ...ss forms pass
 * prefix 0xf3, the ...sd forms 0xf2, the ...pd forms 0x66, and the ...ps
 * forms go through sser() with no prefix.
 */
78 #  define addssr(r0, r1)                ssexr(0xf3, X86_SSE_ADD, r0, r1)
79 #  define addsdr(r0, r1)                ssexr(0xf2, X86_SSE_ADD, r0, r1)
80 #  define subssr(r0, r1)                ssexr(0xf3, X86_SSE_SUB, r0, r1)
81 #  define subsdr(r0, r1)                ssexr(0xf2, X86_SSE_SUB, r0, r1)
82 #  define mulssr(r0, r1)                ssexr(0xf3, X86_SSE_MUL, r0, r1)
83 #  define mulsdr(r0, r1)                ssexr(0xf2, X86_SSE_MUL, r0, r1)
84 #  define divssr(r0, r1)                ssexr(0xf3, X86_SSE_DIV, r0, r1)
85 #  define divsdr(r0, r1)                ssexr(0xf2, X86_SSE_DIV, r0, r1)
86 #  define andpsr(r0, r1)                sser(       X86_SSE_AND, r0, r1)
87 #  define andpdr(r0, r1)                ssexr(0x66, X86_SSE_AND, r0, r1)
88 #  define sse_truncr_f_i(r0, r1)        ssexr(0xf3, X86_SSE_CVTTSI, r0, r1)
89 #  define sse_truncr_d_i(r0, r1)        ssexr(0xf2, X86_SSE_CVTTSI, r0, r1)
/*
 * With __X64 set, the _l truncations and sse_extr_f/sse_extr_d go through
 * sselxr(); otherwise sse_extr_* use ssexr() and no _l truncation macros
 * are defined.
 */
90 #  if __X64
91 #    define sse_truncr_f_l(r0, r1)      sselxr(0xf3, X86_SSE_CVTTSI, r0, r1)
92 #    define sse_truncr_d_l(r0, r1)      sselxr(0xf2, X86_SSE_CVTTSI, r0, r1)
93 #    define sse_extr_f(r0, r1)          sselxr(0xf3, X86_SSE_CVTIS, r0, r1)
94 #    define sse_extr_d(r0, r1)          sselxr(0xf2, X86_SSE_CVTIS, r0, r1)
95 #  else
96 #    define sse_extr_f(r0, r1)          ssexr(0xf3, X86_SSE_CVTIS, r0, r1)
97 #    define sse_extr_d(r0, r1)          ssexr(0xf2, X86_SSE_CVTIS, r0, r1)
98 #  endif
99 #  define sse_extr_f_d(r0, r1)          ssexr(0xf3, X86_SSE_CVTSD, r0, r1)
100 #  define sse_extr_d_f(r0, r1)          ssexr(0xf2, X86_SSE_CVTSD, r0, r1)
101 #  define ucomissr(r0,r1)               sser(X86_SSE_UCOMI,r0,r1)
102 #  define ucomisdr(r0,r1)               ssexr(0x66,X86_SSE_UCOMI,r0,r1)
103 #  define xorpsr(r0,r1)                 sser(X86_SSE_XOR,r0,r1)
104 #  define xorpdr(r0,r1)                 ssexr(0x66,X86_SSE_XOR,r0,r1)
105 #  define movdlxr(r0,r1)                ssexr(0x66, X86_SSE_X2G,r0,r1)
106 #  define pcmpeqlr(r0, r1)              ssexr(0x66, X86_SSE_EQD, r0, r1)
/* Shifts by immediate: opcode 0x72 or 0x73 with 0x02 or 0x06 as the m value. */
107 #  define psrl(r0, i0)                  ssexi(0x72, r0, 0x02, i0)
108 #  define psrq(r0, i0)                  ssexi(0x73, r0, 0x02, i0)
109 #  define psll(r0, i0)                  ssexi(0x72, r0, 0x06, i0)
110 #  define pslq(r0, i0)                  ssexi(0x73, r0, 0x06, i0)
111 #  define movdqxr(r0,r1)                sselxr(0x66,X86_SSE_X2G,r0,r1)
/*
 * sselxr() has a dedicated emitter only when __X64 is set and __X64_32 is
 * not; in every other configuration it is an alias for ssexr().
 */
112 #  if __X64 && !__X64_32
113 #    define sselxr(p,c,r0,r1)           _sselxr(_jit,p,c,r0,r1)
114 static void
115 _sselxr(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t, jit_int32_t);
116 #  else
117 #    define sselxr(p,c,r0,r1)           ssexr(p,c,r0,r1)
118 #  endif
/*
 * Memory-operand emitter: ssexrx(prefix, opcode, md, rb, ri, ms, rd).
 * The ...mr macros use X86_SSE_MOV; the ...rm macros use X86_SSE_MOV1 and
 * take the register first but forward it as the last argument.
 */
119 #  define ssexrx(p,c,md,rb,ri,ms,rd)    _ssexrx(_jit,p,c,md,rb,ri,ms,rd)
120 #  define movssmr(md,rb,ri,ms,rd)       ssexrx(0xf3,X86_SSE_MOV,md,rb,ri,ms,rd)
121 #  define movsdmr(md,rb,ri,ms,rd)       ssexrx(0xf2,X86_SSE_MOV,md,rb,ri,ms,rd)
122 #  define movssrm(rs,md,mb,mi,ms)       ssexrx(0xf3,X86_SSE_MOV1,md,mb,mi,ms,rs)
123 #  define movsdrm(rs,md,mb,mi,ms)       ssexrx(0xf2,X86_SSE_MOV1,md,mb,mi,ms,rs)
124 static void
125 _ssexrx(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t,
126         jit_int32_t, jit_int32_t, jit_int32_t, jit_int32_t);
/*
 * Arithmetic declarations.  The ...r_f / ...r_d forms take three register
 * numbers; the ...i_f / ...i_d forms take two register numbers and a
 * pointer to a jit_float32_t / jit_float64_t constant.
 */
127 #  define sse_addr_f(r0, r1, r2)        _sse_addr_f(_jit, r0, r1, r2)
128 static void _sse_addr_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
129 #  define sse_addi_f(r0, r1, i0)        _sse_addi_f(_jit, r0, r1, i0)
130 static void _sse_addi_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
131 #  define sse_addr_d(r0, r1, r2)        _sse_addr_d(_jit, r0, r1, r2)
132 static void _sse_addr_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
133 #  define sse_addi_d(r0, r1, i0)        _sse_addi_d(_jit, r0, r1, i0)
134 static void _sse_addi_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
135 #  define sse_subr_f(r0, r1, r2)        _sse_subr_f(_jit, r0, r1, r2)
136 static void _sse_subr_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
137 #  define sse_subi_f(r0, r1, i0)        _sse_subi_f(_jit, r0, r1, i0)
138 static void _sse_subi_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
139 #  define sse_subr_d(r0, r1, r2)        _sse_subr_d(_jit, r0, r1, r2)
140 static void _sse_subr_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
141 #  define sse_subi_d(r0, r1, i0)        _sse_subi_d(_jit, r0, r1, i0)
142 static void _sse_subi_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
/* rsbr is subr with the two source registers swapped. */
143 #  define sse_rsbr_f(r0, r1, r2)        sse_subr_f(r0, r2, r1)
144 #  define sse_rsbi_f(r0, r1, i0)        _sse_rsbi_f(_jit, r0, r1, i0)
145 static void _sse_rsbi_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
146 #  define sse_rsbr_d(r0, r1, r2)        sse_subr_d(r0, r2, r1)
147 #  define sse_rsbi_d(r0, r1, i0)        _sse_rsbi_d(_jit, r0, r1, i0)
148 static void _sse_rsbi_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
149 #  define sse_mulr_f(r0, r1, r2)        _sse_mulr_f(_jit, r0, r1, r2)
150 static void _sse_mulr_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
151 #  define sse_muli_f(r0, r1, i0)        _sse_muli_f(_jit, r0, r1, i0)
152 static void _sse_muli_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
153 #  define sse_mulr_d(r0, r1, r2)        _sse_mulr_d(_jit, r0, r1, r2)
154 static void _sse_mulr_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
155 #  define sse_muli_d(r0, r1, i0)        _sse_muli_d(_jit, r0, r1, i0)
156 static void _sse_muli_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
157 #  define sse_divr_f(r0, r1, r2)        _sse_divr_f(_jit, r0, r1, r2)
158 static void _sse_divr_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
159 #  define sse_divi_f(r0, r1, i0)        _sse_divi_f(_jit, r0, r1, i0)
160 static void _sse_divi_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
161 #  define sse_divr_d(r0, r1, r2)        _sse_divr_d(_jit, r0, r1, r2)
162 static void _sse_divr_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
163 #  define sse_divi_d(r0, r1, i0)        _sse_divi_d(_jit, r0, r1, i0)
164 static void _sse_divi_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
/* Two-register operations; sqrtr expands directly to a single ssexr(). */
165 #  define sse_absr_f(r0, r1)            _sse_absr_f(_jit, r0, r1)
166 static void _sse_absr_f(jit_state_t*,jit_int32_t,jit_int32_t);
167 #  define sse_absr_d(r0, r1)            _sse_absr_d(_jit, r0, r1)
168 static void _sse_absr_d(jit_state_t*,jit_int32_t,jit_int32_t);
169 #  define sse_negr_f(r0, r1)            _sse_negr_f(_jit, r0, r1)
170 static void _sse_negr_f(jit_state_t*,jit_int32_t,jit_int32_t);
171 #  define sse_negr_d(r0, r1)            _sse_negr_d(_jit, r0, r1)
172 static void _sse_negr_d(jit_state_t*,jit_int32_t,jit_int32_t);
173 #  define sse_sqrtr_f(r0, r1)           ssexr(0xf3, X86_SSE_SQRT, r0, r1)
174 #  define sse_sqrtr_d(r0, r1)           ssexr(0xf2, X86_SSE_SQRT, r0, r1)
/*
 * Shared comparison helper: ssecmpf passes 0 and ssecmpd passes 1 as the
 * jit_bool_t argument of _ssecmp(); code is the condition code.
 */
175 #  define ssecmpf(code, r0, r1, r2)     _ssecmp(_jit, 0, code, r0, r1, r2)
176 #  define ssecmpd(code, r0, r1, r2)     _ssecmp(_jit, 1, code, r0, r1, r2)
177 static void
178 _ssecmp(jit_state_t*, jit_bool_t, jit_int32_t,
179         jit_int32_t, jit_int32_t, jit_int32_t);
/*
 * Float (_f) moves, comparisons, loads and stores.
 *
 * Register comparisons that are plain macros differ only in the condition
 * code and in whether r1/r2 reach ssecmpf() swapped:
 *   ltr / ler      X86_CC_A / X86_CC_AE, operands (r1, r2)
 *   gtr / ger      X86_CC_A / X86_CC_AE, operands (r2, r1)
 *   unltr / ungtr  X86_CC_NAE, operands (r2, r1) / (r1, r2)
 *   ordr / unordr  X86_CC_NP / X86_CC_P, operands (r2, r1)
 * The remaining register comparisons and every ...i_f form have their own
 * functions.
 */
180 #define sse_movr_f(r0,r1)               _sse_movr_f(_jit,r0,r1)
181 static void _sse_movr_f(jit_state_t*, jit_int32_t, jit_int32_t);
182 #define sse_movi_f(r0,i0)               _sse_movi_f(_jit,r0,i0)
183 static void _sse_movi_f(jit_state_t*, jit_int32_t, jit_float32_t*);
184 #  define sse_lti_f(r0, r1, i0)         _sse_lti_f(_jit, r0, r1, i0)
185 static void _sse_lti_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
186 #  define sse_ltr_f(r0, r1, r2)         ssecmpf(X86_CC_A, r0, r1, r2)
187 #  define sse_lei_f(r0, r1, i0)         _sse_lei_f(_jit, r0, r1, i0)
188 static void _sse_lei_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
189 #  define sse_ler_f(r0, r1, r2)         ssecmpf(X86_CC_AE, r0, r1, r2)
190 #  define sse_eqi_f(r0, r1, i0)         _sse_eqi_f(_jit, r0, r1, i0)
191 static void _sse_eqi_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
192 #  define sse_eqr_f(r0, r1, r2)         _sse_eqr_f(_jit, r0, r1, r2)
193 static void _sse_eqr_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
194 #  define sse_gei_f(r0, r1, i0)         _sse_gei_f(_jit, r0, r1, i0)
195 static void _sse_gei_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
196 #  define sse_ger_f(r0, r1, r2)         ssecmpf(X86_CC_AE, r0, r2, r1)
197 #  define sse_gti_f(r0, r1, i0)         _sse_gti_f(_jit, r0, r1, i0)
198 static void _sse_gti_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
199 #  define sse_gtr_f(r0, r1, r2)         ssecmpf(X86_CC_A, r0, r2, r1)
200 #  define sse_nei_f(r0, r1, i0)         _sse_nei_f(_jit, r0, r1, i0)
201 static void _sse_nei_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
202 #  define sse_ner_f(r0, r1, r2)         _sse_ner_f(_jit, r0, r1, r2)
203 static void _sse_ner_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
204 #  define sse_unlti_f(r0, r1, i0)       _sse_unlti_f(_jit, r0, r1, i0)
205 static void _sse_unlti_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
206 #  define sse_unltr_f(r0, r1, r2)       ssecmpf(X86_CC_NAE, r0, r2, r1)
207 #  define sse_unlei_f(r0, r1, i0)       _sse_unlei_f(_jit, r0, r1, i0)
208 static void _sse_unlei_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
/* Note: the prototype for _sse_unler_f follows the sse_uneqi_f pair below. */
209 #  define sse_unler_f(r0, r1, r2)       _sse_unler_f(_jit, r0, r1, r2)
210 #  define sse_uneqi_f(r0, r1, i0)       _sse_uneqi_f(_jit, r0, r1, i0)
211 static void _sse_uneqi_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
212 static void _sse_unler_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
213 #  define sse_uneqr_f(r0, r1, r2)       _sse_uneqr_f(_jit, r0, r1, r2)
214 static void _sse_uneqr_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
215 #  define sse_ungei_f(r0, r1, i0)       _sse_ungei_f(_jit, r0, r1, i0)
216 static void _sse_ungei_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
217 #  define sse_unger_f(r0, r1, r2)       _sse_unger_f(_jit, r0, r1, r2)
218 static void _sse_unger_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
219 #  define sse_ungti_f(r0, r1, i0)       _sse_ungti_f(_jit, r0, r1, i0)
220 static void _sse_ungti_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
221 #  define sse_ungtr_f(r0, r1, r2)       ssecmpf(X86_CC_NAE, r0, r1, r2)
222 #  define sse_ltgti_f(r0, r1, i0)       _sse_ltgti_f(_jit, r0, r1, i0)
223 static void _sse_ltgti_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
224 #  define sse_ltgtr_f(r0, r1, r2)       _sse_ltgtr_f(_jit, r0, r1, r2)
225 static void _sse_ltgtr_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
226 #  define sse_ordi_f(r0, r1, i0)        _sse_ordi_f(_jit, r0, r1, i0)
227 static void _sse_ordi_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
228 #  define sse_ordr_f(r0, r1, r2)        ssecmpf(X86_CC_NP, r0, r2, r1)
229 #  define sse_unordi_f(r0, r1, i0)      _sse_unordi_f(_jit, r0, r1, i0)
230 static void _sse_unordi_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_float32_t*);
231 #  define sse_unordr_f(r0, r1, r2)      ssecmpf(X86_CC_P, r0, r2, r1)
/*
 * Loads and stores.  ldr/str expand directly to movssmr/movssrm with
 * displacement 0, no index register (_NOREG) and scale _SCL1; the other
 * forms have dedicated functions.
 */
232 #  define sse_ldr_f(r0, r1)             movssmr(0, r1, _NOREG, _SCL1, r0)
233 #  define sse_ldi_f(r0, i0)             _sse_ldi_f(_jit, r0, i0)
234 static void _sse_ldi_f(jit_state_t*, jit_int32_t, jit_word_t);
235 #  define sse_ldxr_f(r0, r1, r2)        _sse_ldxr_f(_jit, r0, r1, r2)
236 static void _sse_ldxr_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
237 #  define sse_ldxi_f(r0, r1, i0)        _sse_ldxi_f(_jit, r0, r1, i0)
238 static void _sse_ldxi_f(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t);
239 #  define sse_str_f(r0, r1)             movssrm(r1, 0, r0, _NOREG, _SCL1)
240 #  define sse_sti_f(i0, r0)             _sse_sti_f(_jit, i0, r0)
241 static void _sse_sti_f(jit_state_t*, jit_word_t,jit_int32_t);
242 #  define sse_stxr_f(r0, r1, r2)        _sse_stxr_f(_jit, r0, r1, r2)
243 static void _sse_stxr_f(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
244 #  define sse_stxi_f(i0, r0, r1)        _sse_stxi_f(_jit, i0, r0, r1)
245 static void _sse_stxi_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
/*
 * Float (_f) compare-and-branch declarations.  Every helper takes a
 * jit_word_t i0 first and returns a jit_word_t; the ...r_f forms compare
 * two registers, the ...i_f forms compare a register with the constant
 * pointed to by i1.
 * NOTE(review): i0 is presumably the branch target and the return value
 * the address to patch later -- confirm against the definitions.
 */
246 #  define sse_bltr_f(i0, r0, r1)        _sse_bltr_f(_jit, i0, r0, r1)
247 static jit_word_t _sse_bltr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
248 #  define sse_blti_f(i0, r0, i1)        _sse_blti_f(_jit, i0, r0, i1)
249 static jit_word_t
250 _sse_blti_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
251 #  define sse_bler_f(i0, r0, r1)        _sse_bler_f(_jit, i0, r0, r1)
252 static jit_word_t _sse_bler_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
253 #  define sse_blei_f(i0, r0, i1)        _sse_blei_f(_jit, i0, r0, i1)
254 static jit_word_t
255 _sse_blei_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
256 #  define sse_beqr_f(i0, r0, r1)        _sse_beqr_f(_jit, i0, r0, r1)
257 static jit_word_t _sse_beqr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
258 #  define sse_beqi_f(i0, r0, i1)        _sse_beqi_f(_jit, i0, r0, i1)
259 static jit_word_t
260 _sse_beqi_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
261 #  define sse_bger_f(i0, r0, r1)        _sse_bger_f(_jit, i0, r0, r1)
262 static jit_word_t _sse_bger_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
263 #  define sse_bgei_f(i0, r0, i1)        _sse_bgei_f(_jit, i0, r0, i1)
264 static jit_word_t
265 _sse_bgei_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
266 #  define sse_bgtr_f(i0, r0, r1)        _sse_bgtr_f(_jit, i0, r0, r1)
267 static jit_word_t _sse_bgtr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
268 #  define sse_bgti_f(i0, r0, i1)        _sse_bgti_f(_jit, i0, r0, i1)
269 static jit_word_t
270 _sse_bgti_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
271 #  define sse_bner_f(i0, r0, r1)        _sse_bner_f(_jit, i0, r0, r1)
272 static jit_word_t _sse_bner_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
273 #  define sse_bnei_f(i0, r0, i1)        _sse_bnei_f(_jit, i0, r0, i1)
274 static jit_word_t
275 _sse_bnei_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
276 #  define sse_bunltr_f(i0, r0, r1)      _sse_bunltr_f(_jit, i0, r0, r1)
277 static jit_word_t _sse_bunltr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
278 #  define sse_bunlti_f(i0, r0, i1)      _sse_bunlti_f(_jit, i0, r0, i1)
279 static jit_word_t
280 _sse_bunlti_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
281 #  define sse_bunler_f(i0, r0, r1)      _sse_bunler_f(_jit, i0, r0, r1)
282 static jit_word_t _sse_bunler_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
283 #  define sse_bunlei_f(i0, r0, i1)      _sse_bunlei_f(_jit, i0, r0, i1)
284 static jit_word_t
285 _sse_bunlei_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
286 #  define sse_buneqr_f(i0, r0, r1)      _sse_buneqr_f(_jit, i0, r0, r1)
287 static jit_word_t _sse_buneqr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
288 #  define sse_buneqi_f(i0, r0, i1)      _sse_buneqi_f(_jit, i0, r0, i1)
289 static jit_word_t
290 _sse_buneqi_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
291 #  define sse_bunger_f(i0, r0, r1)      _sse_bunger_f(_jit, i0, r0, r1)
292 static jit_word_t _sse_bunger_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
293 #  define sse_bungei_f(i0, r0, i1)      _sse_bungei_f(_jit, i0, r0, i1)
294 static jit_word_t
295 _sse_bungei_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
296 #  define sse_bungtr_f(i0, r0, r1)      _sse_bungtr_f(_jit, i0, r0, r1)
297 static jit_word_t _sse_bungtr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
298 #  define sse_bungti_f(i0, r0, i1)      _sse_bungti_f(_jit, i0, r0, i1)
299 static jit_word_t
300 _sse_bungti_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
301 #  define sse_bltgtr_f(i0, r0, r1)      _sse_bltgtr_f(_jit, i0, r0, r1)
302 static jit_word_t _sse_bltgtr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
303 #  define sse_bltgti_f(i0, r0, i1)      _sse_bltgti_f(_jit, i0, r0, i1)
304 static jit_word_t
305 _sse_bltgti_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
306 #  define sse_bordr_f(i0, r0, r1)       _sse_bordr_f(_jit, i0, r0, r1)
307 static jit_word_t _sse_bordr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
308 #  define sse_bordi_f(i0, r0, i1)       _sse_bordi_f(_jit, i0, r0, i1)
309 static jit_word_t
310 _sse_bordi_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
311 #  define sse_bunordr_f(i0, r0, r1)     _sse_bunordr_f(_jit, i0, r0, r1)
312 static jit_word_t _sse_bunordr_f(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
313 #  define sse_bunordi_f(i0, r0, i1)     _sse_bunordi_f(_jit, i0, r0, i1)
314 static jit_word_t
315 _sse_bunordi_f(jit_state_t*, jit_word_t, jit_int32_t, jit_float32_t*);
/*
 * Double (_d) moves, comparisons, loads and stores.  The macro-only
 * register comparisons use ssecmpd() with the same condition codes and
 * operand orders as their _f counterparts:
 *   ltr / ler      X86_CC_A / X86_CC_AE, operands (r1, r2)
 *   gtr / ger      X86_CC_A / X86_CC_AE, operands (r2, r1)
 *   unltr / ungtr  X86_CC_NAE, operands (r2, r1) / (r1, r2)
 *   ordr / unordr  X86_CC_NP / X86_CC_P, operands (r2, r1)
 */
316 #define sse_movr_d(r0,r1)               _sse_movr_d(_jit,r0,r1)
317 static void _sse_movr_d(jit_state_t*, jit_int32_t, jit_int32_t);
318 #define sse_movi_d(r0,i0)               _sse_movi_d(_jit,r0,i0)
319 static void _sse_movi_d(jit_state_t*, jit_int32_t, jit_float64_t*);
320 #  define sse_ltr_d(r0, r1, r2)         ssecmpd(X86_CC_A, r0, r1, r2)
321 #  define sse_lti_d(r0, r1, i0)         _sse_lti_d(_jit, r0, r1, i0)
322 static void _sse_lti_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
323 #  define sse_ler_d(r0, r1, r2)         ssecmpd(X86_CC_AE, r0, r1, r2)
324 #  define sse_lei_d(r0, r1, i0)         _sse_lei_d(_jit, r0, r1, i0)
325 static void _sse_lei_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
326 #  define sse_eqr_d(r0, r1, r2)         _sse_eqr_d(_jit, r0, r1, r2)
327 static void _sse_eqr_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
328 #  define sse_eqi_d(r0, r1, i0)         _sse_eqi_d(_jit, r0, r1, i0)
329 static void _sse_eqi_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
330 #  define sse_ger_d(r0, r1, r2)         ssecmpd(X86_CC_AE, r0, r2, r1)
331 #  define sse_gei_d(r0, r1, i0)         _sse_gei_d(_jit, r0, r1, i0)
332 static void _sse_gei_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
333 #  define sse_gtr_d(r0, r1, r2)         ssecmpd(X86_CC_A, r0, r2, r1)
334 #  define sse_gti_d(r0, r1, i0)         _sse_gti_d(_jit, r0, r1, i0)
335 static void _sse_gti_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
336 #  define sse_ner_d(r0, r1, r2)         _sse_ner_d(_jit, r0, r1, r2)
337 static void _sse_ner_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
338 #  define sse_nei_d(r0, r1, i0)         _sse_nei_d(_jit, r0, r1, i0)
339 static void _sse_nei_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
340 #  define sse_unltr_d(r0, r1, r2)       ssecmpd(X86_CC_NAE, r0, r2, r1)
341 #  define sse_unlti_d(r0, r1, i0)       _sse_unlti_d(_jit, r0, r1, i0)
342 static void _sse_unlti_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
343 #  define sse_unler_d(r0, r1, r2)       _sse_unler_d(_jit, r0, r1, r2)
344 static void _sse_unler_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
345 #  define sse_unlei_d(r0, r1, i0)       _sse_unlei_d(_jit, r0, r1, i0)
346 static void _sse_unlei_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
347 #  define sse_uneqr_d(r0, r1, r2)       _sse_uneqr_d(_jit, r0, r1, r2)
348 static void _sse_uneqr_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
349 #  define sse_uneqi_d(r0, r1, i0)       _sse_uneqi_d(_jit, r0, r1, i0)
350 static void _sse_uneqi_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
351 #  define sse_unger_d(r0, r1, r2)       _sse_unger_d(_jit, r0, r1, r2)
352 static void _sse_unger_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
353 #  define sse_ungei_d(r0, r1, i0)       _sse_ungei_d(_jit, r0, r1, i0)
354 static void _sse_ungei_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
355 #  define sse_ungtr_d(r0, r1, r2)       ssecmpd(X86_CC_NAE, r0, r1, r2)
356 #  define sse_ungti_d(r0, r1, i0)       _sse_ungti_d(_jit, r0, r1, i0)
357 static void _sse_ungti_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
358 #  define sse_ltgtr_d(r0, r1, r2)       _sse_ltgtr_d(_jit, r0, r1, r2)
359 static void _sse_ltgtr_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
360 #  define sse_ltgti_d(r0, r1, i0)       _sse_ltgti_d(_jit, r0, r1, i0)
361 static void _sse_ltgti_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
362 #  define sse_ordr_d(r0, r1, r2)        ssecmpd(X86_CC_NP, r0, r2, r1)
363 #  define sse_ordi_d(r0, r1, i0)        _sse_ordi_d(_jit, r0, r1, i0)
364 static void _sse_ordi_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
365 #  define sse_unordr_d(r0, r1, r2)      ssecmpd(X86_CC_P, r0, r2, r1)
366 #  define sse_unordi_d(r0, r1, i0)      _sse_unordi_d(_jit, r0, r1, i0)
367 static void _sse_unordi_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_float64_t*);
/*
 * Loads and stores.  ldr/str expand directly to movsdmr/movsdrm with
 * displacement 0, no index register (_NOREG) and scale _SCL1.
 */
368 #  define sse_ldr_d(r0, r1)             movsdmr(0, r1, _NOREG, _SCL1, r0)
369 #  define sse_ldi_d(r0, i0)             _sse_ldi_d(_jit, r0, i0)
370 static void _sse_ldi_d(jit_state_t*, jit_int32_t, jit_word_t);
371 #  define sse_ldxr_d(r0, r1, r2)        _sse_ldxr_d(_jit, r0, r1, r2)
372 static void _sse_ldxr_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
373 #  define sse_ldxi_d(r0, r1, i0)        _sse_ldxi_d(_jit, r0, r1, i0)
374 static void _sse_ldxi_d(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t);
/* Note: the prototype for _sse_bltr_d follows the store declarations below. */
375 #  define sse_bltr_d(i0, r0, r1)        _sse_bltr_d(_jit, i0, r0, r1)
376 #  define sse_str_d(r0, r1)             movsdrm(r1, 0, r0, _NOREG, _SCL1)
377 #  define sse_sti_d(i0, r0)             _sse_sti_d(_jit, i0, r0)
378 static void _sse_sti_d(jit_state_t*, jit_word_t,jit_int32_t);
379 #  define sse_stxr_d(r0, r1, r2)        _sse_stxr_d(_jit, r0, r1, r2)
380 static void _sse_stxr_d(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
381 #  define sse_stxi_d(i0, r0, r1)        _sse_stxi_d(_jit, i0, r0, r1)
382 static void _sse_stxi_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
383 static jit_word_t _sse_bltr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
/*
 * Double (_d) compare-and-branch declarations.  Every helper takes a
 * jit_word_t i0 first and returns a jit_word_t; the ...r_d forms compare
 * two registers, the ...i_d forms compare a register with the constant
 * pointed to by i1.
 * NOTE(review): i0 is presumably the branch target and the return value
 * the address to patch later -- confirm against the definitions.
 */
384 #  define sse_blti_d(i0, r0, i1)        _sse_blti_d(_jit, i0, r0, i1)
385 static jit_word_t
386 _sse_blti_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
387 #  define sse_bler_d(i0, r0, r1)        _sse_bler_d(_jit, i0, r0, r1)
388 static jit_word_t _sse_bler_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
389 #  define sse_blei_d(i0, r0, i1)        _sse_blei_d(_jit, i0, r0, i1)
390 static jit_word_t
391 _sse_blei_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
392 #  define sse_beqr_d(i0, r0, r1)        _sse_beqr_d(_jit, i0, r0, r1)
393 static jit_word_t _sse_beqr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
394 #  define sse_beqi_d(i0, r0, i1)        _sse_beqi_d(_jit, i0, r0, i1)
395 static jit_word_t
396 _sse_beqi_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
397 #  define sse_bger_d(i0, r0, r1)        _sse_bger_d(_jit, i0, r0, r1)
398 static jit_word_t _sse_bger_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
399 #  define sse_bgei_d(i0, r0, i1)        _sse_bgei_d(_jit, i0, r0, i1)
400 static jit_word_t
401 _sse_bgei_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
402 #  define sse_bgtr_d(i0, r0, r1)        _sse_bgtr_d(_jit, i0, r0, r1)
403 static jit_word_t _sse_bgtr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
404 #  define sse_bgti_d(i0, r0, i1)        _sse_bgti_d(_jit, i0, r0, i1)
405 static jit_word_t
406 _sse_bgti_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
407 #  define sse_bner_d(i0, r0, r1)        _sse_bner_d(_jit, i0, r0, r1)
408 static jit_word_t _sse_bner_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
409 #  define sse_bnei_d(i0, r0, i1)        _sse_bnei_d(_jit, i0, r0, i1)
410 static jit_word_t
411 _sse_bnei_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
412 #  define sse_bunltr_d(i0, r0, r1)      _sse_bunltr_d(_jit, i0, r0, r1)
413 static jit_word_t _sse_bunltr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
414 #  define sse_bunlti_d(i0, r0, i1)      _sse_bunlti_d(_jit, i0, r0, i1)
415 static jit_word_t
416 _sse_bunlti_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
417 #  define sse_bunler_d(i0, r0, r1)      _sse_bunler_d(_jit, i0, r0, r1)
418 static jit_word_t _sse_bunler_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
419 #  define sse_bunlei_d(i0, r0, i1)      _sse_bunlei_d(_jit, i0, r0, i1)
420 static jit_word_t
421 _sse_bunlei_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
422 #  define sse_buneqr_d(i0, r0, r1)      _sse_buneqr_d(_jit, i0, r0, r1)
423 static jit_word_t _sse_buneqr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
424 #  define sse_buneqi_d(i0, r0, i1)      _sse_buneqi_d(_jit, i0, r0, i1)
425 static jit_word_t
426 _sse_buneqi_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
427 #  define sse_bunger_d(i0, r0, r1)      _sse_bunger_d(_jit, i0, r0, r1)
428 static jit_word_t _sse_bunger_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
429 #  define sse_bungei_d(i0, r0, i1)      _sse_bungei_d(_jit, i0, r0, i1)
430 static jit_word_t
431 _sse_bungei_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
432 #  define sse_bungtr_d(i0, r0, r1)      _sse_bungtr_d(_jit, i0, r0, r1)
433 static jit_word_t _sse_bungtr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
434 #  define sse_bungti_d(i0, r0, i1)      _sse_bungti_d(_jit, i0, r0, i1)
435 static jit_word_t
436 _sse_bungti_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
437 #  define sse_bltgtr_d(i0, r0, r1)      _sse_bltgtr_d(_jit, i0, r0, r1)
438 static jit_word_t _sse_bltgtr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
439 #  define sse_bltgti_d(i0, r0, i1)      _sse_bltgti_d(_jit, i0, r0, i1)
440 static jit_word_t
441 _sse_bltgti_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
442 #  define sse_bordr_d(i0, r0, r1)       _sse_bordr_d(_jit, i0, r0, r1)
443 static jit_word_t _sse_bordr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
444 #  define sse_bordi_d(i0, r0, i1)       _sse_bordi_d(_jit, i0, r0, i1)
445 static jit_word_t
446 _sse_bordi_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
447 #  define sse_bunordr_d(i0, r0, r1)     _sse_bunordr_d(_jit, i0, r0, r1)
448 static jit_word_t _sse_bunordr_d(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
449 #  define sse_bunordi_d(i0, r0, i1)     _sse_bunordi_d(_jit, i0, r0, i1)
450 static jit_word_t
451 _sse_bunordi_d(jit_state_t*, jit_word_t, jit_int32_t, jit_float64_t*);
/* End of the PROTO declarations section. */
452 #endif
453
454 #if CODE
/*
 * fpr_opi(name, type, size) generates _sse_<name>i_<type>(), the variant
 * of sse_<name>r_<type>() whose last operand is a pointer to a
 * jit_float<size>_t constant instead of a register.  The generated
 * function obtains a register with jit_get_reg(), asserts it is an SSE
 * register, fills it via sse_movi_<type>(), invokes the register form
 * with it as third operand and then hands the register back.
 */
455 #  define fpr_opi(name, type, size)                                     \
456 static void                                                             \
457 _sse_##name##i_##type(jit_state_t *_jit,                                \
458                       jit_int32_t r0, jit_int32_t r1,                   \
459                       jit_float##size##_t *i0)                          \
460 {                                                                       \
461     jit_int32_t         reg = jit_get_reg(jit_class_fpr|jit_class_xpr); \
462     assert(jit_sse_reg_p(reg));                                         \
463     sse_movi_##type(rn(reg), i0);                                       \
464     sse_##name##r_##type(r0, r1, rn(reg));                              \
465     jit_unget_reg(reg);                                                 \
466 }
/*
 * fpr_bopi(name, type, size) generates _sse_b<name>i_<type>(), the
 * branch variant that compares r0 with the jit_float<size>_t constant
 * pointed to by i1.  Unlike fpr_opi, the register request also carries
 * jit_class_nospill.  The constant is placed in that register with
 * sse_movi_<type>(), the register form sse_b<name>r_<type>() is invoked,
 * and its jit_word_t result is returned after the register is released.
 */
467 #  define fpr_bopi(name, type, size)                                    \
468 static jit_word_t                                                       \
469 _sse_b##name##i_##type(jit_state_t *_jit,                               \
470                        jit_word_t i0, jit_int32_t r0,                   \
471                        jit_float##size##_t *i1)                         \
472 {                                                                       \
473     jit_word_t          word;                                           \
474     jit_int32_t         reg = jit_get_reg(jit_class_fpr|jit_class_xpr|  \
475                                           jit_class_nospill);           \
476     assert(jit_sse_reg_p(reg));                                         \
477     sse_movi_##type(rn(reg), i1);                                       \
478     word = sse_b##name##r_##type(i0, r0, rn(reg));                      \
479     jit_unget_reg(reg);                                                 \
480     return (word);                                                      \
481 }
/*
 * Shorthands that fix the type suffix and float width: f with 32 for the
 * fopi/fbopi pair, d with 64 for the dopi/dbopi pair.
 */
482 #  define fopi(name)                    fpr_opi(name, f, 32)
483 #  define fbopi(name)                   fpr_bopi(name, f, 32)
484 #  define dopi(name)                    fpr_opi(name, d, 64)
485 #  define dbopi(name)                   fpr_bopi(name, d, 64)
486 static void
487 _sser(jit_state_t *_jit, jit_int32_t c, jit_int32_t r0, jit_int32_t r1)
488 {
489     rex(0, 0, r0, 0, r1);
490     ic(0x0f);
491     ic(c);
492     mrm(0x03, r7(r0), r7(r1));
493 }
494
495 static void
496 _ssexr(jit_state_t *_jit, jit_int32_t p, jit_int32_t c,
497        jit_int32_t r0, jit_int32_t r1)
498 {
499     ic(p);
500     rex(0, 0, r0, 0, r1);
501     ic(0x0f);
502     ic(c);
503     mrm(0x03, r7(r0), r7(r1));
504 }
505
506 static void
507 _ssexi(jit_state_t *_jit, jit_int32_t c, jit_int32_t r0,
508        jit_int32_t m, jit_int32_t i)
509 {
510     ic(0x66);
511     rex(0, 0, 0, 0, r0);
512     ic(0x0f);
513     ic(c);
514     mrm(0x03, r7(m), r7(r0));
515     ic(i);
516 }
517
#if __X64
/*
 * Prefixed register-register form that passes 1 as the second argument of
 * rex() (0 in the other encoders here); only built when __X64 is set.
 */
static void
_sselxr(jit_state_t *_jit, jit_int32_t p, jit_int32_t c,
        jit_int32_t r0, jit_int32_t r1)
{
    /* prefixes */
    ic(p);
    rex(0, 1, r0, 0, r1);

    /* two-byte opcode */
    ic(0x0f);
    ic(c);

    /* operands */
    mrm(0x03, r7(r0), r7(r1));
}
#endif
530
531 static void
532 _ssexrx(jit_state_t *_jit, jit_int32_t px, jit_int32_t code, jit_int32_t md,
533        jit_int32_t rb, jit_int32_t ri, jit_int32_t ms, jit_int32_t rd)
534 {
535     ic(px);
536     rex(0, 0, rd, ri, rb);
537     ic(0x0f);
538     ic(code);
539     rx(rd, md, rb, ri, ms);
540 }
541
542 static void
543 _sse_addr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
544 {
545     if (r0 == r1)
546         addssr(r0, r2);
547     else if (r0 == r2)
548         addssr(r0, r1);
549     else {
550         sse_movr_f(r0, r1);
551         addssr(r0, r2);
552     }
553 }
554
555 fopi(add)
556
557 static void
558 _sse_addr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
559 {
560     if (r0 == r1)
561         addsdr(r0, r2);
562     else if (r0 == r2)
563         addsdr(r0, r1);
564     else {
565         sse_movr_d(r0, r1);
566         addsdr(r0, r2);
567     }
568 }
569
570 dopi(add)
571
572 static void
573 _sse_subr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
574 {
575     jit_int32_t         reg;
576     if (r0 == r1)
577         subssr(r0, r2);
578     else if (r0 == r2) {
579         reg = jit_get_reg(jit_class_fpr|jit_class_xpr);
580         sse_movr_f(rn(reg), r0);
581         sse_movr_f(r0, r1);
582         subssr(r0, rn(reg));
583         jit_unget_reg(reg);
584     }
585     else {
586         sse_movr_f(r0, r1);
587         subssr(r0, r2);
588     }
589 }
590
591 fopi(sub)
592
593 static void
594 _sse_subr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
595 {
596     jit_int32_t         reg;
597     if (r0 == r1)
598         subsdr(r0, r2);
599     else if (r0 == r2) {
600         reg = jit_get_reg(jit_class_fpr|jit_class_xpr);
601         sse_movr_d(rn(reg), r0);
602         sse_movr_d(r0, r1);
603         subsdr(r0, rn(reg));
604         jit_unget_reg(reg);
605     }
606     else {
607         sse_movr_d(r0, r1);
608         subsdr(r0, r2);
609     }
610 }
611
612 dopi(sub)
613
614 fopi(rsb)
615
616 dopi(rsb)
617
618 static void
619 _sse_mulr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
620 {
621     if (r0 == r1)
622         mulssr(r0, r2);
623     else if (r0 == r2)
624         mulssr(r0, r1);
625     else {
626         sse_movr_f(r0, r1);
627         mulssr(r0, r2);
628     }
629 }
630
631 fopi(mul)
632
633 static void
634 _sse_mulr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
635 {
636     if (r0 == r1)
637         mulsdr(r0, r2);
638     else if (r0 == r2)
639         mulsdr(r0, r1);
640     else {
641         sse_movr_d(r0, r1);
642         mulsdr(r0, r2);
643     }
644 }
645
646 dopi(mul)
647
648 static void
649 _sse_divr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
650 {
651     jit_int32_t         reg;
652     if (r0 == r1)
653         divssr(r0, r2);
654     else if (r0 == r2) {
655         reg = jit_get_reg(jit_class_fpr|jit_class_xpr);
656         sse_movr_f(rn(reg), r0);
657         sse_movr_f(r0, r1);
658         divssr(r0, rn(reg));
659         jit_unget_reg(reg);
660     }
661     else {
662         sse_movr_f(r0, r1);
663         divssr(r0, r2);
664     }
665 }
666
667 fopi(div)
668
669 static void
670 _sse_divr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
671 {
672     jit_int32_t         reg;
673     if (r0 == r1)
674         divsdr(r0, r2);
675     else if (r0 == r2) {
676         reg = jit_get_reg(jit_class_fpr|jit_class_xpr);
677         sse_movr_d(rn(reg), r0);
678         sse_movr_d(r0, r1);
679         divsdr(r0, rn(reg));
680         jit_unget_reg(reg);
681     }
682     else {
683         sse_movr_d(r0, r1);
684         divsdr(r0, r2);
685     }
686 }
687
688 dopi(div)
689
690 static void
691 _sse_absr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
692 {
693     jit_int32_t         reg;
694     if (r0 == r1) {
695         reg = jit_get_reg(jit_class_fpr|jit_class_xpr);
696         pcmpeqlr(rn(reg), rn(reg));
697         psrl(rn(reg), 1);
698         andpsr(r0, rn(reg));
699         jit_unget_reg(reg);
700     }
701     else {
702         pcmpeqlr(r0, r0);
703         psrl(r0, 1);
704         andpsr(r0, r1);
705     }
706 }
707
708 static void
709 _sse_absr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
710 {
711     jit_int32_t         reg;
712     if (r0 == r1) {
713         reg = jit_get_reg(jit_class_fpr|jit_class_xpr);
714         pcmpeqlr(rn(reg), rn(reg));
715         psrq(rn(reg), 1);
716         andpdr(r0, rn(reg));
717         jit_unget_reg(reg);
718     }
719     else {
720         pcmpeqlr(r0, r0);
721         psrq(r0, 1);
722         andpdr(r0, r1);
723     }
724 }
725
726 static void
727 _sse_negr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
728 {
729     jit_int32_t         freg, ireg;
730     ireg = jit_get_reg(jit_class_gpr);
731     imovi(rn(ireg), 0x80000000);
732     if (r0 == r1) {
733         freg = jit_get_reg(jit_class_fpr|jit_class_xpr);
734         movdlxr(rn(freg), rn(ireg));
735         xorpsr(r0, rn(freg));
736         jit_unget_reg(freg);
737     }
738     else {
739         movdlxr(r0, rn(ireg));
740         xorpsr(r0, r1);
741     }
742     jit_unget_reg(ireg);
743 }
744
745 static void
746 _sse_negr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
747 {
748     jit_int32_t         freg, ireg;
749     ireg = jit_get_reg(jit_class_gpr);
750     imovi(rn(ireg), 0x80000000);
751     if (r0 == r1) {
752         freg = jit_get_reg(jit_class_fpr|jit_class_xpr);
753         movdlxr(rn(freg), rn(ireg));
754         pslq(rn(freg), 32);
755         xorpdr(r0, rn(freg));
756         jit_unget_reg(freg);
757     }
758     else {
759         movdlxr(r0, rn(ireg));
760         pslq(r0, 32);
761         xorpdr(r0, r1);
762     }
763     jit_unget_reg(ireg);
764 }
765
766 static void
767 _ssecmp(jit_state_t *_jit, jit_bool_t d, jit_int32_t code,
768         jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
769 {
770     jit_bool_t          rc;
771     jit_int32_t         reg;
772     if ((rc = reg8_p(r0)))
773         reg = r0;
774     else {
775         reg = _RAX_REGNO;
776         movr(r0, reg);
777     }
778     ixorr(reg, reg);
779     if (d)
780         ucomisdr(r2, r1);
781     else
782         ucomissr(r2, r1);
783     cc(code, reg);
784     if (!rc)
785         xchgr(r0, reg);
786 }
787
788 static void
789 _sse_movr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
790 {
791     if (r0 != r1)
792         ssexr(0xf3, X86_SSE_MOV, r0, r1);
793 }
794
795 static void
796 _sse_movi_f(jit_state_t *_jit, jit_int32_t r0, jit_float32_t *i0)
797 {
798     union {
799         jit_int32_t      i;
800         jit_float32_t    f;
801     } data;
802     jit_int32_t          reg;
803     jit_bool_t           ldi;
804
805     data.f = *i0;
806     if (data.f == 0.0 && !(data.i & 0x80000000))
807         xorpsr(r0, r0);
808     else {
809         ldi = !_jitc->no_data;
810 #if __X64
811         /* if will allocate a register for offset, just use immediate */
812         if (ldi && !sse_address_p(i0))
813             ldi = 0;
814 #endif
815         if (ldi)
816             sse_ldi_f(r0, (jit_word_t)i0);
817         else {
818             reg = jit_get_reg(jit_class_gpr);
819             movi(rn(reg), data.i);
820             movdlxr(r0, rn(reg));
821             jit_unget_reg(reg);
822         }
823     }
824 }
825
826 fopi(lt)
827 fopi(le)
828
829 static void
830 _sse_eqr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
831 {
832     jit_bool_t          rc;
833     jit_int32_t         reg;
834     jit_word_t          jp_code;
835     if ((rc = reg8_p(r0)))
836         reg = r0;
837     else {
838         reg = _RAX_REGNO;
839         movr(r0, _RAX_REGNO);
840     }
841     ixorr(reg, reg);
842     ucomissr(r2, r1);
843     jpes(0);
844     jp_code = _jit->pc.w;
845     cc(X86_CC_E, reg);
846     patch_rel_char(jp_code, _jit->pc.w);
847     if (!rc)
848         xchgr(r0, reg);
849 }
850
851 fopi(eq)
852 fopi(ge)
853 fopi(gt)
854
855 static void
856 _sse_ner_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
857 {
858     jit_bool_t          rc;
859     jit_int32_t         reg;
860     jit_word_t          jp_code;
861     if ((rc = reg8_p(r0)))
862         reg = r0;
863     else {
864         reg = _RAX_REGNO;
865         movr(r0, _RAX_REGNO);
866     }
867     imovi(reg, 1);
868     ucomissr(r2, r1);
869     jpes(0);
870     jp_code = _jit->pc.w;
871     cc(X86_CC_NE, reg);
872     patch_rel_char(jp_code, _jit->pc.w);
873     if (!rc)
874         xchgr(r0, reg);
875 }
876
877 fopi(ne)
878 fopi(unlt)
879
880 static void
881 _sse_unler_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
882 {
883     if (r1 == r2)
884         movi(r0, 1);
885     else
886         ssecmpf(X86_CC_NA, r0, r2, r1);
887 }
888
889 fopi(unle)
890
891 static void
892 _sse_uneqr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
893 {
894     if (r1 == r2)
895         movi(r0, 1);
896     else
897         ssecmpf(X86_CC_E, r0, r1, r2);
898 }
899
900 fopi(uneq)
901
902 static void
903 _sse_unger_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
904 {
905     if (r1 == r2)
906         movi(r0, 1);
907     else
908         ssecmpf(X86_CC_NA, r0, r1, r2);
909 }
910
911 fopi(unge)
912 fopi(ungt)
913
914 static void
915 _sse_ltgtr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
916 {
917     if (r1 == r2)
918         ixorr(r0, r0);
919     else
920         ssecmpf(X86_CC_NE, r0, r1, r2);
921 }
922
923 fopi(ltgt)
924 fopi(ord)
925 fopi(unord)
926
927 static void
928 _sse_ldi_f(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0)
929 {
930     jit_int32_t         reg;
931     if (sse_address_p(i0))
932         movssmr(i0, _NOREG, _NOREG, _SCL1, r0);
933     else {
934         reg = jit_get_reg(jit_class_gpr);
935         movi(rn(reg), i0);
936         sse_ldr_f(r0, rn(reg));
937         jit_unget_reg(reg);
938     }
939 }
940
941 static void
942 _sse_ldxr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
943 {
944 #if __X64_32
945     jit_int32_t         reg;
946     reg = jit_get_reg(jit_class_gpr);
947     addr(rn(reg), r1, r2);
948     sse_ldr_f(r0, rn(reg));
949     jit_unget_reg(reg);
950 #else
951     movssmr(0, r1, r2, _SCL1, r0);
952 #endif
953 }
954
955 static void
956 _sse_ldxi_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
957 {
958     jit_int32_t         reg;
959     if (can_sign_extend_int_p(i0))
960         movssmr(i0, r1, _NOREG, _SCL1, r0);
961     else {
962         reg = jit_get_reg(jit_class_gpr);
963 #if __X64_32
964         addi(rn(reg), r1, i0);
965         sse_ldr_f(r0, rn(reg));
966 #else
967         movi(rn(reg), i0);
968         sse_ldxr_f(r0, r1, rn(reg));
969 #endif
970         jit_unget_reg(reg);
971     }
972 }
973
974 static void
975 _sse_sti_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0)
976 {
977     jit_int32_t         reg;
978     if (sse_address_p(i0))
979         movssrm(r0, i0, _NOREG, _NOREG, _SCL1);
980     else {
981         reg = jit_get_reg(jit_class_gpr);
982         movi(rn(reg), i0);
983         sse_str_f(rn(reg), r0);
984         jit_unget_reg(reg);
985     }
986 }
987
988 static void
989 _sse_stxr_f(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
990 {
991 #if __X64_32
992     jit_int32_t         reg;
993     reg = jit_get_reg(jit_class_gpr);
994     addr(rn(reg), r0, r1);
995     sse_str_f(rn(reg), r2);
996     jit_unget_reg(reg);
997 #else
998     movssrm(r2, 0, r0, r1, _SCL1);
999 #endif
1000 }
1001
1002 static void
1003 _sse_stxi_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1004 {
1005     jit_int32_t         reg;
1006     if (can_sign_extend_int_p(i0))
1007         movssrm(r1, i0, r0, _NOREG, _SCL1);
1008     else {
1009         reg = jit_get_reg(jit_class_gpr);
1010 #if __X64_32
1011         addi(rn(reg), r0, i0);
1012         sse_str_f(rn(reg), r1);
1013 #else
1014         movi(rn(reg), i0);
1015         sse_stxr_f(rn(reg), r0, r1);
1016 #endif
1017         jit_unget_reg(reg);
1018     }
1019 }
1020
1021 static jit_word_t
1022 _sse_bltr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1023 {
1024     ucomissr(r1, r0);
1025     ja(i0);
1026     return (_jit->pc.w);
1027 }
1028 fbopi(lt)
1029
1030 static jit_word_t
1031 _sse_bler_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1032 {
1033     ucomissr(r1, r0);
1034     jae(i0);
1035     return (_jit->pc.w);
1036 }
1037 fbopi(le)
1038
1039 static jit_word_t
1040 _sse_beqr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1041 {
1042     jit_word_t          jp_code;
1043     ucomissr(r0, r1);
1044     jps(0);
1045     jp_code = _jit->pc.w;
1046     je(i0);
1047     patch_rel_char(jp_code, _jit->pc.w);
1048     return (_jit->pc.w);
1049 }
1050 fbopi(eq)
1051
1052 static jit_word_t
1053 _sse_bger_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1054 {
1055     ucomissr(r0, r1);
1056     jae(i0);
1057     return (_jit->pc.w);
1058 }
1059 fbopi(ge)
1060
1061 static jit_word_t
1062 _sse_bgtr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1063 {
1064     ucomissr(r0, r1);
1065     ja(i0);
1066     return (_jit->pc.w);
1067 }
1068 fbopi(gt)
1069
1070 static jit_word_t
1071 _sse_bner_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1072 {
1073     jit_word_t          jp_code;
1074     jit_word_t          jz_code;
1075     ucomissr(r0, r1);
1076     jps(0);
1077     jp_code = _jit->pc.w;
1078     jzs(0);
1079     jz_code = _jit->pc.w;
1080     patch_rel_char(jp_code, _jit->pc.w);
1081     jmpi(i0);
1082     patch_rel_char(jz_code, _jit->pc.w);
1083     return (_jit->pc.w);
1084 }
1085 fbopi(ne)
1086
1087 static jit_word_t
1088 _sse_bunltr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1089 {
1090     ucomissr(r0, r1);
1091     jnae(i0);
1092     return (_jit->pc.w);
1093 }
1094 fbopi(unlt)
1095
1096 static jit_word_t
1097 _sse_bunler_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1098 {
1099     if (r0 == r1)
1100         jmpi(i0);
1101     else {
1102         ucomissr(r0, r1);
1103         jna(i0);
1104     }
1105     return (_jit->pc.w);
1106 }
1107 fbopi(unle)
1108
1109 static jit_word_t
1110 _sse_buneqr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1111 {
1112     if (r0 == r1)
1113         jmpi(i0);
1114     else {
1115         ucomissr(r0, r1);
1116         je(i0);
1117     }
1118     return (_jit->pc.w);
1119 }
1120 fbopi(uneq)
1121
1122 static jit_word_t
1123 _sse_bunger_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1124 {
1125     if (r0 == r1)
1126         jmpi(i0);
1127     else {
1128         ucomissr(r1, r0);
1129         jna(i0);
1130     }
1131     return (_jit->pc.w);
1132 }
1133 fbopi(unge)
1134
1135 static jit_word_t
1136 _sse_bungtr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1137 {
1138     ucomissr(r1, r0);
1139     jnae(i0);
1140     return (_jit->pc.w);
1141 }
1142 fbopi(ungt)
1143
1144 static jit_word_t
1145 _sse_bltgtr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1146 {
1147     ucomissr(r0, r1);
1148     jne(i0);
1149     return (_jit->pc.w);
1150 }
1151 fbopi(ltgt)
1152
1153 static jit_word_t
1154 _sse_bordr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1155 {
1156     ucomissr(r0, r1);
1157     jnp(i0);
1158     return (_jit->pc.w);
1159 }
1160 fbopi(ord)
1161
1162 static jit_word_t
1163 _sse_bunordr_f(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1164 {
1165     ucomissr(r0, r1);
1166     jp(i0);
1167     return (_jit->pc.w);
1168 }
1169 fbopi(unord)
1170
1171 dopi(lt)
1172 dopi(le)
1173
1174 static void
1175 _sse_eqr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1176 {
1177     jit_bool_t          rc;
1178     jit_int32_t         reg;
1179     jit_word_t          jp_code;
1180     if ((rc = reg8_p(r0)))
1181         reg = r0;
1182     else {
1183         reg = _RAX_REGNO;
1184         movr(r0, _RAX_REGNO);
1185     }
1186     ixorr(reg, reg);
1187     ucomisdr(r2, r1);
1188     jpes(0);
1189     jp_code = _jit->pc.w;
1190     cc(X86_CC_E, reg);
1191     patch_rel_char(jp_code, _jit->pc.w);
1192     if (!rc)
1193         xchgr(r0, reg);
1194 }
1195
1196 dopi(eq)
1197 dopi(ge)
1198 dopi(gt)
1199
1200 static void
1201 _sse_ner_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1202 {
1203     jit_bool_t          rc;
1204     jit_int32_t         reg;
1205     jit_word_t          jp_code;
1206     if ((rc = reg8_p(r0)))
1207         reg = r0;
1208     else {
1209         reg = _RAX_REGNO;
1210         movr(r0, _RAX_REGNO);
1211     }
1212     imovi(reg, 1);
1213     ucomisdr(r2, r1);
1214     jpes(0);
1215     jp_code = _jit->pc.w;
1216     cc(X86_CC_NE, reg);
1217     patch_rel_char(jp_code, _jit->pc.w);
1218     if (!rc)
1219         xchgr(r0, reg);
1220 }
1221
1222 dopi(ne)
1223 dopi(unlt)
1224
1225 static void
1226 _sse_unler_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1227 {
1228     if (r1 == r2)
1229         movi(r0, 1);
1230     else
1231         ssecmpd(X86_CC_NA, r0, r2, r1);
1232 }
1233
1234 dopi(unle)
1235
1236 static void
1237 _sse_uneqr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1238 {
1239     if (r1 == r2)
1240         movi(r0, 1);
1241     else
1242         ssecmpd(X86_CC_E, r0, r1, r2);
1243 }
1244
1245 dopi(uneq)
1246
1247 static void
1248 _sse_unger_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1249 {
1250     if (r1 == r2)
1251         movi(r0, 1);
1252     else
1253         ssecmpd(X86_CC_NA, r0, r1, r2);
1254 }
1255
1256 dopi(unge)
1257 dopi(ungt)
1258
1259 static void
1260 _sse_ltgtr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1261 {
1262     if (r1 == r2)
1263         ixorr(r0, r0);
1264     else
1265         ssecmpd(X86_CC_NE, r0, r1, r2);
1266 }
1267
1268 dopi(ltgt)
1269 dopi(ord)
1270 dopi(unord)
1271
1272 static void
1273 _sse_movr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
1274 {
1275     if (r0 != r1)
1276         ssexr(0xf2, X86_SSE_MOV, r0, r1);
1277 }
1278
1279 static void
1280 _sse_movi_d(jit_state_t *_jit, jit_int32_t r0, jit_float64_t *i0)
1281 {
1282     union {
1283         jit_int32_t      ii[2];
1284         jit_word_t       w;
1285         jit_float64_t    d;
1286     } data;
1287     jit_int32_t          reg;
1288     jit_bool_t           ldi;
1289
1290     data.d = *i0;
1291     if (data.d == 0.0 && !(data.ii[1] & 0x80000000))
1292         xorpdr(r0, r0);
1293     else {
1294         ldi = !_jitc->no_data;
1295 #if __X64
1296         /* if will allocate a register for offset, just use immediate */
1297         if (ldi && !sse_address_p(i0))
1298             ldi = 0;
1299 #endif
1300         if (ldi)
1301             sse_ldi_d(r0, (jit_word_t)i0);
1302         else {
1303             reg = jit_get_reg(jit_class_gpr);
1304 #if __X64 && !__X64_32
1305             movi(rn(reg), data.w);
1306             movdqxr(r0, rn(reg));
1307             jit_unget_reg(reg);
1308 #else
1309             movi(rn(reg), data.ii[0]);
1310             stxi_i(CVT_OFFSET, _RBP_REGNO, rn(reg));
1311             movi(rn(reg), data.ii[1]);
1312             stxi_i(CVT_OFFSET + 4, _RBP_REGNO, rn(reg));
1313             jit_unget_reg(reg);
1314             sse_ldxi_d(r0, _RBP_REGNO, CVT_OFFSET);
1315 #endif
1316         }
1317     }
1318 }
1319
1320 static void
1321 _sse_ldi_d(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0)
1322 {
1323     jit_int32_t         reg;
1324     if (sse_address_p(i0))
1325         movsdmr(i0, _NOREG, _NOREG, _SCL1, r0);
1326     else {
1327         reg = jit_get_reg(jit_class_gpr);
1328         movi(rn(reg), i0);
1329         sse_ldr_d(r0, rn(reg));
1330         jit_unget_reg(reg);
1331     }
1332 }
1333
1334 static void
1335 _sse_ldxr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1336 {
1337 #if __X64_32
1338     jit_int32_t         reg;
1339     reg = jit_get_reg(jit_class_gpr);
1340     addr(rn(reg), r1, r2);
1341     sse_ldr_d(r0, rn(reg));
1342     jit_unget_reg(reg);
1343 #else
1344     movsdmr(0, r1, r2, _SCL1, r0);
1345 #endif
1346 }
1347
1348 static void
1349 _sse_ldxi_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
1350 {
1351     jit_int32_t         reg;
1352     if (can_sign_extend_int_p(i0))
1353         movsdmr(i0, r1, _NOREG, _SCL1, r0);
1354     else {
1355         reg = jit_get_reg(jit_class_gpr);
1356 #if __X64_32
1357         addi(rn(reg), r1, i0);
1358         sse_ldr_d(r0, rn(reg));
1359 #else
1360         movi(rn(reg), i0);
1361         sse_ldxr_d(r0, r1, rn(reg));
1362 #endif
1363         jit_unget_reg(reg);
1364     }
1365 }
1366
1367 static void
1368 _sse_sti_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0)
1369 {
1370     jit_int32_t         reg;
1371     if (sse_address_p(i0))
1372         movsdrm(r0, i0, _NOREG, _NOREG, _SCL1);
1373     else {
1374         reg = jit_get_reg(jit_class_gpr);
1375         movi(rn(reg), i0);
1376         sse_str_d(rn(reg), r0);
1377         jit_unget_reg(reg);
1378     }
1379 }
1380
1381 static void
1382 _sse_stxr_d(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
1383 {
1384 #if __X64_32
1385     jit_int32_t         reg;
1386     reg = jit_get_reg(jit_class_gpr);
1387     addr(rn(reg), r0, r1);
1388     sse_str_d(rn(reg), r2);
1389     jit_unget_reg(reg);
1390 #else
1391     movsdrm(r2, 0, r0, r1, _SCL1);
1392 #endif
1393 }
1394
1395 static void
1396 _sse_stxi_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1397 {
1398     jit_int32_t         reg;
1399     if (can_sign_extend_int_p(i0))
1400         movsdrm(r1, i0, r0, _NOREG, _SCL1);
1401     else {
1402         reg = jit_get_reg(jit_class_gpr);
1403 #if __X64_32
1404         addi(rn(reg), r0, i0);
1405         sse_str_d(rn(reg), r1);
1406 #else
1407         movi(rn(reg), i0);
1408         sse_stxr_f(rn(reg), r0, r1);
1409 #endif
1410         jit_unget_reg(reg);
1411     }
1412 }
1413
1414 static jit_word_t
1415 _sse_bltr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1416 {
1417     ucomisdr(r1, r0);
1418     ja(i0);
1419     return (_jit->pc.w);
1420 }
1421 dbopi(lt)
1422
1423 static jit_word_t
1424 _sse_bler_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1425 {
1426     ucomisdr(r1, r0);
1427     jae(i0);
1428     return (_jit->pc.w);
1429 }
1430 dbopi(le)
1431
1432 static jit_word_t
1433 _sse_beqr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1434 {
1435     jit_word_t          jp_code;
1436     ucomisdr(r0, r1);
1437     jps(0);
1438     jp_code = _jit->pc.w;
1439     je(i0);
1440     patch_rel_char(jp_code, _jit->pc.w);
1441     return (_jit->pc.w);
1442 }
1443 dbopi(eq)
1444
1445 static jit_word_t
1446 _sse_bger_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1447 {
1448     ucomisdr(r0, r1);
1449     jae(i0);
1450     return (_jit->pc.w);
1451 }
1452 dbopi(ge)
1453
1454 static jit_word_t
1455 _sse_bgtr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1456 {
1457     ucomisdr(r0, r1);
1458     ja(i0);
1459     return (_jit->pc.w);
1460 }
1461 dbopi(gt)
1462
1463 static jit_word_t
1464 _sse_bner_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1465 {
1466     jit_word_t          jp_code;
1467     jit_word_t          jz_code;
1468     ucomisdr(r0, r1);
1469     jps(0);
1470     jp_code = _jit->pc.w;
1471     jzs(0);
1472     jz_code = _jit->pc.w;
1473     patch_rel_char(jp_code, _jit->pc.w);
1474     jmpi(i0);
1475     patch_rel_char(jz_code, _jit->pc.w);
1476     return (_jit->pc.w);
1477 }
1478 dbopi(ne)
1479
1480 static jit_word_t
1481 _sse_bunltr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1482 {
1483     ucomisdr(r0, r1);
1484     jnae(i0);
1485     return (_jit->pc.w);
1486 }
1487 dbopi(unlt)
1488
1489 static jit_word_t
1490 _sse_bunler_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1491 {
1492     if (r0 == r1)
1493         jmpi(i0);
1494     else {
1495         ucomisdr(r0, r1);
1496         jna(i0);
1497     }
1498     return (_jit->pc.w);
1499 }
1500 dbopi(unle)
1501
1502 static jit_word_t
1503 _sse_buneqr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1504 {
1505     if (r0 == r1)
1506         jmpi(i0);
1507     else {
1508         ucomisdr(r0, r1);
1509         je(i0);
1510     }
1511     return (_jit->pc.w);
1512 }
1513 dbopi(uneq)
1514
1515 static jit_word_t
1516 _sse_bunger_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1517 {
1518     if (r0 == r1)
1519         jmpi(i0);
1520     else {
1521         ucomisdr(r1, r0);
1522         jna(i0);
1523     }
1524     return (_jit->pc.w);
1525 }
1526 dbopi(unge)
1527
1528 static jit_word_t
1529 _sse_bungtr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1530 {
1531     ucomisdr(r1, r0);
1532     jnae(i0);
1533     return (_jit->pc.w);
1534 }
1535 dbopi(ungt)
1536
1537 static jit_word_t
1538 _sse_bltgtr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1539 {
1540     ucomisdr(r0, r1);
1541     jne(i0);
1542     return (_jit->pc.w);
1543 }
1544 dbopi(ltgt)
1545
1546 static jit_word_t
1547 _sse_bordr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1548 {
1549     ucomisdr(r0, r1);
1550     jnp(i0);
1551     return (_jit->pc.w);
1552 }
1553 dbopi(ord)
1554
1555 static jit_word_t
1556 _sse_bunordr_d(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
1557 {
1558     ucomisdr(r0, r1);
1559     jp(i0);
1560     return (_jit->pc.w);
1561 }
1562 dbopi(unord)
/*
 * Drop the generator macros defined at the top of this section so they do
 * not leak past this file: the four shorthands (fopi, fbopi, dopi, dbopi)
 * and the two templates they expand to.
 */
#  undef fopi
#  undef fbopi
#  undef dopi
#  undef dbopi
#  undef fpr_bopi
#  undef fpr_opi
1569 #endif