cdrom: change pause timing again
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / vector_ops.h
1 /*
2  * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License as
6  * published by the Free Software Foundation; either version 2 of
7  * the License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  */
14
15 #ifndef VECTOR_OPS
16 #define VECTOR_OPS
17
18 #include "vector_types.h"
19
20
/*
 * Run `operation` once for every lane index 0 .. iterations - 1.
 *
 * The expansion is a brace-enclosed block that declares the loop counter
 * `_i` (a u32); `operation` is pasted verbatim into the loop body and is
 * expected to use `_i` as the lane index.  `iterations` is parenthesized so
 * that an argument such as `n & 3` is compared as a whole, and it is
 * re-evaluated on every pass, so it should have no side effects.
 */
#define foreach_element(iterations, operation) \
{ \
  u32 _i; \
  for(_i = 0; _i < (iterations); _i++) \
  { \
    operation; \
  } \
}
29
/*
 * Copy 64 bits from the memory at `source` into the start of dest's element
 * array.  Both sides are accessed through a u64 pointer cast.
 */
#define load_64b(dest, source) \
 *((u64 *)(dest).e) = *((u64 *)(source))

/*
 * Copy 128 bits (two consecutive u64 words) from `source` into dest's element
 * array.  The two stores are joined with the comma operator so the expansion
 * is a single expression: under an unbraced if/else/for the whole copy is
 * governed by the condition, not just the first word.
 */
#define load_128b(dest, source) \
 (*((u64 *)(dest).e) = *((u64 *)(source)), \
  *((u64 *)(dest).e + 1) = *(((u64 *)(source)) + 1))

/*
 * Read eight consecutive u16 values starting at `source` and assign them to
 * elements 0..7 of dest.
 */
#define load_8x16b(dest, source) \
  foreach_element(8, (dest).e[_i] = ((u16 *)(source))[_i])
39
/*
 * Copy the first 64 bits of source's element array to the memory at `dest`.
 * Both sides are accessed through a u64 pointer cast.
 */
#define store_64b(source, dest) \
 *((u64 *)(dest)) = *((u64 *)(source).e)

/*
 * Copy 128 bits (two consecutive u64 words) of source's element array to the
 * memory at `dest`.  Written as one comma expression so that the macro is a
 * single statement when used under an unbraced if/else/for.
 */
#define store_128b(source, dest) \
 (*((u64 *)(dest)) = *((u64 *)(source).e), \
  *(((u64 *)(dest)) + 1) = *((u64 *)(source).e + 1))

/*
 * Write elements 0..7 of source to eight consecutive u16 slots starting at
 * `dest`.  `dest` is parenthesized before the cast so that an argument like
 * `base + 8` is offset in its own units first and only then viewed as u16 *.
 */
#define store_8x16b(source, dest) \
  foreach_element(8, ((u16 *)(dest))[_i] = (source).e[_i])
49
50
/*
 * For each of the eight source elements, write the element itself to
 * dest[2 * i] and the element shifted right by 8 to dest[2 * i + 1].
 * Each value is converted to dest's element type by the assignment.
 */
#define split_8x16b(dest, source) \
  foreach_element(8, \
  { \
    (dest).e[_i * 2] = (source).e[_i]; \
    (dest).e[(_i * 2) + 1] = (source).e[_i] >> 8; \
  })
57
/*
 * Inverse pairing of split_8x16b: dest[i] receives source[2 * i] OR-ed with
 * source[2 * i + 1] shifted left by 8, for i = 0..7.
 */
#define merge_16x8b(dest, source) \
  foreach_element(8, \
   (dest).e[_i] = \
    (source).e[_i * 2] | ((source).e[(_i * 2) + 1] << 8))
61
/*
 * Reinterpret the storage of `source` as an lvalue of type `vec_to`.  The
 * access goes through a volatile-qualified pointer.
 */
#define vector_cast(vec_to, source) \
  (*((volatile vec_to *)(&(source))))

/*
 * Reinterpret the upper half of source's element array as an lvalue of type
 * `vec_to`: the address is (source).e advanced by half of sizeof((source).e)
 * bytes.  `source` is parenthesized so that arguments such as `*ptr` work.
 */
#define vector_cast_high(vec_to, source) \
  (*((volatile vec_to *)((u8 *)(source).e + (sizeof((source).e) / 2))))
67
68
/*
 * dup_<lanes>x<bits>b(dest, value): assign `value` to elements
 * 0 .. <lanes> - 1 of dest.  `value` is evaluated once per lane and converted
 * to dest's element type by the assignment; the macros differ only in the
 * lane count.
 */
#define dup_8x8b(dest, value) \
  foreach_element(8, (dest).e[_i] = value)

#define dup_16x8b(dest, value) \
  foreach_element(16, (dest).e[_i] = value)

#define dup_4x16b(dest, value) \
  foreach_element(4, (dest).e[_i] = value)

#define dup_8x16b(dest, value) \
  foreach_element(8, (dest).e[_i] = value)

#define dup_2x32b(dest, value) \
  foreach_element(2, (dest).e[_i] = value)

#define dup_4x32b(dest, value) \
  foreach_element(4, (dest).e[_i] = value)
86
/*
 * Per-lane shifts by a compile-site `shift` expression.
 *
 * Each macro walks the stated number of lanes and stores
 * source[i] >> shift (or << shift) into dest[i].  Where a cast is present the
 * source element is first converted to that unsigned type, so the right shift
 * operates on the unsigned value.  Any narrowing of the result happens in the
 * assignment to dest's element type.
 */

/* 8 lanes, source element cast to u16 before the right shift. */
#define shr_narrow_8x16b(dest, source, shift) \
  foreach_element(8, \
   (dest).e[_i] = (u16)(source).e[_i] >> (shift))

/* 2 lanes, source element shifted right in its own type. */
#define shr_narrow_2x64b(dest, source, shift) \
  foreach_element(2, \
   (dest).e[_i] = (source).e[_i] >> (shift))

/* 8 lanes, source element cast to u8 before the right shift. */
#define shr_8x8b(dest, source, shift) \
  foreach_element(8, \
   (dest).e[_i] = (u8)(source).e[_i] >> (shift))

/* 8 lanes, left shift. */
#define shl_8x8b(dest, source, shift) \
  foreach_element(8, \
   (dest).e[_i] = (source).e[_i] << (shift))

/* 8 lanes, source element cast to u16 before the right shift. */
#define shr_8x16b(dest, source, shift) \
  foreach_element(8, \
   (dest).e[_i] = (u16)(source).e[_i] >> (shift))

/* 2 lanes, source element cast to u32 before the right shift. */
#define shr_2x32b(dest, source, shift) \
  foreach_element(2, \
   (dest).e[_i] = (u32)(source).e[_i] >> (shift))

/* 4 lanes, source element cast to u16 before the right shift. */
#define shr_4x16b(dest, source, shift) \
  foreach_element(4, \
   (dest).e[_i] = (u16)(source).e[_i] >> (shift))

/* 4 lanes, source element cast to u32 before the left shift. */
#define shl_4x16b(dest, source, shift) \
  foreach_element(4, \
   (dest).e[_i] = (u32)(source).e[_i] << (shift))

/* 4 lanes, source element cast to u32 before the right shift. */
#define shr_4x32b(dest, source, shift) \
  foreach_element(4, \
   (dest).e[_i] = (u32)(source).e[_i] >> (shift))

/* 4 lanes, source element cast to u32 before the right shift. */
#define shr_narrow_4x32b(dest, source, shift) \
  foreach_element(4, \
   (dest).e[_i] = (u32)(source).e[_i] >> (shift))
116
/*
 * Per-lane left shifts by `shift`: dest[i] = source[i] << shift.  The shift
 * is performed on the (promoted) source element and the result is converted
 * to dest's element type by the assignment.
 */
#define shl_8x16b(dest, source, shift) \
  foreach_element(8, \
   (dest).e[_i] = (source).e[_i] << (shift))

#define shl_4x32b(dest, source, shift) \
  foreach_element(4, \
   (dest).e[_i] = (source).e[_i] << (shift))

#define shl_2x32b(dest, source, shift) \
  foreach_element(2, \
   (dest).e[_i] = (source).e[_i] << (shift))

/* Single lane: only element 0 is read and written; usable as an expression. */
#define shl_1x64b(dest, source, shift) \
  ((dest).e[0] = (source).e[0] << (shift))

#define shl_2x64b(dest, source, shift) \
  foreach_element(2, \
   (dest).e[_i] = (source).e[_i] << (shift))
131
/*
 * Per-lane left shift where the shift amount comes from a second vector:
 * dest[i] = source_a[i] << (source_b[i] & 0xFF).  Only the low eight bits of
 * each source_b element are used as the shift count.
 */
#define shl_variable_2x64b(dest, source_a, source_b) \
  foreach_element(2, (dest).e[_i] = \
   (source_a).e[_i] << ((source_b).e[_i] & 0xFF))

#define shl_variable_8x16b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = \
   (source_a).e[_i] << ((source_b).e[_i] & 0xFF))

#define shl_variable_4x16b(dest, source_a, source_b) \
  foreach_element(4, (dest).e[_i] = \
   (source_a).e[_i] << ((source_b).e[_i] & 0xFF))
143
/* Single lane right shift: dest[0] = source[0] >> shift; an expression. */
#define shr_1x64b(dest, source, shift) \
  ((dest).e[0] = (source).e[0] >> (shift))

/*
 * "Long" left shifts: dest[i] = source[i] << shift for 8 and 4 lanes.  The
 * expression is the same as the plain shl_* macros; any widening comes from
 * dest having a wider element type than source (types are declared outside
 * this file).
 */
#define shl_long_8x8b(dest, source, shift) \
  foreach_element(8, \
   (dest).e[_i] = (source).e[_i] << (shift))

#define shl_long_4x16b(dest, source, shift) \
  foreach_element(4, \
   (dest).e[_i] = (source).e[_i] << (shift))
152
/*
 * Saturating signed shift-right-and-narrow over 8 lanes.
 *
 * Each source element is reinterpreted as s16, shifted right by `shift`, and
 * the result is clamped to the range 0 .. 0xFF before being stored in dest.
 * `shift` is parenthesized so that low-precedence argument expressions
 * (for example a conditional expression) are used as the shift count as a
 * whole.  The expansion declares a local named `result`.
 */
#define shrq_narrow_signed_8x16b(dest, source, shift) \
  foreach_element(8, \
  { \
    s32 result = ((s16)(source).e[_i]) >> (shift); \
    if(result < 0) \
      result = 0; \
    if(result > 0xFF) \
      result = 0xFF; \
    (dest).e[_i] = result; \
  })
163
/*
 * Per-lane shift by a signed, per-lane amount.
 *
 * For every lane the shift count is source_b[i] converted to s8.  A negative
 * count shifts source_a[i] right by its magnitude, otherwise source_a[i] is
 * shifted left by the count.  `dest` is parenthesized like the other
 * arguments so that expressions such as `*ptr` can be passed.  The expansion
 * declares a local named `shift`.
 */
#define shl_reg_4x32b(dest, source_a, source_b) \
  foreach_element(4, \
  { \
    s8 shift  = (source_b).e[_i]; \
    if(shift < 0) \
      (dest).e[_i] = (source_a).e[_i] >> (-shift); \
    else \
      (dest).e[_i] = (source_a).e[_i] << shift; \
  })

/* Two-lane variant of shl_reg_4x32b. */
#define shl_reg_2x32b(dest, source_a, source_b) \
  foreach_element(2, \
  { \
    s8 shift  = (source_b).e[_i]; \
    if(shift < 0) \
      (dest).e[_i] = (source_a).e[_i] >> (-shift); \
    else \
      (dest).e[_i] = (source_a).e[_i] << shift; \
  })

/* Two-lane variant of shl_reg_4x32b, named for 64-bit lanes. */
#define shl_reg_2x64b(dest, source_a, source_b) \
  foreach_element(2, \
  { \
    s8 shift  = (source_b).e[_i]; \
    if(shift < 0) \
      (dest).e[_i] = (source_a).e[_i] >> (-shift); \
    else \
      (dest).e[_i] = (source_a).e[_i] << shift; \
  })
193
194
/*
 * Shift right and insert, 8 lanes.  dest[i] keeps the bits selected by
 * ~(0xFF >> shift) and has (u8)source[i] >> shift OR-ed into it.
 */
#define sri_8x8b(dest, source, shift) \
  foreach_element(8, \
   (dest).e[_i] = ((dest).e[_i] & ~(0xFF >> (shift))) | \
    ((u8)(source).e[_i] >> (shift)))

/*
 * Shift left and insert, 8 lanes.  dest[i] keeps the bits selected by
 * ~(0xFF << shift) and has source[i] << shift OR-ed into it.
 */
#define sli_8x8b(dest, source, shift) \
  foreach_element(8, \
   (dest).e[_i] = ((dest).e[_i] & ~(0xFF << (shift))) | \
    ((source).e[_i] << (shift)))
202
203
204
/*
 * Lane-by-lane copies: dest[i] = source[i].  The "narrow" and "wide" variants
 * use the same expression; the conversion is whatever the assignment between
 * the two element types performs (those types are declared outside this
 * file).
 */
#define mov_narrow_8x16b(dest, source) \
  foreach_element(8, (dest).e[_i] = (source).e[_i])

#define mov_narrow_4x32b(dest, source) \
  foreach_element(4, (dest).e[_i] = (source).e[_i])

#define mov_narrow_2x64b(dest, source) \
  foreach_element(2, (dest).e[_i] = (source).e[_i])

#define mov_wide_8x8b(dest, source) \
  foreach_element(8, (dest).e[_i] = (source).e[_i])

#define mov_wide_2x32b(dest, source) \
  foreach_element(2, (dest).e[_i] = (source).e[_i])

/* Bitwise NOT of each of the four lanes: dest[i] = ~source[i]. */
#define mvn_4x16b(dest, source) \
  foreach_element(4, (dest).e[_i] = ~((source).e[_i]))
222
/*
 * Lane-by-lane addition: dest[i] = source_a[i] + source_b[i].  The sum is
 * converted to dest's element type by the assignment.  The macros differ only
 * in how many lanes are processed.
 */
#define add_4x16b(dest, source_a, source_b) \
  foreach_element(4, \
   (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])

#define add_4x32b(dest, source_a, source_b) \
  foreach_element(4, \
   (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])

#define add_2x32b(dest, source_a, source_b) \
  foreach_element(2, \
   (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])

#define add_8x16b(dest, source_a, source_b) \
  foreach_element(8, \
   (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])

#define add_16x8b(dest, source_a, source_b) \
  foreach_element(16, \
   (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])

#define add_8x8b(dest, source_a, source_b) \
  foreach_element(8, \
   (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])

/* Single lane: only element 0 is read and written. */
#define add_1x64b(dest, source_a, source_b) \
  (dest).e[0] = (source_a).e[0] + (source_b).e[0]

#define add_2x64b(dest, source_a, source_b) \
  foreach_element(2, \
   (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])
246
/*
 * Add two lanes and keep the upper half of each sum:
 * dest[i] = (source_a[i] + source_b[i]) >> 32.
 *
 * The shift is applied to the sum before the assignment, matching the shape
 * of add_high_narrow_4x32b.  (Previously the closing parenthesis sat before
 * the shift, so the unshifted sum was assigned and the shifted value was
 * discarded.)
 */
#define add_high_narrow_2x64b(dest, source_a, source_b) \
  foreach_element(2, \
   ((dest).e[_i] = ((source_a).e[_i] + (source_b).e[_i]) >> 32))
250
/*
 * Add four lanes and keep the sum shifted right by 16:
 * dest[i] = (source_a[i] + source_b[i]) >> 16.
 */
#define add_high_narrow_4x32b(dest, source_a, source_b) \
  foreach_element(4, ((dest).e[_i] = \
   ((source_a).e[_i] + (source_b).e[_i]) >> 16))
254
/*
 * Lane-by-lane subtraction: dest[i] = source_a[i] - source_b[i], converted to
 * dest's element type by the assignment.
 */
#define sub_4x16b(dest, source_a, source_b) \
  foreach_element(4, \
   (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])

#define sub_4x32b(dest, source_a, source_b) \
  foreach_element(4, \
   (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])

#define sub_2x32b(dest, source_a, source_b) \
  foreach_element(2, \
   (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])

/*
 * "Wide" subtract/add: same expressions as above.  Any widening comes from
 * dest having a wider element type than the sources (declared outside this
 * file).
 */
#define sub_wide_8x8b(dest, source_a, source_b) \
  foreach_element(8, \
   (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])

#define add_wide_8x8b(dest, source_a, source_b) \
  foreach_element(8, \
   (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])

#define add_wide_2x32b(dest, source_a, source_b) \
  foreach_element(2, \
   (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])
272
/*
 * Saturating add over 8 lanes: the sum is computed in a u32 and any value
 * above 0xFF is replaced by 0xFF before it is stored in dest.
 */
#define addq_8x8b(dest, source_a, source_b) \
  foreach_element(8, \
  { \
    u32 result = (source_a).e[_i] + (source_b).e[_i]; \
    (dest).e[_i] = (result > 0xFF) ? 0xFF : result; \
  })

/*
 * Saturating subtract over 8 lanes: the difference is computed in a u32 and
 * any value above 0xFF (which includes a wrapped-around negative difference)
 * is replaced by 0 before it is stored in dest.
 */
#define subq_8x8b(dest, source_a, source_b) \
  foreach_element(8, \
  { \
    u32 result = (source_a).e[_i] - (source_b).e[_i]; \
    (dest).e[_i] = (result > 0xFF) ? 0 : result; \
  })
290
/*
 * Forwards to subs_8x8b with the same arguments.
 * NOTE(review): subs_8x8b is not defined in the part of the file visible
 * here - confirm it exists before using this macro.
 */
#define subs_long_8x8b(dest, source_a, source_b) \
  subs_8x8b(dest, source_a, source_b)

/*
 * Saturating subtract over 16 lanes: the difference is computed in a u32 and
 * any value above 0xFF (including a wrapped-around negative difference) is
 * replaced by 0.
 */
#define subs_16x8b(dest, source_a, source_b) \
  foreach_element(16, \
  { \
    u32 result = (source_a).e[_i] - (source_b).e[_i]; \
    (dest).e[_i] = (result > 0xFF) ? 0 : result; \
  })

/*
 * Saturating subtract over 8 lanes: the difference is computed in an s32 and
 * a negative value is replaced by 0.
 */
#define subs_8x16b(dest, source_a, source_b) \
  foreach_element(8, \
  { \
    s32 result = (source_a).e[_i] - (source_b).e[_i]; \
    (dest).e[_i] = (result < 0) ? 0 : result; \
  })

/* Plain (non-saturating) lane-by-lane subtraction, 8 and 16 lanes. */
#define sub_8x16b(dest, source_a, source_b) \
  foreach_element(8, \
   (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])

#define sub_16x8b(dest, source_a, source_b) \
  foreach_element(16, \
   (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])
318
/* OR with complement, 8 lanes: dest[i] = source_a[i] | ~source_b[i]. */
#define orn_8x16b(dest, source_a, source_b) \
  foreach_element(8, \
   (dest).e[_i] = (source_a).e[_i] | ~((source_b).e[_i]))

/*
 * Lane-by-lane bitwise AND: dest[i] = source_a[i] & source_b[i].  The macros
 * differ only in the number of lanes processed.
 */
#define and_4x16b(dest, source_a, source_b) \
  foreach_element(4, \
   (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])

#define and_8x16b(dest, source_a, source_b) \
  foreach_element(8, \
   (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])

#define and_4x32b(dest, source_a, source_b) \
  foreach_element(4, \
   (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])

#define and_16x8b(dest, source_a, source_b) \
  foreach_element(16, \
   (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])

#define and_8x8b(dest, source_a, source_b) \
  foreach_element(8, \
   (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])

#define and_2x32b(dest, source_a, source_b) \
  foreach_element(2, \
   (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])
339
/* Bit clear: dest[i] = source_a[i] & ~source_b[i], 8 lanes. */
#define bic_8x8b(dest, source_a, source_b) \
  foreach_element(8, \
   (dest).e[_i] = (source_a).e[_i] & ~((source_b).e[_i]))

#define bic_8x16b(dest, source_a, source_b) \
  foreach_element(8, \
   (dest).e[_i] = (source_a).e[_i] & ~((source_b).e[_i]))

/* In-place bit clear with a constant mask: dest[i] &= ~value. */
#define bic_immediate_4x16b(dest, value) \
  foreach_element(4, (dest).e[_i] &= ~(value))

#define bic_immediate_8x16b(dest, value) \
  foreach_element(8, (dest).e[_i] &= ~(value))

/* Lane-by-lane OR of two vectors, 8 lanes. */
#define or_8x16b(dest, source_a, source_b) \
  foreach_element(8, \
   (dest).e[_i] = (source_a).e[_i] | (source_b).e[_i])

/* OR every lane of source_a with the same `value`, 8 lanes. */
#define or_immediate_8x16b(dest, source_a, value) \
  foreach_element(8, \
   (dest).e[_i] = (source_a).e[_i] | (value))

/* Lane-by-lane exclusive OR: dest[i] = source_a[i] ^ source_b[i]. */
#define eor_8x16b(dest, source_a, source_b) \
  foreach_element(8, \
   (dest).e[_i] = (source_a).e[_i] ^ (source_b).e[_i])

#define eor_4x32b(dest, source_a, source_b) \
  foreach_element(4, \
   (dest).e[_i] = (source_a).e[_i] ^ (source_b).e[_i])

#define eor_2x32b(dest, source_a, source_b) \
  foreach_element(2, \
   (dest).e[_i] = (source_a).e[_i] ^ (source_b).e[_i])
366
/*
 * Pack pairs of lanes: the low part of dest[i] comes from source_a[i] and the
 * high part from source_b[i].
 */

/* 8 lanes: (u8)source_a[i] in bits 0-7, (u8)source_b[i] in bits 8-15. */
#define zip_8x16b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = \
   (u8)(source_a).e[_i] | ((u8)(source_b).e[_i] << 8))

/*
 * 4 lanes: (u16)source_a[i] in bits 0-15, (u16)source_b[i] in bits 16-31.
 * The high part is widened to u32 before the shift; a bare u16 would be
 * promoted to int and shifting a value >= 0x8000 left by 16 would overflow
 * a 32-bit signed int.
 */
#define zip_4x32b(dest, source_a, source_b) \
  foreach_element(4, (dest).e[_i] = \
   (u16)(source_a).e[_i] | ((u32)(u16)(source_b).e[_i] << 16))

/* 2 lanes: source_a[i] and source_b[i] widened to u64, b shifted left 32. */
#define zip_2x64b(dest, source_a, source_b) \
  foreach_element(2, (dest).e[_i] = \
   (u64)(source_a).e[_i] | ((u64)(source_b).e[_i] << 32))
378
/*
 * Split each source lane into two outputs: dest_a[i] receives source[i] and
 * dest_b[i] receives source[i] >> 8 (each converted to the destination
 * element type by the assignment).  8 lanes.
 */
#define unzip_8x8b(dest_a, dest_b, source) \
  foreach_element(8, \
  { \
    (dest_a).e[_i] = (source).e[_i]; \
    (dest_b).e[_i] = ((source).e[_i]) >> 8; \
  })

/*
 * Same split applied to two source vectors: lanes 0..7 of the outputs come
 * from source_a and lanes 8..15 from source_b.  Expands to two consecutive
 * loops separated by a semicolon.
 */
#define unzip_16x8b(dest_a, dest_b, source_a, source_b) \
  foreach_element(8, \
  { \
    (dest_a).e[_i] = (source_a).e[_i]; \
    (dest_b).e[_i] = (source_a).e[_i] >> 8; \
  }); \
  foreach_element(8, \
  { \
    (dest_a).e[_i + 8] = (source_b).e[_i]; \
    (dest_b).e[_i + 8] = (source_b).e[_i] >> 8; \
  })
397
/*
 * Table lookup over 8 lanes: dest[i] = table[indexes[i]] when the index is
 * below 16, otherwise dest[i] = 0.
 *
 * `indexes` and `table` are parenthesized like `dest` so that expressions
 * such as `*ptr` can be passed.  The expansion declares a local named
 * `index`.
 */
#define tbl_16(dest, indexes, table) \
  foreach_element(8, \
  { \
    u32 index = (indexes).e[_i]; \
    if(index < 16) \
      (dest).e[_i] = (table).e[index]; \
    else \
      (dest).e[_i] = 0; \
  })
407
/*
 * Compare-with-zero masks.
 *
 * cmpeqz: (source[i] == 0) yields 1 or 0; subtracting 1 and inverting turns
 * that into all-ones when the lane is zero and 0 otherwise.
 */
#define cmpeqz_8x16b(dest, source) \
  foreach_element(8, \
   (dest).e[_i] = ~(((source).e[_i] == 0) - 1))

/*
 * cmpltz: the source lane is reinterpreted as signed and shifted right by
 * (width - 1), which spreads the sign bit across the result.
 */
#define cmpltz_8x16b(dest, source) \
  foreach_element(8, \
   (dest).e[_i] = ((s16)(source).e[_i] >> 15))

#define cmpltz_4x32b(dest, source) \
  foreach_element(4, \
   (dest).e[_i] = ((s32)(source).e[_i] >> 31))

#define cmpltz_2x32b(dest, source) \
  foreach_element(2, \
   (dest).e[_i] = ((s32)(source).e[_i] >> 31))
419
/*
 * Lane-by-lane comparisons producing a mask: the comparison yields 1 or 0,
 * and ~(x - 1) turns that into all-ones for true and 0 for false.
 *
 * The source arguments are parenthesized (as `dest` already was) so that
 * expressions such as `*ptr` can be passed.
 */
#define cmplte_4x16b(dest, source_a, source_b) \
  foreach_element(4, \
   (dest).e[_i] = ~(((source_a).e[_i] <= (source_b).e[_i]) - 1))

#define cmplt_4x16b(dest, source_a, source_b) \
  foreach_element(4, \
   (dest).e[_i] = ~(((source_a).e[_i] < (source_b).e[_i]) - 1))

#define cmpgt_4x16b(dest, source_a, source_b) \
  foreach_element(4, \
   (dest).e[_i] = ~(((source_a).e[_i] > (source_b).e[_i]) - 1))

/* Bit test, 8 lanes: all-ones where (source_a[i] & source_b[i]) is non-zero. */
#define tst_8x16b(dest, source_a, source_b) \
  foreach_element(8, \
   (dest).e[_i] = ~((((source_a).e[_i] & (source_b).e[_i]) != 0) - 1))
432
/*
 * AND every one of the 8 lanes of source_a with the same `value`.
 * `value` is parenthesized so that an argument containing a lower-precedence
 * operator (for example `a | b`) is applied as a whole mask.
 */
#define andi_8x8b(dest, source_a, value) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] & (value))
435
/*
 * Truncating average over 8 lanes:
 * dest[i] = (source_a[i] + source_b[i]) >> 1.
 */
#define average_8x16b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = \
   ((source_a).e[_i] + (source_b).e[_i]) >> 1)
439
440
/*
 * Lane-by-lane multiplication: dest[i] = source_a[i] * source_b[i].  The
 * product is computed in the (promoted) type of the operands and converted to
 * dest's element type by the assignment.
 */
#define mul_8x8b(dest, source_a, source_b) \
  foreach_element(8, \
   (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])

#define mul_8x16b(dest, source_a, source_b) \
  foreach_element(8, \
   (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])

#define mul_2x32b(dest, source_a, source_b) \
  foreach_element(2, \
   (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])

#define mul_4x32b(dest, source_a, source_b) \
  foreach_element(4, \
   (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])

/* "Long" variants with the same expression as the plain multiplies. */
#define mul_long_8x8b(dest, source_a, source_b) \
  foreach_element(8, \
   (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])

#define mul_long_4x16b(dest, source_a, source_b) \
  foreach_element(4, \
   (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])

/* source_b is widened to s64 first, so the product is formed in 64 bits. */
#define mul_long_2x32b(dest, source_a, source_b) \
  foreach_element(2, (dest).e[_i] = \
   (source_a).e[_i] * (s64)((source_b).e[_i]))
462
/*
 * Multiply every lane of `source` by the same `value`.
 *
 * `value` is parenthesized so that an argument such as `a + b` is multiplied
 * as a whole instead of being split by operator precedence.
 */
#define mul_scalar_2x32b(dest, source, value) \
  foreach_element(2, (dest).e[_i] = (source).e[_i] * (value))

#define mul_scalar_long_8x16b(dest, source, value) \
  foreach_element(8, (dest).e[_i] = (source).e[_i] * (value))

/*
 * The source lane is widened to s64 before the multiply, in the same way
 * mul_long_2x32b widens one operand, so the product is formed in 64 bits
 * rather than being truncated to the width of the operands first.
 */
#define mul_scalar_long_2x32b(dest, source, value) \
  foreach_element(2, (dest).e[_i] = (s64)(source).e[_i] * (value))
471
/*
 * Multiply-accumulate: dest[i] += source_a[i] * source_b[i].
 */
#define mla_2x32b(dest, source_a, source_b) \
  foreach_element(2, \
   (dest).e[_i] += (source_a).e[_i] * (source_b).e[_i])

#define mla_4x32b(dest, source_a, source_b) \
  foreach_element(4, \
   (dest).e[_i] += (source_a).e[_i] * (source_b).e[_i])

/*
 * Multiply-accumulate with a scalar: dest[i] += source[i] * value.
 * `value` is parenthesized so that an argument such as `a + b` is multiplied
 * as a whole, and the source lane is widened to s64 first (as mla_long_2x32b
 * does for one operand) so the product is formed in 64 bits.
 */
#define mla_scalar_long_2x32b(dest, source, value) \
  foreach_element(2, (dest).e[_i] += (s64)(source).e[_i] * (value))

/* Same expression as mla_2x32b, over 8 lanes. */
#define mla_long_8x8b(dest, source_a, source_b) \
  foreach_element(8, \
   (dest).e[_i] += (source_a).e[_i] * (source_b).e[_i])

/* source_b is widened to s64 before the multiply. */
#define mla_long_2x32b(dest, source_a, source_b) \
  foreach_element(2, \
   (dest).e[_i] += (source_a).e[_i] * (s64)(source_b).e[_i])
486
/*
 * Multiply-accumulate / multiply-subtract with a scalar:
 *   mla_scalar_*: dest[i] += source[i] * value
 *   mls_scalar_*: dest[i] -= source[i] * value
 *
 * `value` is parenthesized so that an argument such as `a + b` is multiplied
 * as a whole instead of being split by operator precedence.
 */
#define mla_scalar_4x32b(dest, source, value) \
  foreach_element(4, (dest).e[_i] += (source).e[_i] * (value))

#define mla_scalar_2x32b(dest, source, value) \
  foreach_element(2, (dest).e[_i] += (source).e[_i] * (value))

#define mls_scalar_4x32b(dest, source, value) \
  foreach_element(4, (dest).e[_i] -= (source).e[_i] * (value))

#define mls_scalar_2x32b(dest, source, value) \
  foreach_element(2, (dest).e[_i] -= (source).e[_i] * (value))

/*
 * The source lane is widened to s64 before the multiply so the product is
 * formed in 64 bits, matching the widening done by mla_long_2x32b.
 */
#define mls_scalar_long_2x32b(dest, source, value) \
  foreach_element(2, (dest).e[_i] -= (s64)(source).e[_i] * (value))
501
/*
 * Swap the two lanes: dest[0] = source[1], dest[1] = source[0].
 *
 * source[1] is saved in a local `tmp` (u32) first, so dest and source may be
 * the same object.  `source` is parenthesized like `dest` so that expressions
 * such as `*ptr` can be passed.  Expands to a brace-enclosed block.
 */
#define rev_2x32b(dest, source) \
{ \
  u32 tmp = (source).e[1]; \
  (dest).e[1] = (source).e[0]; \
  (dest).e[0] = tmp; \
}
508
/*
 * Absolute value of each lane via the C library abs(): dest[i] =
 * abs(source[i]).  A declaration of abs() must be in scope where the macro
 * is used.  `source` is parenthesized so that expressions such as `*ptr` can
 * be passed.
 */
#define abs_4x32b(dest, source) \
  foreach_element(4, (dest).e[_i] = abs((source).e[_i]))

#define abs_2x32b(dest, source) \
  foreach_element(2, (dest).e[_i] = abs((source).e[_i]))

/* Arithmetic negation of each of the two lanes: dest[i] = -source[i]. */
#define neg_2x32b(dest, source) \
  foreach_element(2, (dest).e[_i] = -((source).e[_i]))
517
518
/*
 * Shift-right with saturating narrow over 8 lanes:
 *   dest.e[i] = min(source.e[i] >> (shift), 0xFF)
 * The shifted value is held in a u32 and clamped to 0xFF before being stored.
 * `shift` is parenthesized so expression arguments (e.g. `a & b`) are applied
 * as a whole; the scratch variable has a macro-specific name so a caller
 * argument called `result` is not captured.
 */
#define shrq_narrow_8x16b(dest, source, shift)                                 \
  foreach_element(8,                                                           \
  {                                                                            \
    u32 _narrowed = ((source).e[_i]) >> (shift);                               \
    if(_narrowed > 0xFF)                                                       \
      _narrowed = 0xFF;                                                        \
    (dest).e[_i] = _narrowed;                                                  \
  })
527
/*
 * Lane-wise minimum over 4 lanes, compared through an s32 intermediate:
 *   dest.e[i] = (source_b.e[i] < source_a.e[i]) ? source_b.e[i]
 *                                               : source_a.e[i]
 */
#define min_4x16b(dest, source_a, source_b)                                    \
  foreach_element(4,                                                           \
  {                                                                            \
    s32 _low = (source_a).e[_i];                                               \
    _low = ((source_b).e[_i] < _low) ? (s32)(source_b).e[_i] : _low;           \
    (dest).e[_i] = _low;                                                       \
  })
536
/*
 * Lane-wise minimum over 8 lanes, compared through an s32 intermediate:
 *   dest.e[i] = (source_b.e[i] < source_a.e[i]) ? source_b.e[i]
 *                                               : source_a.e[i]
 */
#define min_8x16b(dest, source_a, source_b)                                    \
  foreach_element(8,                                                           \
  {                                                                            \
    s32 _low = (source_a).e[_i];                                               \
    _low = ((source_b).e[_i] < _low) ? (s32)(source_b).e[_i] : _low;           \
    (dest).e[_i] = _low;                                                       \
  })
545
/*
 * Lane-wise minimum over 8 lanes, compared through a u32 intermediate:
 *   dest.e[i] = (source_b.e[i] < source_a.e[i]) ? source_b.e[i]
 *                                               : source_a.e[i]
 */
#define min_8x8b(dest, source_a, source_b)                                     \
  foreach_element(8,                                                           \
  {                                                                            \
    u32 _low = (source_a).e[_i];                                               \
    _low = ((source_b).e[_i] < _low) ? (u32)(source_b).e[_i] : _low;           \
    (dest).e[_i] = _low;                                                       \
  })
554
/*
 * Lane-wise minimum over 16 lanes, compared through a u32 intermediate:
 *   dest.e[i] = (source_b.e[i] < source_a.e[i]) ? source_b.e[i]
 *                                               : source_a.e[i]
 */
#define min_16x8b(dest, source_a, source_b)                                    \
  foreach_element(16,                                                          \
  {                                                                            \
    u32 _low = (source_a).e[_i];                                               \
    _low = ((source_b).e[_i] < _low) ? (u32)(source_b).e[_i] : _low;           \
    (dest).e[_i] = _low;                                                       \
  })
563
/*
 * Lane-wise maximum over 8 lanes, compared through an s32 intermediate:
 *   dest.e[i] = (source_b.e[i] > source_a.e[i]) ? source_b.e[i]
 *                                               : source_a.e[i]
 */
#define max_8x16b(dest, source_a, source_b)                                    \
  foreach_element(8,                                                           \
  {                                                                            \
    s32 _high = (source_a).e[_i];                                              \
    _high = ((source_b).e[_i] > _high) ? (s32)(source_b).e[_i] : _high;        \
    (dest).e[_i] = _high;                                                      \
  })
572
/*
 * Bitwise select over 8 lanes; the mask operand is also the destination:
 *   dest_mask.e[i] = (source_a.e[i] & dest_mask.e[i]) |
 *                    (source_b.e[i] & ~dest_mask.e[i])
 * Bits set in the mask take source_a, clear bits take source_b.
 * `dest_mask` is parenthesized so non-trivial arguments (e.g. `*ptr`) expand
 * correctly.
 */
#define bsl_8x16b(dest_mask, source_a, source_b)                               \
  foreach_element(8, (dest_mask).e[_i] =                                       \
   ((source_a).e[_i] & (dest_mask).e[_i]) |                                    \
   ((source_b).e[_i] & ~((dest_mask).e[_i])))
576
/*
 * Bitwise insert-if-false over 8 lanes:
 *   dest.e[i] = (source.e[i] & ~mask.e[i]) | (dest.e[i] & mask.e[i])
 * Bits clear in the mask are taken from source; set bits keep dest's value.
 * `dest` and `mask` are parenthesized so non-trivial arguments (e.g. `*ptr`)
 * expand correctly.
 */
#define bif_8x16b(dest, source, mask)                                          \
  foreach_element(8, (dest).e[_i] =                                            \
   ((source).e[_i] & ~((mask).e[_i])) |                                        \
   ((dest).e[_i] & (mask).e[_i]))
580
/*
 * Bitwise select over 4 lanes; the mask operand is also the destination:
 *   dest_mask.e[i] = (source_a.e[i] & dest_mask.e[i]) |
 *                    (source_b.e[i] & ~dest_mask.e[i])
 * Bits set in the mask take source_a, clear bits take source_b.
 * `dest_mask` is parenthesized so non-trivial arguments (e.g. `*ptr`) expand
 * correctly.
 */
#define bsl_4x32b(dest_mask, source_a, source_b)                               \
  foreach_element(4, (dest_mask).e[_i] =                                       \
   ((source_a).e[_i] & (dest_mask).e[_i]) |                                    \
   ((source_b).e[_i] & ~((dest_mask).e[_i])))
584
/*
 * Bitwise insert-if-true over 4 lanes:
 *   dest.e[i] = (source.e[i] & mask.e[i]) | (dest.e[i] & ~mask.e[i])
 * Bits set in the mask are taken from source; clear bits keep dest's value.
 * `dest` and `mask` are parenthesized so non-trivial arguments (e.g. `*ptr`)
 * expand correctly.
 */
#define bit_4x16b(dest, source, mask)                                          \
  foreach_element(4, (dest).e[_i] =                                            \
   ((source).e[_i] & (mask).e[_i]) |                                           \
   ((dest).e[_i] & ~((mask).e[_i])))
588
589 #endif