gpu_neon: flush cmd buffer before blit too
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / vector_ops.h
1 /*
2  * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License as
6  * published by the Free Software Foundation; either version 2 of
7  * the License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  */
14
15 #ifndef VECTOR_OPS
16 #define VECTOR_OPS
17
18 #define build_vector_type_pair(sign, size, count, count_x2)                    \
19 typedef struct                                                                 \
20 {                                                                              \
21   sign##size e[count];                                                         \
22 } vec_##count##x##size##sign;                                                  \
23                                                                                \
24 typedef struct                                                                 \
25 {                                                                              \
26   union                                                                        \
27   {                                                                            \
28     sign##size e[count_x2];                                                    \
29     struct                                                                     \
30     {                                                                          \
31       vec_##count##x##size##sign low;                                          \
32       vec_##count##x##size##sign high;                                         \
33     };                                                                         \
34   };                                                                           \
35 } vec_##count_x2##x##size##sign                                                \
36
37 #define build_vector_types(sign)                                               \
38   build_vector_type_pair(sign, 8, 8, 16);                                      \
39   build_vector_type_pair(sign, 16, 4, 8);                                      \
40   build_vector_type_pair(sign, 32, 2, 4);                                      \
41   build_vector_type_pair(sign, 64, 1, 2)                                       \
42
43 build_vector_types(u);
44 build_vector_types(s);
45
46
/*
 * Runs `operation` once per lane, with the lane index available to it as
 * `_i` (a u32 declared by the macro), for _i = 0 .. iterations - 1.
 *
 * The expansion is a plain braced block, so a trailing ';' at the call site
 * is accepted but not required.
 */
#define foreach_element(iterations, operation)                                 \
{                                                                              \
  u32 _i;                                                                      \
  /* Parenthesised so an expression argument (e.g. n & 3) keeps its value. */  \
  for(_i = 0; _i < (iterations); _i++)                                         \
  {                                                                            \
    operation;                                                                 \
  }                                                                            \
}
56 #define load_64b(dest, source)                                                 \
57  *((u64 *)(dest).e) = *((u64 *)(source))                                       \
58
/*
 * Copies 128 bits from the memory at `source` into the vector `dest`, as two
 * 64-bit moves through u64 pointers.
 *
 * Written as one comma expression (not two statements) so the whole copy
 * stays a single statement under an unbraced if/else/loop.
 */
#define load_128b(dest, source)                                                \
 (*((u64 *)(dest).e) = *((u64 *)(source)),                                     \
  *((u64 *)(dest).e + 1) = *(((u64 *)(source)) + 1))
63 #define load_8x16b(dest, source)                                               \
64   foreach_element(8, (dest).e[_i] = ((u16 *)(source))[_i])                     \
65
66 #define store_64b(source, dest)                                                \
67  *((u64 *)(dest)) = *((u64 *)(source).e)                                       \
68
/*
 * Copies the 128 bits of the vector `source` to the memory at `dest`, as two
 * 64-bit moves through u64 pointers.
 *
 * Written as one comma expression (not two statements) so the whole copy
 * stays a single statement under an unbraced if/else/loop.
 */
#define store_128b(source, dest)                                               \
 (*((u64 *)(dest)) = *((u64 *)(source).e),                                     \
  *(((u64 *)(dest)) + 1) = *((u64 *)(source).e + 1))
/*
 * Writes the eight 16-bit lanes of `source` to consecutive u16 slots starting
 * at `dest`.
 *
 * `dest` is parenthesised before the cast so pointer arithmetic in the
 * argument (e.g. buf + 4 on a byte pointer) is applied before, not after,
 * the conversion to u16 *.
 */
#define store_8x16b(source, dest)                                              \
  foreach_element(8, ((u16 *)(dest))[_i] = (source).e[_i])
76
77 #define split_8x16b(dest, source)                                              \
78   foreach_element(8,                                                           \
79   {                                                                            \
80     (dest).e[_i * 2] = (source).e[_i];                                         \
81     (dest).e[(_i * 2) + 1] = (source).e[_i] >> 8;                              \
82   })                                                                           \
83
84 #define merge_16x8b(dest, source)                                              \
85   foreach_element(8,                                                           \
86     (dest).e[_i] = (source).e[_i * 2] | ((source).e[(_i * 2) + 1] << 8))       \
87
88 #define vector_cast(vec_to, source)                                            \
89   (*((volatile vec_to *)(&(source))))                                          \
90
/*
 * Yields an lvalue of type `vec_to` that aliases the upper half of the
 * element array of `source` (byte offset sizeof(source.e) / 2).
 *
 * `source` is parenthesised so non-trivial operands such as *ptr expand
 * correctly, matching vector_cast().
 */
#define vector_cast_high(vec_to, source)                                       \
  (*((volatile vec_to *)((u8 *)(source).e + (sizeof((source).e) / 2))))
94
95 #define dup_8x8b(dest, value)                                                  \
96   foreach_element(8, (dest).e[_i] = value)                                     \
97
98 #define dup_16x8b(dest, value)                                                 \
99   foreach_element(16, (dest).e[_i] = value)                                    \
100
101 #define dup_4x16b(dest, value)                                                 \
102   foreach_element(4, (dest).e[_i] = value)                                     \
103
104 #define dup_8x16b(dest, value)                                                 \
105   foreach_element(8, (dest).e[_i] = value)                                     \
106
107 #define dup_2x32b(dest, value)                                                 \
108   foreach_element(2, (dest).e[_i] = value)                                     \
109
110 #define dup_4x32b(dest, value)                                                 \
111   foreach_element(4, (dest).e[_i] = value)                                     \
112
113 #define shr_narrow_8x16b(dest, source, shift)                                  \
114   foreach_element(8, (dest).e[_i] = (u16)(source).e[_i] >> (shift))            \
115
116 #define shr_narrow_2x64b(dest, source, shift)                                  \
117   foreach_element(2, (dest).e[_i] = (source).e[_i] >> (shift))                 \
118
119 #define shr_8x8b(dest, source, shift)                                          \
120   foreach_element(8, (dest).e[_i] = (u8)(source).e[_i] >> (shift))             \
121
122 #define shl_8x8b(dest, source, shift)                                          \
123   foreach_element(8, (dest).e[_i] = (source).e[_i] << (shift))                 \
124
125 #define shr_8x16b(dest, source, shift)                                         \
126   foreach_element(8, (dest).e[_i] = (u16)(source).e[_i] >> (shift))            \
127
128 #define shr_2x32b(dest, source, shift)                                         \
129   foreach_element(2, (dest).e[_i] = (u32)(source).e[_i] >> (shift))            \
130
131 #define shr_4x16b(dest, source, shift)                                         \
132   foreach_element(4, (dest).e[_i] = (source).e[_i] >> (shift))                 \
133
134 #define shl_4x16b(dest, source, shift)                                         \
135   foreach_element(4, (dest).e[_i] = (u32)(source).e[_i] << (shift))            \
136
137 #define shr_4x32b(dest, source, shift)                                         \
138   foreach_element(4, (dest).e[_i] = (u32)(source).e[_i] >> (shift))            \
139
140 #define shr_narrow_4x32b(dest, source, shift)                                  \
141   foreach_element(4, (dest).e[_i] = (u32)(source).e[_i] >> (shift))            \
142
143 #define shl_8x16b(dest, source, shift)                                         \
144   foreach_element(8, (dest).e[_i] = (source).e[_i] << (shift))                 \
145
146 #define shl_4x32b(dest, source, shift)                                         \
147   foreach_element(4, (dest).e[_i] = (source).e[_i] << (shift))                 \
148
149 #define shl_2x32b(dest, source, shift)                                         \
150   foreach_element(2, (dest).e[_i] = (source).e[_i] << (shift))                 \
151
152 #define shl_1x64b(dest, source, shift)                                         \
153   ((dest).e[0] = (source).e[0] << (shift))                                     \
154
155 #define shl_2x64b(dest, source, shift)                                         \
156   foreach_element(2, (dest).e[_i] = (source).e[_i] << (shift))                 \
157
158 #define shl_variable_2x64b(dest, source_a, source_b)                           \
159   foreach_element(2,                                                           \
160    (dest).e[_i] = (source_a).e[_i] << ((source_b).e[_i] & 0xFF))               \
161
162 #define shl_variable_8x16b(dest, source_a, source_b)                           \
163   foreach_element(8,                                                           \
164    (dest).e[_i] = (source_a).e[_i] << ((source_b).e[_i] & 0xFF))               \
165
166 #define shl_variable_4x16b(dest, source_a, source_b)                           \
167   foreach_element(4,                                                           \
168    (dest).e[_i] = (source_a).e[_i] << ((source_b).e[_i] & 0xFF))               \
169
170 #define shr_1x64b(dest, source, shift)                                         \
171   ((dest).e[0] = (source).e[0] >> (shift))                                     \
172
173 #define shl_long_8x8b(dest, source, shift)                                     \
174   foreach_element(8, (dest).e[_i] = (source).e[_i] << (shift))                 \
175
176 #define shl_long_4x16b(dest, source, shift)                                    \
177   foreach_element(4, (dest).e[_i] = (source).e[_i] << (shift))                 \
178
/*
 * For each of 8 lanes: reinterprets the source element as s16, shifts it
 * right by `shift`, clamps the result to 0..0xFF and stores it in `dest`.
 *
 * `shift` is parenthesised so expression arguments with operators of lower
 * precedence than >> (such as a & b) are evaluated as a unit.
 */
#define shrq_narrow_signed_8x16b(dest, source, shift)                          \
  foreach_element(8,                                                           \
  {                                                                            \
    s32 result = ((s16)(source).e[_i]) >> (shift);                             \
    if(result < 0)                                                             \
      result = 0;                                                              \
    if(result > 0xFF)                                                          \
      result = 0xFF;                                                           \
    (dest).e[_i] = result;                                                     \
  })
/*
 * Per-lane shift by a per-lane signed amount (4 lanes).
 * The shift count is the lane of `source_b` narrowed to s8: a non-negative
 * count shifts `source_a` left, a negative count shifts it right by the
 * magnitude. Counts at or beyond the lane width are not handled specially.
 *
 * `dest` is parenthesised and the temporary is named `_shift` so neither
 * clashes with the caller's argument expressions.
 */
#define shl_reg_4x32b(dest, source_a, source_b)                                \
  foreach_element(4,                                                           \
  {                                                                            \
    s8 _shift = (source_b).e[_i];                                              \
    if(_shift < 0)                                                             \
      (dest).e[_i] = (source_a).e[_i] >> (-_shift);                            \
    else                                                                       \
      (dest).e[_i] = (source_a).e[_i] << _shift;                               \
  })

/* Two-lane form of shl_reg_4x32b; same per-lane shift rule. */
#define shl_reg_2x32b(dest, source_a, source_b)                                \
  foreach_element(2,                                                           \
  {                                                                            \
    s8 _shift = (source_b).e[_i];                                              \
    if(_shift < 0)                                                             \
      (dest).e[_i] = (source_a).e[_i] >> (-_shift);                            \
    else                                                                       \
      (dest).e[_i] = (source_a).e[_i] << _shift;                               \
  })

/* Two-lane form operating on the 2x64 vectors; same per-lane shift rule. */
#define shl_reg_2x64b(dest, source_a, source_b)                                \
  foreach_element(2,                                                           \
  {                                                                            \
    s8 _shift = (source_b).e[_i];                                              \
    if(_shift < 0)                                                             \
      (dest).e[_i] = (source_a).e[_i] >> (-_shift);                            \
    else                                                                       \
      (dest).e[_i] = (source_a).e[_i] << _shift;                               \
  })
220
221 #define sri_8x8b(dest, source, shift)                                          \
222   foreach_element(8, (dest).e[_i] = ((dest).e[_i] & ~(0xFF >> (shift))) |      \
223    ((u8)(source).e[_i] >> (shift)))                                            \
224
225 #define sli_8x8b(dest, source, shift)                                          \
226   foreach_element(8, (dest).e[_i] = ((dest).e[_i] & ~(0xFF << (shift))) |      \
227    ((source).e[_i] << (shift)))                                                \
228
229
230
231 #define mov_narrow_8x16b(dest, source)                                         \
232   foreach_element(8, (dest).e[_i] = (source).e[_i])                            \
233
234 #define mov_narrow_4x32b(dest, source)                                         \
235   foreach_element(4, (dest).e[_i] = (source).e[_i])                            \
236
237 #define mov_narrow_2x64b(dest, source)                                         \
238   foreach_element(2, (dest).e[_i] = (source).e[_i])                            \
239
240 #define mov_wide_8x8b(dest, source)                                            \
241   foreach_element(8, (dest).e[_i] = (source).e[_i])                            \
242
243 #define mov_wide_2x32b(dest, source)                                           \
244   foreach_element(2, (dest).e[_i] = (source).e[_i])                            \
245
246 #define mvn_4x16b(dest, source)                                                \
247   foreach_element(4, (dest).e[_i] = ~((source).e[_i]))                         \
248
249 #define add_4x16b(dest, source_a, source_b)                                    \
250   foreach_element(4, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])       \
251
252 #define add_4x32b(dest, source_a, source_b)                                    \
253   foreach_element(4, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])       \
254
255 #define add_2x32b(dest, source_a, source_b)                                    \
256   foreach_element(2, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])       \
257
258 #define add_8x16b(dest, source_a, source_b)                                    \
259   foreach_element(8, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])       \
260
261 #define add_16x8b(dest, source_a, source_b)                                    \
262   foreach_element(16, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])      \
263
264 #define add_8x8b(dest, source_a, source_b)                                     \
265   foreach_element(8, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])       \
266
267 #define add_1x64b(dest, source_a, source_b)                                    \
268   (dest).e[0] = (source_a).e[0] + (source_b).e[0]                              \
269
270 #define add_2x64b(dest, source_a, source_b)                                    \
271   foreach_element(2, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])       \
272
/*
 * For each of 2 lanes: adds the elements of `source_a` and `source_b` and
 * stores the sum shifted right by 32 (its high half) in `dest`.
 *
 * The shift is applied to the sum before the assignment, in the same way as
 * add_high_narrow_4x32b; previously the full sum was assigned and the
 * shifted value was discarded.
 */
#define add_high_narrow_2x64b(dest, source_a, source_b)                        \
  foreach_element(2,                                                           \
   ((dest).e[_i] = ((source_a).e[_i] + (source_b).e[_i]) >> 32))
277 #define add_high_narrow_4x32b(dest, source_a, source_b)                        \
278   foreach_element(4,                                                           \
279    ((dest).e[_i] = ((source_a).e[_i] + (source_b).e[_i]) >> 16))               \
280
281 #define sub_4x16b(dest, source_a, source_b)                                    \
282   foreach_element(4, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])       \
283
284 #define sub_4x32b(dest, source_a, source_b)                                    \
285   foreach_element(4, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])       \
286
287 #define sub_2x32b(dest, source_a, source_b)                                    \
288   foreach_element(2, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])       \
289
290 #define sub_wide_8x8b(dest, source_a, source_b)                                \
291   foreach_element(8, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])       \
292
293 #define add_wide_8x8b(dest, source_a, source_b)                                \
294   foreach_element(8, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])       \
295
296 #define add_wide_2x32b(dest, source_a, source_b)                               \
297   foreach_element(2, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])       \
298
299 #define addq_8x8b(dest, source_a, source_b)                                    \
300   foreach_element(8,                                                           \
301   {                                                                            \
302     u32 result = (source_a).e[_i] + (source_b).e[_i];                          \
303     if(result > 0xFF)                                                          \
304       result = 0xFF;                                                           \
305     (dest).e[_i] = result;                                                     \
306   })                                                                           \
307
308 #define subq_8x8b(dest, source_a, source_b)                                    \
309   foreach_element(8,                                                           \
310   {                                                                            \
311     u32 result = (source_a).e[_i] - (source_b).e[_i];                          \
312     if(result > 0xFF)                                                          \
313       result = 0;                                                              \
314     (dest).e[_i] = result;                                                     \
315   })                                                                           \
316
317 #define subs_long_8x8b(dest, source_a, source_b)                               \
318   subs_8x8b(dest, source_a, source_b)                                          \
319
320 #define subs_16x8b(dest, source_a, source_b)                                   \
321   foreach_element(16,                                                          \
322   {                                                                            \
323     u32 result = (source_a).e[_i] - (source_b).e[_i];                          \
324     if(result > 0xFF)                                                          \
325       result = 0;                                                              \
326     (dest).e[_i] = result;                                                     \
327   })                                                                           \
328
329 #define subs_8x16b(dest, source_a, source_b)                                   \
330   foreach_element(8,                                                           \
331   {                                                                            \
332     s32 result = (source_a).e[_i] - (source_b).e[_i];                          \
333     if(result < 0)                                                             \
334       result = 0;                                                              \
335                                                                                \
336     (dest).e[_i] = result;                                                     \
337   })                                                                           \
338
339 #define sub_8x16b(dest, source_a, source_b)                                    \
340   foreach_element(8, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])       \
341
342 #define sub_16x8b(dest, source_a, source_b)                                    \
343   foreach_element(16, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])      \
344
345 #define orn_8x16b(dest, source_a, source_b)                                    \
346   foreach_element(8, (dest).e[_i] = (source_a).e[_i] | ~((source_b).e[_i]))    \
347
348 #define and_4x16b(dest, source_a, source_b)                                    \
349   foreach_element(4, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])       \
350
351 #define and_8x16b(dest, source_a, source_b)                                    \
352   foreach_element(8, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])       \
353
354 #define and_4x32b(dest, source_a, source_b)                                    \
355   foreach_element(4, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])       \
356
357 #define and_16x8b(dest, source_a, source_b)                                    \
358   foreach_element(16, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])      \
359
360 #define and_8x8b(dest, source_a, source_b)                                     \
361   foreach_element(8, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])       \
362
363 #define and_2x32b(dest, source_a, source_b)                                    \
364   foreach_element(2, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])       \
365
366 #define bic_8x8b(dest, source_a, source_b)                                     \
367   foreach_element(8, (dest).e[_i] = (source_a).e[_i] & ~((source_b).e[_i]))    \
368
369 #define bic_8x16b(dest, source_a, source_b)                                    \
370   foreach_element(8, (dest).e[_i] = (source_a).e[_i] & ~((source_b).e[_i]))    \
371
372 #define bic_immediate_4x16b(dest, value)                                       \
373   foreach_element(4, (dest).e[_i] = (dest).e[_i] & ~(value))                   \
374
375 #define bic_immediate_8x16b(dest, value)                                       \
376   foreach_element(8, (dest).e[_i] = (dest).e[_i] & ~(value))                   \
377
378 #define or_8x16b(dest, source_a, source_b)                                     \
379   foreach_element(8, (dest).e[_i] = (source_a).e[_i] | (source_b).e[_i])       \
380
381 #define or_immediate_8x16b(dest, source_a, value)                              \
382   foreach_element(8, (dest).e[_i] = (source_a).e[_i] | (value))                \
383
384 #define eor_8x16b(dest, source_a, source_b)                                    \
385   foreach_element(8, (dest).e[_i] = (source_a).e[_i] ^ (source_b).e[_i])       \
386
387 #define eor_4x32b(dest, source_a, source_b)                                    \
388   foreach_element(4, (dest).e[_i] = (source_a).e[_i] ^ (source_b).e[_i])       \
389
390 #define eor_2x32b(dest, source_a, source_b)                                    \
391   foreach_element(2, (dest).e[_i] = (source_a).e[_i] ^ (source_b).e[_i])       \
392
393 #define zip_8x16b(dest, source_a, source_b)                                    \
394   foreach_element(8, (dest).e[_i] =                                            \
395    (u8)(source_a).e[_i] | ((u8)(source_b).e[_i] << 8))                         \
396
397 #define zip_2x64b(dest, source_a, source_b)                                    \
398   foreach_element(2, (dest).e[_i] =                                            \
399    (u64)(source_a).e[_i] | ((u64)(source_b).e[_i] << 32))                      \
400
401 #define unzip_8x8b(dest_a, dest_b, source)                                     \
402   foreach_element(8,                                                           \
403   {                                                                            \
404     (dest_a).e[_i] = (source).e[_i];                                           \
405     (dest_b).e[_i] = ((source).e[_i]) >> 8;                                    \
406   })                                                                           \
407
408 #define unzip_16x8b(dest_a, dest_b, source_a, source_b)                        \
409   foreach_element(8,                                                           \
410   {                                                                            \
411     (dest_a).e[_i] = (source_a).e[_i];                                         \
412     (dest_b).e[_i] = (source_a).e[_i] >> 8;                                    \
413   });                                                                          \
414   foreach_element(8,                                                           \
415   {                                                                            \
416     (dest_a).e[_i + 8] = (source_b).e[_i];                                     \
417     (dest_b).e[_i + 8] = (source_b).e[_i] >> 8;                                \
418   })                                                                           \
419
/*
 * Table lookup over 8 lanes: each lane of `dest` receives table.e[index],
 * where index is the matching lane of `indexes`; an index of 16 or more
 * yields 0 instead.
 *
 * All three operands are parenthesised so expressions such as *ptr can be
 * passed.
 */
#define tbl_16(dest, indexes, table)                                           \
  foreach_element(8,                                                           \
  {                                                                            \
    u32 index = (indexes).e[_i];                                               \
    if(index < 16)                                                             \
      (dest).e[_i] = (table).e[index];                                         \
    else                                                                       \
      (dest).e[_i] = 0;                                                        \
  })
430 #define cmpeqz_8x16b(dest, source)                                             \
431   foreach_element(8, (dest).e[_i] = ~(((source).e[_i] == 0) - 1))              \
432
433 #define cmpltz_8x16b(dest, source)                                             \
434   foreach_element(8, (dest).e[_i] = ((s16)(source).e[_i] >> 15))               \
435
436 #define cmpltz_4x32b(dest, source)                                             \
437   foreach_element(4, (dest).e[_i] = ((s32)(source).e[_i] >> 31))               \
438
439 #define cmpltz_2x32b(dest, source)                                             \
440   foreach_element(2, (dest).e[_i] = ((s32)(source).e[_i] >> 31))               \
441
/*
 * Lane-wise comparisons. Each lane of `dest` is set to all ones when the
 * predicate holds and to zero otherwise: ~(pred - 1) maps 1 to ~0 and 0 to 0.
 * The source operands are parenthesised so expressions such as *ptr can be
 * passed.
 */

/* dest lane = all ones where source_a <= source_b (4 lanes). */
#define cmplte_4x16b(dest, source_a, source_b)                                 \
  foreach_element(4,                                                           \
   (dest).e[_i] = ~(((source_a).e[_i] <= (source_b).e[_i]) - 1))

/* dest lane = all ones where source_a < source_b (4 lanes). */
#define cmplt_4x16b(dest, source_a, source_b)                                  \
  foreach_element(4,                                                           \
   (dest).e[_i] = ~(((source_a).e[_i] < (source_b).e[_i]) - 1))

/* dest lane = all ones where source_a > source_b (4 lanes). */
#define cmpgt_4x16b(dest, source_a, source_b)                                  \
  foreach_element(4,                                                           \
   (dest).e[_i] = ~(((source_a).e[_i] > (source_b).e[_i]) - 1))

/* dest lane = all ones where (source_a & source_b) is non-zero (8 lanes). */
#define tst_8x16b(dest, source_a, source_b)                                    \
  foreach_element(8,                                                           \
   (dest).e[_i] = ~((((source_a).e[_i] & (source_b).e[_i]) != 0) - 1))
/*
 * ANDs each of the 8 lanes of `source_a` with the scalar `value` and stores
 * the result in `dest`.
 *
 * `value` is parenthesised so a compound argument (e.g. a | b) is used as a
 * single mask rather than being split by operator precedence.
 */
#define andi_8x8b(dest, source_a, value)                                       \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] & (value))
458 #define average_8x16b(dest, source_a, source_b)                                \
459   foreach_element(8,                                                           \
460    (dest).e[_i] = ((source_a).e[_i] + (source_b).e[_i]) >> 1)                  \
461
462
463 #define mul_8x8b(dest, source_a, source_b)                                     \
464   foreach_element(8, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])       \
465
466 #define mul_8x16b(dest, source_a, source_b)                                    \
467   foreach_element(8, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])       \
468
469 #define mul_2x32b(dest, source_a, source_b)                                    \
470   foreach_element(2, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])       \
471
472 #define mul_4x32b(dest, source_a, source_b)                                    \
473   foreach_element(4, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])       \
474
475 #define mul_long_8x8b(dest, source_a, source_b)                                \
476   foreach_element(8, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])       \
477
478 #define mul_long_4x16b(dest, source_a, source_b)                               \
479   foreach_element(4, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])       \
480
481 #define mul_long_2x32b(dest, source_a, source_b)                               \
482   foreach_element(2,                                                           \
483    (dest).e[_i] = (source_a).e[_i] * (s64)((source_b).e[_i]))                  \
484
/*
 * Multiplies each of the 2 lanes of `source` by the scalar `value`.
 * `value` is parenthesised so an argument such as a + b is multiplied as a
 * whole.
 */
#define mul_scalar_2x32b(dest, source, value)                                  \
  foreach_element(2, (dest).e[_i] = (source).e[_i] * (value))

/*
 * Multiplies each of the 8 lanes of `source` by the scalar `value`, storing
 * the product in the corresponding lane of `dest`.
 */
#define mul_scalar_long_8x16b(dest, source, value)                             \
  foreach_element(8, (dest).e[_i] = (source).e[_i] * (value))

/*
 * Widening multiply of each of the 2 lanes of `source` by the scalar
 * `value`. The scalar is converted to s64 first, as mul_long_2x32b does for
 * its second operand, so the product is computed in 64 bits instead of
 * being truncated to 32 bits before the store.
 */
#define mul_scalar_long_2x32b(dest, source, value)                             \
  foreach_element(2, (dest).e[_i] = (source).e[_i] * (s64)(value))
494 #define mla_2x32b(dest, source_a, source_b)                                    \
495   foreach_element(2, (dest).e[_i] += (source_a).e[_i] * (source_b).e[_i])      \
496
497 #define mla_4x32b(dest, source_a, source_b)                                    \
498   foreach_element(4, (dest).e[_i] += (source_a).e[_i] * (source_b).e[_i])      \
499
/*
 * Widening multiply-accumulate: adds source lane * `value` to each of the 2
 * lanes of `dest`.
 *
 * The scalar is parenthesised and converted to s64 first, as mla_long_2x32b
 * does for its second operand, so the product is formed in 64 bits rather
 * than truncated to 32 bits before being accumulated.
 */
#define mla_scalar_long_2x32b(dest, source, value)                             \
  foreach_element(2, (dest).e[_i] += (source).e[_i] * (s64)(value))
503 #define mla_long_8x8b(dest, source_a, source_b)                                \
504   foreach_element(8, (dest).e[_i] += (source_a).e[_i] * (source_b).e[_i])      \
505
506 #define mla_long_2x32b(dest, source_a, source_b)                               \
507   foreach_element(2, (dest).e[_i] += (source_a).e[_i] * (s64)(source_b).e[_i]) \
508
509 #define mla_scalar_4x32b(dest, source, value)                                  \
510   foreach_element(4, (dest).e[_i] += (source).e[_i] * value)                   \
511
/*
 * mla_scalar_2x32b: for 2 element indices _i,
 * dest.e[_i] += source.e[_i] * value.
 * value is parenthesized so a compound argument such as (a + b) is multiplied
 * as a whole.  It is re-evaluated for every element, so it should be free of
 * side effects.
 * The index _i comes from foreach_element, defined elsewhere in this header.
 */
#define mla_scalar_2x32b(dest, source, value)                                  \
  foreach_element(2, (dest).e[_i] += (source).e[_i] * (value))
514
/*
 * mls_scalar_4x32b: for 4 element indices _i,
 * dest.e[_i] -= source.e[_i] * value.
 * value is parenthesized so a compound argument such as (a + b) is multiplied
 * as a whole.  It is re-evaluated for every element, so it should be free of
 * side effects.
 * The index _i comes from foreach_element, defined elsewhere in this header.
 */
#define mls_scalar_4x32b(dest, source, value)                                  \
  foreach_element(4, (dest).e[_i] -= (source).e[_i] * (value))
517
/*
 * mls_scalar_2x32b: for 2 element indices _i,
 * dest.e[_i] -= source.e[_i] * value.
 * value is parenthesized so a compound argument such as (a + b) is multiplied
 * as a whole.  It is re-evaluated for every element, so it should be free of
 * side effects.
 * The index _i comes from foreach_element, defined elsewhere in this header.
 */
#define mls_scalar_2x32b(dest, source, value)                                  \
  foreach_element(2, (dest).e[_i] -= (source).e[_i] * (value))
520
/*
 * mls_scalar_long_2x32b: for 2 element indices _i,
 * dest.e[_i] -= source.e[_i] * value.
 * value is parenthesized and cast to s64, the same way mul_long_2x32b and
 * mla_long_2x32b widen their second operand, so the product is computed at
 * 64-bit width instead of at the operands' promoted width.
 * NOTE(review): this assumes dest holds 64-bit elements - confirm against the
 * vector typedefs.
 * value is re-evaluated for every element, so it should be free of side
 * effects.
 * The index _i comes from foreach_element, defined elsewhere in this header.
 */
#define mls_scalar_long_2x32b(dest, source, value)                             \
  foreach_element(2, (dest).e[_i] -= (source).e[_i] * (s64)(value))
523
/*
 * rev_2x32b: write the two elements of source into dest in swapped order
 * (dest.e[0] = source.e[1], dest.e[1] = source.e[0]).
 * source.e[1] is saved in a u32 temporary first, so dest and source may be
 * the same object.  source is parenthesized so expression arguments such as
 * *ptr expand correctly, and the temporary has a macro-private name so it
 * cannot capture a caller variable called tmp.
 */
#define rev_2x32b(dest, source)                                                \
{                                                                              \
  u32 _rev_swap = (source).e[1];                                               \
  (dest).e[1] = (source).e[0];                                                 \
  (dest).e[0] = _rev_swap;                                                     \
}
530
/*
 * abs_4x32b: for 4 element indices _i, dest.e[_i] = abs(source.e[_i]).
 * source is parenthesized so expression arguments such as *ptr expand
 * correctly.
 * The index _i comes from foreach_element, defined elsewhere in this header.
 */
#define abs_4x32b(dest, source)                                                \
  foreach_element(4, (dest).e[_i] = abs((source).e[_i]))
533
/*
 * abs_2x32b: for 2 element indices _i, dest.e[_i] = abs(source.e[_i]).
 * source is parenthesized so expression arguments such as *ptr expand
 * correctly.
 * The index _i comes from foreach_element, defined elsewhere in this header.
 */
#define abs_2x32b(dest, source)                                                \
  foreach_element(2, (dest).e[_i] = abs((source).e[_i]))
536
/*
 * neg_2x32b: for 2 element indices _i, dest.e[_i] = -source.e[_i].
 * The index _i comes from foreach_element, defined elsewhere in this header.
 */
#define neg_2x32b(dest, source)                                                \
  foreach_element(2,                                                           \
  {                                                                            \
    (dest).e[_i] = -((source).e[_i]);                                          \
  })
539
540
/*
 * shrq_narrow_8x16b: for 8 element indices _i, shift source.e[_i] right by
 * shift, clamp the result (held as u32) to at most 0xFF, and store it in
 * dest.e[_i].
 * shift is parenthesized so arguments with operators of lower precedence than
 * >> (for example s & 3) are applied as a whole.  It is re-evaluated for
 * every element, so it should be free of side effects.
 * The index _i comes from foreach_element, defined elsewhere in this header.
 */
#define shrq_narrow_8x16b(dest, source, shift)                                 \
  foreach_element(8,                                                           \
  {                                                                            \
    u32 _shifted = ((source).e[_i]) >> (shift);                                \
    if(_shifted > 0xFF)                                                        \
      _shifted = 0xFF;                                                         \
    (dest).e[_i] = _shifted;                                                   \
  })
549
/*
 * min_8x16b: for 8 element indices _i, store the smaller of source_a.e[_i]
 * and source_b.e[_i] in dest.e[_i].  The running minimum is held in an s32.
 * The index _i comes from foreach_element, defined elsewhere in this header.
 */
#define min_8x16b(dest, source_a, source_b)                                    \
  foreach_element(8,                                                           \
  {                                                                            \
    s32 _smaller = (source_a).e[_i];                                           \
    if(_smaller > (source_b).e[_i])                                            \
      _smaller = (source_b).e[_i];                                             \
    (dest).e[_i] = _smaller;                                                   \
  })
558
/*
 * min_8x8b: for 8 element indices _i, store the smaller of source_a.e[_i]
 * and source_b.e[_i] in dest.e[_i].  The running minimum is held in a u32.
 * The index _i comes from foreach_element, defined elsewhere in this header.
 */
#define min_8x8b(dest, source_a, source_b)                                     \
  foreach_element(8,                                                           \
  {                                                                            \
    u32 _smaller = (source_a).e[_i];                                           \
    if(_smaller > (source_b).e[_i])                                            \
      _smaller = (source_b).e[_i];                                             \
    (dest).e[_i] = _smaller;                                                   \
  })
567
/*
 * min_16x8b: for 16 element indices _i, store the smaller of source_a.e[_i]
 * and source_b.e[_i] in dest.e[_i].  The running minimum is held in a u32.
 * The index _i comes from foreach_element, defined elsewhere in this header.
 */
#define min_16x8b(dest, source_a, source_b)                                    \
  foreach_element(16,                                                          \
  {                                                                            \
    u32 _smaller = (source_a).e[_i];                                           \
    if(_smaller > (source_b).e[_i])                                            \
      _smaller = (source_b).e[_i];                                             \
    (dest).e[_i] = _smaller;                                                   \
  })
576
/*
 * max_8x16b: for 8 element indices _i, store the larger of source_a.e[_i]
 * and source_b.e[_i] in dest.e[_i].  The running maximum is held in an s32.
 * The index _i comes from foreach_element, defined elsewhere in this header.
 */
#define max_8x16b(dest, source_a, source_b)                                    \
  foreach_element(8,                                                           \
  {                                                                            \
    s32 _larger = (source_a).e[_i];                                            \
    if(_larger < (source_b).e[_i])                                             \
      _larger = (source_b).e[_i];                                              \
    (dest).e[_i] = _larger;                                                    \
  })
585
/*
 * bsl_8x16b: bitwise select over 8 element indices _i.  dest_mask is both the
 * selector and the destination: bits set in dest_mask.e[_i] are taken from
 * source_a.e[_i], clear bits from source_b.e[_i], and the result overwrites
 * dest_mask.e[_i].
 * dest_mask is parenthesized so expression arguments such as *ptr expand
 * correctly.
 * The index _i comes from foreach_element, defined elsewhere in this header.
 */
#define bsl_8x16b(dest_mask, source_a, source_b)                               \
  foreach_element(8,                                                           \
    (dest_mask).e[_i] = ((source_a).e[_i] & (dest_mask).e[_i]) |               \
     ((source_b).e[_i] & ~((dest_mask).e[_i])))
589
/*
 * bif_8x16b: bitwise insert-if-false over 8 element indices _i: where a bit
 * of mask.e[_i] is clear, the bit from source.e[_i] is written into
 * dest.e[_i]; where it is set, dest.e[_i] keeps its own bit.
 * dest and mask are parenthesized so expression arguments such as *ptr expand
 * correctly.
 * The index _i comes from foreach_element, defined elsewhere in this header.
 */
#define bif_8x16b(dest, source, mask)                                          \
  foreach_element(8,                                                           \
    (dest).e[_i] = ((source).e[_i] & ~((mask).e[_i])) |                        \
     ((dest).e[_i] & (mask).e[_i]))
593
/*
 * bsl_4x32b: bitwise select over 4 element indices _i.  dest_mask is both the
 * selector and the destination: bits set in dest_mask.e[_i] are taken from
 * source_a.e[_i], clear bits from source_b.e[_i], and the result overwrites
 * dest_mask.e[_i].
 * dest_mask is parenthesized so expression arguments such as *ptr expand
 * correctly.
 * The index _i comes from foreach_element, defined elsewhere in this header.
 */
#define bsl_4x32b(dest_mask, source_a, source_b)                               \
  foreach_element(4,                                                           \
    (dest_mask).e[_i] = ((source_a).e[_i] & (dest_mask).e[_i]) |               \
     ((source_b).e[_i] & ~((dest_mask).e[_i])))
597
/*
 * bit_4x16b: bitwise insert-if-true over 4 element indices _i: where a bit of
 * mask.e[_i] is set, the bit from source.e[_i] is written into dest.e[_i];
 * where it is clear, dest.e[_i] keeps its own bit.
 * dest and mask are parenthesized so expression arguments such as *ptr expand
 * correctly.
 * The index _i comes from foreach_element, defined elsewhere in this header.
 */
#define bit_4x16b(dest, source, mask)                                          \
  foreach_element(4,                                                           \
    (dest).e[_i] = ((source).e[_i] & (mask).e[_i]) |                           \
     ((dest).e[_i] & ~((mask).e[_i])))
601
602 #endif