/*
 * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of
 * the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
18 #include "vector_types.h"
/*
 * foreach_element(iterations, operation)
 *
 * Executes `operation` once for every lane index _i = 0 .. iterations - 1.
 * `operation` may be a single expression or a braced statement list; it is
 * expected to refer to the loop counter by the name `_i`.
 *
 * The expansion is one braced block, so `_i` is private to each use and two
 * uses in the same scope do not collide. The definition deliberately does
 * not end in a line-continuation, so the following line can never be
 * absorbed into it.
 *
 * NOTE(review): the counter is declared as u32 to match the temporaries the
 * other macros in this file declare - confirm against the project typedefs.
 */
#define foreach_element(iterations, operation) \
{ \
  u32 _i; \
  for(_i = 0; _i < (iterations); _i++) \
  { \
    operation; \
  } \
}
/*
 * Raw loads and stores between a vector (a struct exposing a lane array
 * `.e`) and plain memory.
 *
 * Every macro here expands to a single expression or a single braced block,
 * so it is safe as the body of an unbraced if/else. The 128-bit forms used
 * to be two separate statements, of which only the first was guarded.
 *
 * NOTE(review): the u64 and u16 pointer casts assume the memory operand is
 * suitably aligned and may legally be accessed as that type - confirm at
 * the call sites.
 */

/* Copy 64 bits from memory at `source` into the start of dest.e. */
#define load_64b(dest, source) \
  (*((u64 *)(dest).e) = *((u64 *)(source)))

/* Copy 128 bits (two u64 words) from memory at `source` into dest.e. */
#define load_128b(dest, source) \
  (*((u64 *)(dest).e) = *((u64 *)(source)), \
   *((u64 *)(dest).e + 1) = *(((u64 *)(source)) + 1))

/* Read eight u16 values from memory at `source` into dest.e[0..7]. */
#define load_8x16b(dest, source) \
  foreach_element(8, (dest).e[_i] = ((u16 *)(source))[_i])

/* Copy the first 64 bits of source.e to memory at `dest`. */
#define store_64b(source, dest) \
  (*((u64 *)(dest)) = *((u64 *)(source).e))

/* Copy 128 bits (two u64 words) of source.e to memory at `dest`. */
#define store_128b(source, dest) \
  (*((u64 *)(dest)) = *((u64 *)(source).e), \
   *(((u64 *)(dest)) + 1) = *((u64 *)(source).e + 1))

/*
 * Write source.e[0..7] as u16 values to memory at `dest`. `dest` is
 * parenthesized before the cast so that pointer expressions such as
 * `base + offset` are converted as a whole.
 */
#define store_8x16b(source, dest) \
  foreach_element(8, ((u16 *)(dest))[_i] = (source).e[_i])
/*
 * Conversions between 8 wide lanes and 16 narrow lanes.
 *
 * Both loops run 8 times: each iteration touches narrow lanes 2 * _i and
 * 2 * _i + 1, which covers narrow indices 0..15.
 */

/*
 * For each of the 8 source lanes, store the value to dest lane 2 * _i and
 * the value shifted right by 8 to dest lane 2 * _i + 1. Truncation to the
 * narrow lane width is done by the store into dest.
 */
#define split_8x16b(dest, source) \
  foreach_element(8, \
  { \
    (dest).e[_i * 2] = (source).e[_i]; \
    (dest).e[(_i * 2) + 1] = (source).e[_i] >> 8; \
  })

/*
 * Inverse pairing: dest lane _i = source lane 2 * _i OR'd with source lane
 * 2 * _i + 1 shifted left by 8.
 */
#define merge_16x8b(dest, source) \
  foreach_element(8, \
    (dest).e[_i] = (source).e[_i * 2] | ((source).e[(_i * 2) + 1] << 8))
/*
 * Reinterpreting views of a vector object. Both macros yield an lvalue of
 * type `volatile vec_to` that aliases storage belonging to `source`; no
 * data is copied.
 */

/* View the whole of `source`, starting at its address, as a vec_to. */
#define vector_cast(vec_to, source) \
  (*((volatile vec_to *)(&(source))))

/*
 * View the upper half of source.e as a vec_to: the view starts
 * sizeof(source.e) / 2 bytes into the lane array. `source` is parenthesized
 * so that arguments such as `*ptr` or `array[i]` select `.e` on the intended
 * object.
 */
#define vector_cast_high(vec_to, source) \
  (*((volatile vec_to *)((u8 *)(source).e + (sizeof((source).e) / 2))))
/*
 * Broadcast: store `value` into every lane of dest. The variants differ
 * only in how many lanes are written (the first number in the name).
 * `value` is parenthesized so that any argument expression is assigned as a
 * whole, and it is re-evaluated for every lane.
 */

/* 8 lanes. */
#define dup_8x8b(dest, value) \
  foreach_element(8, (dest).e[_i] = (value))

/* 16 lanes. */
#define dup_16x8b(dest, value) \
  foreach_element(16, (dest).e[_i] = (value))

/* 4 lanes. */
#define dup_4x16b(dest, value) \
  foreach_element(4, (dest).e[_i] = (value))

/* 8 lanes. */
#define dup_8x16b(dest, value) \
  foreach_element(8, (dest).e[_i] = (value))

/* 2 lanes. */
#define dup_2x32b(dest, value) \
  foreach_element(2, (dest).e[_i] = (value))

/* 4 lanes. */
#define dup_4x32b(dest, value) \
  foreach_element(4, (dest).e[_i] = (value))
/*
 * Per-lane shifts by one scalar amount.
 *
 * Each macro writes dest lane _i from source lane _i shifted by `shift`;
 * the lane count is the first number in the name. Where a cast appears, the
 * source lane is converted to that unsigned type before shifting right, so
 * zeros are shifted in. Any narrowing or widening is performed by the store
 * into dest's lane type (not visible from here).
 */

/* 8 lanes: (u16)source >> shift. */
#define shr_narrow_8x16b(dest, source, shift) \
  foreach_element(8, (dest).e[_i] = (u16)(source).e[_i] >> (shift))

/* 2 lanes: source >> shift, in the source lane's own type. */
#define shr_narrow_2x64b(dest, source, shift) \
  foreach_element(2, (dest).e[_i] = (source).e[_i] >> (shift))

/* 8 lanes: (u8)source >> shift. */
#define shr_8x8b(dest, source, shift) \
  foreach_element(8, (dest).e[_i] = (u8)(source).e[_i] >> (shift))

/* 8 lanes: source << shift. */
#define shl_8x8b(dest, source, shift) \
  foreach_element(8, (dest).e[_i] = (source).e[_i] << (shift))

/* 8 lanes: (u16)source >> shift. */
#define shr_8x16b(dest, source, shift) \
  foreach_element(8, (dest).e[_i] = (u16)(source).e[_i] >> (shift))

/* 2 lanes: (u32)source >> shift. */
#define shr_2x32b(dest, source, shift) \
  foreach_element(2, (dest).e[_i] = (u32)(source).e[_i] >> (shift))

/* 4 lanes: (u16)source >> shift. */
#define shr_4x16b(dest, source, shift) \
  foreach_element(4, (dest).e[_i] = (u16)(source).e[_i] >> (shift))

/* 4 lanes: (u32)source << shift (the shift is carried out in 32 bits). */
#define shl_4x16b(dest, source, shift) \
  foreach_element(4, (dest).e[_i] = (u32)(source).e[_i] << (shift))

/* 4 lanes: (u32)source >> shift. */
#define shr_4x32b(dest, source, shift) \
  foreach_element(4, (dest).e[_i] = (u32)(source).e[_i] >> (shift))

/* 4 lanes: (u32)source >> shift. */
#define shr_narrow_4x32b(dest, source, shift) \
  foreach_element(4, (dest).e[_i] = (u32)(source).e[_i] >> (shift))

/* 8 lanes: source << shift. */
#define shl_8x16b(dest, source, shift) \
  foreach_element(8, (dest).e[_i] = (source).e[_i] << (shift))

/* 4 lanes: source << shift. */
#define shl_4x32b(dest, source, shift) \
  foreach_element(4, (dest).e[_i] = (source).e[_i] << (shift))

/* 2 lanes: source << shift. */
#define shl_2x32b(dest, source, shift) \
  foreach_element(2, (dest).e[_i] = (source).e[_i] << (shift))

/* Lane 0 only: source << shift. Expands to a single expression. */
#define shl_1x64b(dest, source, shift) \
  ((dest).e[0] = (source).e[0] << (shift))

/* 2 lanes: source << shift. */
#define shl_2x64b(dest, source, shift) \
  foreach_element(2, (dest).e[_i] = (source).e[_i] << (shift))
/*
 * Per-lane left shift by a per-lane amount: dest lane _i = source_a lane _i
 * shifted left by the low 8 bits of source_b lane _i. The lane count comes
 * from the macro name (2, 8 and 4 respectively).
 *
 * NOTE(review): only the `& 0xFF` mask is applied, so a count that reaches
 * the width of the promoted operand is still undefined in C - confirm that
 * callers keep the counts in range.
 */

/* 2 lanes. */
#define shl_variable_2x64b(dest, source_a, source_b) \
  foreach_element(2, \
    (dest).e[_i] = (source_a).e[_i] << ((source_b).e[_i] & 0xFF))

/* 8 lanes. */
#define shl_variable_8x16b(dest, source_a, source_b) \
  foreach_element(8, \
    (dest).e[_i] = (source_a).e[_i] << ((source_b).e[_i] & 0xFF))

/* 4 lanes. */
#define shl_variable_4x16b(dest, source_a, source_b) \
  foreach_element(4, \
    (dest).e[_i] = (source_a).e[_i] << ((source_b).e[_i] & 0xFF))
/*
 * Remaining scalar-count shifts. Each expands to one expression or one
 * braced block and does not end in a line-continuation.
 */

/* Lane 0 only: dest = source >> shift, in the source lane's own type. */
#define shr_1x64b(dest, source, shift) \
  ((dest).e[0] = (source).e[0] >> (shift))

/*
 * 8 lanes: dest = source << shift. Any widening comes from dest's lane
 * type; the shift itself is evaluated in the promoted source type.
 */
#define shl_long_8x8b(dest, source, shift) \
  foreach_element(8, (dest).e[_i] = (source).e[_i] << (shift))

/* 4 lanes: dest = source << shift, as above. */
#define shl_long_4x16b(dest, source, shift) \
  foreach_element(4, (dest).e[_i] = (source).e[_i] << (shift))
/*
 * shrq_narrow_signed_8x16b(dest, source, shift)
 *
 * Visible statements: source lane _i is converted to s16, shifted right by
 * `shift` into an s32 temporary `result`, and `result` is stored to dest
 * lane _i.
 *
 * NOTE(review): the numbers fused onto the front of the lines jump from 153
 * to 156 and from 156 to 161, so the loop opener and whatever stood between
 * computing `result` and storing it are absent from this copy. The name
 * suggests `result` is clamped before the store - TODO confirm against the
 * upstream file; do not rely on this text as it stands.
 */
153 #define shrq_narrow_signed_8x16b(dest, source, shift) \
156 s32 result = ((s16)(source).e[_i]) >> shift; \
161 (dest).e[_i] = result; \
/*
 * shl_reg_4x32b(dest, source_a, source_b)
 *
 * Visible statements: a per-lane count is read from source_b lane _i into an
 * s8 named `shift`; one statement stores source_a lane _i shifted right by
 * -shift, another stores it shifted left by shift.
 *
 * NOTE(review): the lines that choose between the two stores are missing
 * here (numbering skips 168 and 170). Presumably the sign of `shift`
 * selects the direction - confirm against upstream.
 */
164 #define shl_reg_4x32b(dest, source_a, source_b) \
167 s8 shift = (source_b).e[_i]; \
169 dest.e[_i] = (source_a).e[_i] >> (-shift); \
171 dest.e[_i] = (source_a).e[_i] << shift; \
/*
 * shl_reg_2x32b(dest, source_a, source_b)
 *
 * Same visible statements as shl_reg_4x32b. NOTE(review): the selecting
 * lines and the loop opener are missing here as well; the lane count is
 * presumably 2, going by the name - confirm.
 */
174 #define shl_reg_2x32b(dest, source_a, source_b) \
177 s8 shift = (source_b).e[_i]; \
179 dest.e[_i] = (source_a).e[_i] >> (-shift); \
181 dest.e[_i] = (source_a).e[_i] << shift; \
/*
 * shl_reg_2x64b(dest, source_a, source_b)
 *
 * Same visible statements as shl_reg_4x32b. NOTE(review): the selecting
 * lines and the loop opener are missing here as well. Note also that `dest`
 * is used without parentheses in all three shl_reg_* macros, so only a
 * plain identifier is a safe argument.
 */
184 #define shl_reg_2x64b(dest, source_a, source_b) \
187 s8 shift = (source_b).e[_i]; \
189 dest.e[_i] = (source_a).e[_i] >> (-shift); \
191 dest.e[_i] = (source_a).e[_i] << shift; \
/*
 * Shift-and-insert over 8 lanes: the shifted source is merged into dest,
 * and the dest bits not covered by the shifted field are kept.
 */

/*
 * dest = (dest & ~(0xFF >> shift)) | ((u8)source >> shift)
 * i.e. the top `shift` bits of dest's low byte survive and the rest is
 * replaced by the source shifted right.
 */
#define sri_8x8b(dest, source, shift) \
  foreach_element(8, (dest).e[_i] = ((dest).e[_i] & ~(0xFF >> (shift))) | \
   ((u8)(source).e[_i] >> (shift)))

/*
 * dest = (dest & ~(0xFF << shift)) | (source << shift)
 * i.e. the low `shift` bits of dest survive and the rest is replaced by the
 * source shifted left; bits beyond dest's lane width are dropped by the
 * store.
 */
#define sli_8x8b(dest, source, shift) \
  foreach_element(8, (dest).e[_i] = ((dest).e[_i] & ~(0xFF << (shift))) | \
   ((source).e[_i] << (shift)))
/*
 * Lane-by-lane copies. The expansion is a plain assignment per lane; any
 * narrowing or widening is the implicit conversion performed when storing
 * into dest's lane type.
 */

/* Copy 8 lanes. */
#define mov_narrow_8x16b(dest, source) \
  foreach_element(8, (dest).e[_i] = (source).e[_i])

/* Copy 4 lanes. */
#define mov_narrow_4x32b(dest, source) \
  foreach_element(4, (dest).e[_i] = (source).e[_i])

/* Copy 2 lanes. */
#define mov_narrow_2x64b(dest, source) \
  foreach_element(2, (dest).e[_i] = (source).e[_i])

/* Copy 8 lanes. */
#define mov_wide_8x8b(dest, source) \
  foreach_element(8, (dest).e[_i] = (source).e[_i])

/* Copy 2 lanes. */
#define mov_wide_2x32b(dest, source) \
  foreach_element(2, (dest).e[_i] = (source).e[_i])

/* 4 lanes: dest = bitwise NOT of source. */
#define mvn_4x16b(dest, source) \
  foreach_element(4, (dest).e[_i] = ~((source).e[_i]))
/*
 * Lane-wise addition: dest lane _i = source_a lane _i + source_b lane _i.
 * The sum is computed in the promoted operand type and wraps to dest's lane
 * width when stored. The variants differ only in lane count.
 */

/* 4 lanes. */
#define add_4x16b(dest, source_a, source_b) \
  foreach_element(4, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])

/* 4 lanes. */
#define add_4x32b(dest, source_a, source_b) \
  foreach_element(4, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])

/* 2 lanes. */
#define add_2x32b(dest, source_a, source_b) \
  foreach_element(2, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])

/* 8 lanes. */
#define add_8x16b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])

/* 16 lanes. */
#define add_16x8b(dest, source_a, source_b) \
  foreach_element(16, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])

/* 8 lanes. */
#define add_8x8b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])

/* Lane 0 only; wrapped in parentheses so it expands to one expression. */
#define add_1x64b(dest, source_a, source_b) \
  ((dest).e[0] = (source_a).e[0] + (source_b).e[0])

/* 2 lanes. */
#define add_2x64b(dest, source_a, source_b) \
  foreach_element(2, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])
/*
 * Add two vectors lane by lane and keep the upper half of each sum.
 *
 * In both macros the shift is applied to the sum before it is assigned. The
 * 2x64b form previously had its parentheses placed so that the full sum was
 * assigned and the shifted value discarded, leaving the low half in dest.
 * It now matches the 4x32b form.
 */

/* 2 lanes: dest = (source_a + source_b) >> 32. */
#define add_high_narrow_2x64b(dest, source_a, source_b) \
  foreach_element(2, \
    (dest).e[_i] = ((source_a).e[_i] + (source_b).e[_i]) >> 32)

/* 4 lanes: dest = (source_a + source_b) >> 16. */
#define add_high_narrow_4x32b(dest, source_a, source_b) \
  foreach_element(4, \
    (dest).e[_i] = ((source_a).e[_i] + (source_b).e[_i]) >> 16)
/*
 * Lane-wise subtraction and the widening add/sub forms. Every macro
 * computes source_a (op) source_b per lane in the promoted operand type and
 * stores the result into dest's lane; wrap-around or widening is decided by
 * dest's lane type.
 */

/* 4 lanes: a - b. */
#define sub_4x16b(dest, source_a, source_b) \
  foreach_element(4, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])

/* 4 lanes: a - b. */
#define sub_4x32b(dest, source_a, source_b) \
  foreach_element(4, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])

/* 2 lanes: a - b. */
#define sub_2x32b(dest, source_a, source_b) \
  foreach_element(2, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])

/* 8 lanes: a - b. */
#define sub_wide_8x8b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])

/* 8 lanes: a + b. */
#define add_wide_8x8b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])

/*
 * 2 lanes: a + b. NOTE(review): the sum is formed in the operands' own
 * promoted type before the store, so no carry reaches a wider dest lane
 * unless an operand already has the wider type - confirm the intended
 * operand types.
 */
#define add_wide_2x32b(dest, source_a, source_b) \
  foreach_element(2, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i])
/*
 * addq_8x8b(dest, source_a, source_b)
 *
 * Visible statements: the per-lane sum source_a + source_b is placed in a
 * u32 temporary `result`, which is later stored to dest lane _i.
 *
 * NOTE(review): the fused numbering skips 274-275 and 277-278, so the loop
 * opener and the statements between the sum and the store are missing from
 * this copy. The name hints at a clamp on `result` - TODO confirm against
 * the upstream file.
 */
273 #define addq_8x8b(dest, source_a, source_b) \
276 u32 result = (source_a).e[_i] + (source_b).e[_i]; \
279 (dest).e[_i] = result; \
/*
 * subq_8x8b(dest, source_a, source_b)
 *
 * Visible statements: the per-lane difference source_a - source_b is placed
 * in a u32 temporary `result`, which is later stored to dest lane _i.
 *
 * NOTE(review): the loop opener and the statements between the difference
 * and the store are missing here (numbering skips 283-284 and 286-287) -
 * confirm against upstream.
 */
282 #define subq_8x8b(dest, source_a, source_b) \
285 u32 result = (source_a).e[_i] - (source_b).e[_i]; \
288 (dest).e[_i] = result; \
/*
 * subs_long_8x8b(dest, source_a, source_b)
 *
 * Forwards its three arguments unchanged to subs_8x8b.
 *
 * NOTE(review): no definition of subs_8x8b appears in the visible part of
 * this file (only subq_8x8b, subs_16x8b and subs_8x16b do) - confirm it is
 * defined elsewhere before using this macro.
 */
291 #define subs_long_8x8b(dest, source_a, source_b) \
292 subs_8x8b(dest, source_a, source_b) \
/*
 * subs_16x8b(dest, source_a, source_b)
 *
 * Iterates over 16 lanes. Visible statements: the difference source_a -
 * source_b goes into a u32 temporary `result`, which is later stored to
 * dest lane _i.
 *
 * NOTE(review): the statements between the difference and the store, and
 * the closing of the loop, are missing here (numbering skips 296, 298-299
 * and 301) - confirm against upstream.
 */
294 #define subs_16x8b(dest, source_a, source_b) \
295 foreach_element(16, \
297 u32 result = (source_a).e[_i] - (source_b).e[_i]; \
300 (dest).e[_i] = result; \
/*
 * subs_8x16b(dest, source_a, source_b)
 *
 * Visible statements: the difference source_a - source_b goes into an s32
 * temporary `result`, which is later stored to dest lane _i.
 *
 * NOTE(review): the loop opener and the statements between the difference
 * and the store are missing here (numbering skips 304-305 and 307-309) -
 * confirm against upstream.
 */
303 #define subs_8x16b(dest, source_a, source_b) \
306 s32 result = (source_a).e[_i] - (source_b).e[_i]; \
310 (dest).e[_i] = result; \
/*
 * Wrapping lane-wise subtraction: dest lane _i = source_a lane _i minus
 * source_b lane _i, truncated to dest's lane width by the store.
 */

/* 8 lanes. */
#define sub_8x16b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])

/* 16 lanes. */
#define sub_16x8b(dest, source_a, source_b) \
  foreach_element(16, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i])
/*
 * Lane-wise bitwise logic. The first number in each name is the lane
 * count. The two-operand forms combine source_a and source_b into dest; the
 * `immediate` forms combine a lane with one scalar, which is re-evaluated
 * for each lane.
 */

/* 8 lanes: a | ~b. */
#define orn_8x16b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] | ~((source_b).e[_i]))

/* 4 lanes: a & b. */
#define and_4x16b(dest, source_a, source_b) \
  foreach_element(4, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])

/* 8 lanes: a & b. */
#define and_8x16b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])

/* 4 lanes: a & b. */
#define and_4x32b(dest, source_a, source_b) \
  foreach_element(4, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])

/* 16 lanes: a & b. */
#define and_16x8b(dest, source_a, source_b) \
  foreach_element(16, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])

/* 8 lanes: a & b. */
#define and_8x8b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])

/* 2 lanes: a & b. */
#define and_2x32b(dest, source_a, source_b) \
  foreach_element(2, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i])

/* 8 lanes: a & ~b. */
#define bic_8x8b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] & ~((source_b).e[_i]))

/* 8 lanes: a & ~b. */
#define bic_8x16b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] & ~((source_b).e[_i]))

/* 4 lanes, in place: dest &= ~value. */
#define bic_immediate_4x16b(dest, value) \
  foreach_element(4, (dest).e[_i] = (dest).e[_i] & ~(value))

/* 8 lanes, in place: dest &= ~value. */
#define bic_immediate_8x16b(dest, value) \
  foreach_element(8, (dest).e[_i] = (dest).e[_i] & ~(value))

/* 8 lanes: a | b. */
#define or_8x16b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] | (source_b).e[_i])

/* 8 lanes: a | value. */
#define or_immediate_8x16b(dest, source_a, value) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] | (value))

/* 8 lanes: a ^ b. */
#define eor_8x16b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] ^ (source_b).e[_i])

/* 4 lanes: a ^ b. */
#define eor_4x32b(dest, source_a, source_b) \
  foreach_element(4, (dest).e[_i] = (source_a).e[_i] ^ (source_b).e[_i])

/* 2 lanes: a ^ b. */
#define eor_2x32b(dest, source_a, source_b) \
  foreach_element(2, (dest).e[_i] = (source_a).e[_i] ^ (source_b).e[_i])
/*
 * Pack two vectors into one whose lanes are twice as wide: the source_a
 * lane supplies the low half and the source_b lane the high half of dest
 * lane _i. Each source lane is first truncated to the half width by a cast,
 * so stray upper bits (including sign extension of a negative lane) never
 * leak into the other half.
 */

/* 8 lanes: low byte from source_a, next byte from source_b. */
#define zip_8x16b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = \
   (u8)(source_a).e[_i] | ((u8)(source_b).e[_i] << 8))

/* 4 lanes: low 16 bits from source_a, high 16 bits from source_b. */
#define zip_4x32b(dest, source_a, source_b) \
  foreach_element(4, (dest).e[_i] = \
   (u16)(source_a).e[_i] | ((u16)(source_b).e[_i] << 16))

/*
 * 2 lanes: low 32 bits from source_a, high 32 bits from source_b. The low
 * half is truncated to u32, consistent with the two macros above; the high
 * half is widened to u64 only so that the shift by 32 is well defined.
 */
#define zip_2x64b(dest, source_a, source_b) \
  foreach_element(2, (dest).e[_i] = \
   (u32)(source_a).e[_i] | ((u64)(u32)(source_b).e[_i] << 32))
/*
 * Split wide lanes into two streams: dest_a receives each source lane as
 * is, dest_b receives it shifted right by 8. Truncation to the narrow lane
 * width is done by the stores.
 */

/* 8 source lanes -> dest_a.e[0..7] and dest_b.e[0..7]. */
#define unzip_8x8b(dest_a, dest_b, source) \
  foreach_element(8, \
  { \
    (dest_a).e[_i] = (source).e[_i]; \
    (dest_b).e[_i] = ((source).e[_i]) >> 8; \
  })

/*
 * Two 8-lane sources -> 16 lanes per destination: source_a fills lanes
 * 0..7 and source_b fills lanes 8..15. The two passes stay separate and in
 * this order; the outer braces make the expansion a single statement.
 */
#define unzip_16x8b(dest_a, dest_b, source_a, source_b) \
{ \
  foreach_element(8, \
  { \
    (dest_a).e[_i] = (source_a).e[_i]; \
    (dest_b).e[_i] = (source_a).e[_i] >> 8; \
  }); \
  foreach_element(8, \
  { \
    (dest_a).e[_i + 8] = (source_b).e[_i]; \
    (dest_b).e[_i + 8] = (source_b).e[_i] >> 8; \
  }); \
}
/*
 * tbl_16(dest, indexes, table)
 *
 * Visible statements: a u32 `index` is read from indexes lane _i, and dest
 * lane _i is assigned table.e[index].
 *
 * NOTE(review): the fused numbering skips 399-400, 402 and 404-405, so the
 * loop opener and the lines around the table read are missing from this
 * copy. Presumably they bound-check `index` and provide a fallback value -
 * TODO confirm against the upstream file. `indexes` and `table` are used
 * without parentheses, so only plain identifiers are safe arguments.
 */
398 #define tbl_16(dest, indexes, table) \
401 u32 index = indexes.e[_i]; \
403 (dest).e[_i] = table.e[index]; \
/*
 * Lane-wise comparisons producing masks: a dest lane becomes all ones when
 * the condition holds and zero otherwise.
 *
 * The `~((cond) - 1)` idiom maps a C truth value to a mask: cond == 1 gives
 * ~0 and cond == 0 gives ~(-1) == 0. The cmpltz forms replicate the sign
 * bit with a right shift of the signed value instead. Vector arguments are
 * parenthesized so that expressions such as `*ptr` are accepted.
 */

/* 8 lanes: source == 0. */
#define cmpeqz_8x16b(dest, source) \
  foreach_element(8, (dest).e[_i] = ~(((source).e[_i] == 0) - 1))

/*
 * 8 lanes: (s16)source < 0.
 * NOTE(review): relies on right shift of a negative value being
 * arithmetic, which C leaves implementation-defined.
 */
#define cmpltz_8x16b(dest, source) \
  foreach_element(8, (dest).e[_i] = ((s16)(source).e[_i] >> 15))

/* 4 lanes: (s32)source < 0 (same shift caveat as above). */
#define cmpltz_4x32b(dest, source) \
  foreach_element(4, (dest).e[_i] = ((s32)(source).e[_i] >> 31))

/* 2 lanes: (s32)source < 0 (same shift caveat as above). */
#define cmpltz_2x32b(dest, source) \
  foreach_element(2, (dest).e[_i] = ((s32)(source).e[_i] >> 31))

/* 4 lanes: source_a <= source_b, compared in the lanes' own types. */
#define cmplte_4x16b(dest, source_a, source_b) \
  foreach_element(4, \
    (dest).e[_i] = ~(((source_a).e[_i] <= (source_b).e[_i]) - 1))

/* 4 lanes: source_a < source_b. */
#define cmplt_4x16b(dest, source_a, source_b) \
  foreach_element(4, \
    (dest).e[_i] = ~(((source_a).e[_i] < (source_b).e[_i]) - 1))

/* 4 lanes: source_a > source_b. */
#define cmpgt_4x16b(dest, source_a, source_b) \
  foreach_element(4, \
    (dest).e[_i] = ~(((source_a).e[_i] > (source_b).e[_i]) - 1))

/* 8 lanes: (source_a & source_b) != 0. */
#define tst_8x16b(dest, source_a, source_b) \
  foreach_element(8, \
    (dest).e[_i] = ~((((source_a).e[_i] & (source_b).e[_i]) != 0) - 1))
/*
 * andi_8x8b(dest, source_a, value)
 *
 * 8 lanes: dest = source_a & value. `value` is parenthesized because `&`
 * binds tighter than `|`, `^`, `&&` and `?:`; without the parentheses an
 * argument such as `m1 | m2` would be applied only in part.
 */
#define andi_8x8b(dest, source_a, value) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] & (value))

/*
 * average_8x16b(dest, source_a, source_b)
 *
 * 8 lanes: dest = (source_a + source_b) >> 1. The sum is formed in the
 * promoted operand type, so the carry out of a 16-bit lane takes part in
 * the shift; the result rounds down.
 */
#define average_8x16b(dest, source_a, source_b) \
  foreach_element(8, \
    (dest).e[_i] = ((source_a).e[_i] + (source_b).e[_i]) >> 1)
/*
 * Lane-wise multiply, multiply-accumulate (mla: dest += product) and
 * multiply-subtract (mls: dest -= product).
 *
 * Products are computed in the promoted operand type and converted to
 * dest's lane type by the store. The `scalar` forms multiply every lane by
 * one value; `value` is parenthesized because `*` binds tighter than `+`
 * and `-`, so an argument such as `k + 1` would otherwise be applied only
 * in part.
 *
 * NOTE(review): for 16-bit unsigned lanes the product is formed in signed
 * int after promotion and can overflow for large inputs - confirm operand
 * ranges for the 16-bit forms.
 */

/* 8 lanes: a * b. */
#define mul_8x8b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])

/* 8 lanes: a * b. */
#define mul_8x16b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])

/* 2 lanes: a * b. */
#define mul_2x32b(dest, source_a, source_b) \
  foreach_element(2, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])

/* 4 lanes: a * b. */
#define mul_4x32b(dest, source_a, source_b) \
  foreach_element(4, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])

/* 8 lanes: a * b. */
#define mul_long_8x8b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])

/* 4 lanes: a * b. */
#define mul_long_4x16b(dest, source_a, source_b) \
  foreach_element(4, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i])

/* 2 lanes: a * (s64)b, so the product is formed in 64 bits. */
#define mul_long_2x32b(dest, source_a, source_b) \
  foreach_element(2, \
    (dest).e[_i] = (source_a).e[_i] * (s64)((source_b).e[_i]))

/* 2 lanes: source * value. */
#define mul_scalar_2x32b(dest, source, value) \
  foreach_element(2, (dest).e[_i] = (source).e[_i] * (value))

/* 8 lanes: source * value. */
#define mul_scalar_long_8x16b(dest, source, value) \
  foreach_element(8, (dest).e[_i] = (source).e[_i] * (value))

/* 2 lanes: source * value. */
#define mul_scalar_long_2x32b(dest, source, value) \
  foreach_element(2, (dest).e[_i] = (source).e[_i] * (value))

/* 2 lanes: dest += a * b. */
#define mla_2x32b(dest, source_a, source_b) \
  foreach_element(2, (dest).e[_i] += (source_a).e[_i] * (source_b).e[_i])

/* 4 lanes: dest += a * b. */
#define mla_4x32b(dest, source_a, source_b) \
  foreach_element(4, (dest).e[_i] += (source_a).e[_i] * (source_b).e[_i])

/* 2 lanes: dest += source * value. */
#define mla_scalar_long_2x32b(dest, source, value) \
  foreach_element(2, (dest).e[_i] += (source).e[_i] * (value))

/* 8 lanes: dest += a * b. */
#define mla_long_8x8b(dest, source_a, source_b) \
  foreach_element(8, (dest).e[_i] += (source_a).e[_i] * (source_b).e[_i])

/* 2 lanes: dest += a * (s64)b. */
#define mla_long_2x32b(dest, source_a, source_b) \
  foreach_element(2, (dest).e[_i] += (source_a).e[_i] * (s64)(source_b).e[_i])

/* 4 lanes: dest += source * value. */
#define mla_scalar_4x32b(dest, source, value) \
  foreach_element(4, (dest).e[_i] += (source).e[_i] * (value))

/* 2 lanes: dest += source * value. */
#define mla_scalar_2x32b(dest, source, value) \
  foreach_element(2, (dest).e[_i] += (source).e[_i] * (value))

/* 4 lanes: dest -= source * value. */
#define mls_scalar_4x32b(dest, source, value) \
  foreach_element(4, (dest).e[_i] -= (source).e[_i] * (value))

/* 2 lanes: dest -= source * value. */
#define mls_scalar_2x32b(dest, source, value) \
  foreach_element(2, (dest).e[_i] -= (source).e[_i] * (value))

/* 2 lanes: dest -= source * value. */
#define mls_scalar_long_2x32b(dest, source, value) \
  foreach_element(2, (dest).e[_i] -= (source).e[_i] * (value))
/*
 * rev_2x32b(dest, source)
 *
 * Visible statements: source lane 1 is saved in a u32 temporary `tmp`, then
 * dest lane 1 is assigned source lane 0.
 *
 * NOTE(review): `tmp` is never consumed in the visible text and the fused
 * numbering skips 503 and 506-507, so at least one statement (presumably
 * the store of `tmp` into dest lane 0) and the enclosing braces are missing
 * from this copy - TODO confirm against the upstream file. `source` is used
 * without parentheses, so only a plain identifier is a safe argument.
 */
502 #define rev_2x32b(dest, source) \
504 u32 tmp = source.e[1]; \
505 (dest).e[1] = source.e[0]; \
/*
 * Lane-wise absolute value and negation. `source` is parenthesized so that
 * arguments such as `*ptr` or `array[i]` select `.e` on the intended
 * object.
 *
 * abs() is the C library function taking an int, so each lane is converted
 * to int for the call. NOTE(review): abs(INT_MIN) is undefined - confirm
 * callers never pass the most negative value.
 */

/* 4 lanes: dest = abs(source). */
#define abs_4x32b(dest, source) \
  foreach_element(4, (dest).e[_i] = abs((source).e[_i]))

/* 2 lanes: dest = abs(source). */
#define abs_2x32b(dest, source) \
  foreach_element(2, (dest).e[_i] = abs((source).e[_i]))

/* 2 lanes: dest = -source. */
#define neg_2x32b(dest, source) \
  foreach_element(2, (dest).e[_i] = -((source).e[_i]))
/*
 * shrq_narrow_8x16b(dest, source, shift)
 *
 * Visible statements: source lane _i shifted right by `shift` is placed in
 * a u32 temporary `result`, which is later stored to dest lane _i.
 *
 * NOTE(review): the fused numbering skips 520-521 and 523-524, so the loop
 * opener and the statements between the shift and the store are missing
 * from this copy. The name hints at a clamp on `result` - TODO confirm
 * against the upstream file. `shift` is used without parentheses.
 */
519 #define shrq_narrow_8x16b(dest, source, shift) \
522 u32 result = ((source).e[_i]) >> shift; \
525 (dest).e[_i] = result; \
/*
 * Lane-wise minimum / maximum. Each lane starts from source_a's value held
 * in a 32-bit temporary and is replaced by source_b's value when that
 * compares smaller (min) or larger (max). The temporary is s32 for the
 * 16-bit forms and u32 for the 8-bit forms, which fixes whether the
 * comparison is carried out signed or unsigned after promotion.
 */

/* 8 lanes, s32 temporary. */
#define min_8x16b(dest, source_a, source_b) \
  foreach_element(8, \
  { \
    s32 result = (source_a).e[_i]; \
    if((source_b).e[_i] < result) \
      result = (source_b).e[_i]; \
    (dest).e[_i] = result; \
  })

/* 8 lanes, u32 temporary. */
#define min_8x8b(dest, source_a, source_b) \
  foreach_element(8, \
  { \
    u32 result = (source_a).e[_i]; \
    if((source_b).e[_i] < result) \
      result = (source_b).e[_i]; \
    (dest).e[_i] = result; \
  })

/* 16 lanes, u32 temporary. */
#define min_16x8b(dest, source_a, source_b) \
  foreach_element(16, \
  { \
    u32 result = (source_a).e[_i]; \
    if((source_b).e[_i] < result) \
      result = (source_b).e[_i]; \
    (dest).e[_i] = result; \
  })

/* 8 lanes, s32 temporary. */
#define max_8x16b(dest, source_a, source_b) \
  foreach_element(8, \
  { \
    s32 result = (source_a).e[_i]; \
    if((source_b).e[_i] > result) \
      result = (source_b).e[_i]; \
    (dest).e[_i] = result; \
  })
/*
 * Bitwise selects: each result bit is taken from one of two inputs
 * according to a mask bit. All vector arguments are parenthesized so that
 * expressions such as `*ptr` are accepted.
 */

/*
 * 8 lanes, mask doubles as destination:
 *   dest_mask = (source_a & dest_mask) | (source_b & ~dest_mask)
 * Bits set in the incoming mask pick source_a, clear bits pick source_b.
 */
#define bsl_8x16b(dest_mask, source_a, source_b) \
  foreach_element(8, \
    (dest_mask).e[_i] = ((source_a).e[_i] & (dest_mask).e[_i]) | \
     ((source_b).e[_i] & ~((dest_mask).e[_i])))

/*
 * 8 lanes: dest = (source & ~mask) | (dest & mask)
 * Bits clear in mask are replaced from source; set bits keep dest.
 */
#define bif_8x16b(dest, source, mask) \
  foreach_element(8, \
    (dest).e[_i] = ((source).e[_i] & ~((mask).e[_i])) | \
     ((dest).e[_i] & (mask).e[_i]))

/* 4 lanes, same selection rule as bsl_8x16b. */
#define bsl_4x32b(dest_mask, source_a, source_b) \
  foreach_element(4, \
    (dest_mask).e[_i] = ((source_a).e[_i] & (dest_mask).e[_i]) | \
     ((source_b).e[_i] & ~((dest_mask).e[_i])))

/*
 * 4 lanes: dest = (source & mask) | (dest & ~mask)
 * Bits set in mask are replaced from source; clear bits keep dest.
 */
#define bit_4x16b(dest, source, mask) \
  foreach_element(4, \
    (dest).e[_i] = ((source).e[_i] & (mask).e[_i]) | \
     ((dest).e[_i] & ~((mask).e[_i])))