2 * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of
7 * the License, or (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
/*
 * build_vector_type_pair(sign, size, count, count_x2):
 * Token-pastes a pair of vector typedefs: vec_<count>x<size><sign> with
 * `count` elements of type <sign><size> (e.g. u16), and the double-width
 * vec_<count_x2>x<size><sign>, which exposes both a flat e[count_x2]
 * element array and low/high halves of the narrow type.
 * NOTE(review): the final typedef deliberately has no ';' — the caller
 * (build_vector_types) supplies it.
 */
18 #define build_vector_type_pair(sign, size, count, count_x2) \
21 sign##size e[count]; \
22 } vec_##count##x##size##sign; \
28 sign##size e[count_x2]; \
31 vec_##count##x##size##sign low; \
32 vec_##count##x##size##sign high; \
35 } vec_##count_x2##x##size##sign \
/*
 * build_vector_types(sign): instantiates every width pairing
 * (8x8/16x8, 4x16/8x16, 2x32/4x32, 1x64/2x64) for one sign prefix.
 */
37 #define build_vector_types(sign) \
38 build_vector_type_pair(sign, 8, 8, 16); \
39 build_vector_type_pair(sign, 16, 4, 8); \
40 build_vector_type_pair(sign, 32, 2, 4); \
41 build_vector_type_pair(sign, 64, 1, 2) \
/* Emit the full unsigned (vec_*u) and signed (vec_*s) type sets. */
43 build_vector_types(u);
44 build_vector_types(s);
/*
 * foreach_element(iterations, operation): expands `operation` once per
 * element with the index available as _i (0 .. iterations-1).  All of the
 * per-element vector macros below are built on this; the declaration of
 * _i comes from the macro's surrounding block framing.
 */
47 #define foreach_element(iterations, operation) \
50 for(_i = 0; _i < iterations; _i++) \
/* load_64b(dest, source): copy 8 bytes from memory at `source` into the
   vector's element storage (single u64 move; assumes suitable alignment). */
56 #define load_64b(dest, source) \
57 *((u64 *)(dest).e) = *((u64 *)(source)) \
/* load_128b: as load_64b but copies 16 bytes as two u64 moves. */
59 #define load_128b(dest, source) \
60 *((u64 *)(dest).e) = *((u64 *)(source)); \
61 *((u64 *)(dest).e + 1) = *(((u64 *)(source)) + 1) \
/* load_8x16b: element-wise load of eight u16 values from `source`. */
63 #define load_8x16b(dest, source) \
64 foreach_element(8, (dest).e[_i] = ((u16 *)(source))[_i]) \
/* store_64b(source, dest): inverse of load_64b — 8 bytes vector -> memory. */
66 #define store_64b(source, dest) \
67 *((u64 *)(dest)) = *((u64 *)(source).e) \
/* store_128b: inverse of load_128b — 16 bytes vector -> memory. */
69 #define store_128b(source, dest) \
70 *((u64 *)(dest)) = *((u64 *)(source).e); \
71 *(((u64 *)(dest)) + 1) = *((u64 *)(source).e + 1) \
/*
 * store_8x16b(source, dest): element-wise store of the eight 16-bit
 * elements of `source` to the u16 array at `dest` (inverse of load_8x16b).
 * Fix: `dest` is now parenthesized in the expansion; previously
 * ((u16 *)dest)[_i] bound the cast before any arithmetic in the argument,
 * so pointer expressions such as `buf + 2` were offset in the wrong units.
 */
#define store_8x16b(source, dest)                                             \
  foreach_element(8, ((u16 *)(dest))[_i] = (source).e[_i])                    \
/* split_8x16b(dest, source): unpack each 16-bit element of `source` into
   two consecutive 8-bit elements of `dest` (low byte first). */
77 #define split_8x16b(dest, source) \
80 (dest).e[_i * 2] = (source).e[_i]; \
81 (dest).e[(_i * 2) + 1] = (source).e[_i] >> 8; \
/* merge_16x8b(dest, source): inverse of split_8x16b — pack byte pairs of
   `source` back into 16-bit elements of `dest`. */
84 #define merge_16x8b(dest, source) \
86 (dest).e[_i] = (source).e[_i * 2] | ((source).e[(_i * 2) + 1] << 8)) \
/* vector_cast(vec_to, source): reinterpret `source` in place as type
   vec_to (no copy; volatile deters the optimizer around the type pun). */
88 #define vector_cast(vec_to, source) \
89 (*((volatile vec_to *)(&(source)))) \
/*
 * vector_cast_high(vec_to, source): reinterpret the upper half of
 * `source`'s element storage as the half-sized type vec_to (no copy;
 * volatile deters the optimizer around the type pun).
 * Fix: `source` is now parenthesized in the expansion (matching
 * vector_cast above) so non-trivial argument expressions bind correctly.
 */
#define vector_cast_high(vec_to, source)                                      \
  (*((volatile vec_to *)((u8 *)(source).e + (sizeof((source).e) / 2))))       \
/*
 * dup_<count>x<size>b(dest, value): broadcast the scalar `value` into
 * every element of `dest` (vdup-style splat).  NOTE(review): `value` is
 * expanded without parentheses — pass a simple expression.
 */
95 #define dup_8x8b(dest, value) \
96 foreach_element(8, (dest).e[_i] = value) \
98 #define dup_16x8b(dest, value) \
99 foreach_element(16, (dest).e[_i] = value) \
101 #define dup_4x16b(dest, value) \
102 foreach_element(4, (dest).e[_i] = value) \
104 #define dup_8x16b(dest, value) \
105 foreach_element(8, (dest).e[_i] = value) \
107 #define dup_2x32b(dest, value) \
108 foreach_element(2, (dest).e[_i] = value) \
110 #define dup_4x32b(dest, value) \
111 foreach_element(4, (dest).e[_i] = value) \
/*
 * Per-element shifts.  Naming: shr/shl = right/left shift by an immediate;
 * "_narrow" variants store into a narrower destination vector;
 * "_long" variants store into a wider destination vector;
 * "_variable" variants take the per-element shift count from source_b.
 * Variants that cast through an unsigned type (u8/u16/u32) force a
 * logical right shift; those without a cast shift in the element's own
 * type (arithmetic for signed element vectors).
 */
113 #define shr_narrow_8x16b(dest, source, shift) \
114 foreach_element(8, (dest).e[_i] = (u16)(source).e[_i] >> (shift)) \
116 #define shr_narrow_2x64b(dest, source, shift) \
117 foreach_element(2, (dest).e[_i] = (source).e[_i] >> (shift)) \
119 #define shr_8x8b(dest, source, shift) \
120 foreach_element(8, (dest).e[_i] = (u8)(source).e[_i] >> (shift)) \
122 #define shl_8x8b(dest, source, shift) \
123 foreach_element(8, (dest).e[_i] = (source).e[_i] << (shift)) \
125 #define shr_8x16b(dest, source, shift) \
126 foreach_element(8, (dest).e[_i] = (u16)(source).e[_i] >> (shift)) \
128 #define shr_2x32b(dest, source, shift) \
129 foreach_element(2, (dest).e[_i] = (u32)(source).e[_i] >> (shift)) \
/* NOTE(review): no unsigned cast here — shifts in the element's own type. */
131 #define shr_4x16b(dest, source, shift) \
132 foreach_element(4, (dest).e[_i] = (source).e[_i] >> (shift)) \
134 #define shl_4x16b(dest, source, shift) \
135 foreach_element(4, (dest).e[_i] = (u32)(source).e[_i] << (shift)) \
137 #define shr_4x32b(dest, source, shift) \
138 foreach_element(4, (dest).e[_i] = (u32)(source).e[_i] >> (shift)) \
140 #define shr_narrow_4x32b(dest, source, shift) \
141 foreach_element(4, (dest).e[_i] = (u32)(source).e[_i] >> (shift)) \
143 #define shl_8x16b(dest, source, shift) \
144 foreach_element(8, (dest).e[_i] = (source).e[_i] << (shift)) \
146 #define shl_4x32b(dest, source, shift) \
147 foreach_element(4, (dest).e[_i] = (source).e[_i] << (shift)) \
149 #define shl_2x32b(dest, source, shift) \
150 foreach_element(2, (dest).e[_i] = (source).e[_i] << (shift)) \
152 #define shl_1x64b(dest, source, shift) \
153 ((dest).e[0] = (source).e[0] << (shift)) \
155 #define shl_2x64b(dest, source, shift) \
156 foreach_element(2, (dest).e[_i] = (source).e[_i] << (shift)) \
/* Variable shifts: count comes from the low byte of source_b's element.
   NOTE(review): counts >= the element width are not clamped here —
   behavior then relies on the host's shift semantics; confirm callers
   never exceed it. */
158 #define shl_variable_2x64b(dest, source_a, source_b) \
160 (dest).e[_i] = (source_a).e[_i] << ((source_b).e[_i] & 0xFF)) \
162 #define shl_variable_8x16b(dest, source_a, source_b) \
164 (dest).e[_i] = (source_a).e[_i] << ((source_b).e[_i] & 0xFF)) \
166 #define shl_variable_4x16b(dest, source_a, source_b) \
168 (dest).e[_i] = (source_a).e[_i] << ((source_b).e[_i] & 0xFF)) \
170 #define shr_1x64b(dest, source, shift) \
171 ((dest).e[0] = (source).e[0] >> (shift)) \
173 #define shl_long_8x8b(dest, source, shift) \
174 foreach_element(8, (dest).e[_i] = (source).e[_i] << (shift)) \
176 #define shl_long_4x16b(dest, source, shift) \
177 foreach_element(4, (dest).e[_i] = (source).e[_i] << (shift)) \
/*
 * shrq_narrow_signed_8x16b: arithmetic-shift each signed 16-bit element
 * right by `shift`, saturate the result, and store it narrowed (VQSHRN
 * style); the saturation clamp is applied between these two statements.
 * NOTE(review): `shift` is expanded unparenthesized — pass a simple
 * expression.
 */
179 #define shrq_narrow_signed_8x16b(dest, source, shift) \
182 s32 result = ((s16)(source).e[_i]) >> shift; \
187 (dest).e[_i] = result; \
/*
 * shl_reg_*: register-controlled shift (VSHL register form).  Each
 * element of source_a is shifted by the signed count in source_b's
 * element: a negative count shifts right by its magnitude, a
 * non-negative count shifts left.
 */
190 #define shl_reg_4x32b(dest, source_a, source_b) \
193 s8 shift = (source_b).e[_i]; \
195 dest.e[_i] = (source_a).e[_i] >> (-shift); \
197 dest.e[_i] = (source_a).e[_i] << shift; \
200 #define shl_reg_2x32b(dest, source_a, source_b) \
203 s8 shift = (source_b).e[_i]; \
205 dest.e[_i] = (source_a).e[_i] >> (-shift); \
207 dest.e[_i] = (source_a).e[_i] << shift; \
210 #define shl_reg_2x64b(dest, source_a, source_b) \
213 s8 shift = (source_b).e[_i]; \
215 dest.e[_i] = (source_a).e[_i] >> (-shift); \
217 dest.e[_i] = (source_a).e[_i] << shift; \
/*
 * sri_8x8b: shift-right-and-insert (VSRI) — the shifted source bits are
 * inserted into dest, preserving dest's top `shift` bits.
 */
221 #define sri_8x8b(dest, source, shift) \
222 foreach_element(8, (dest).e[_i] = ((dest).e[_i] & ~(0xFF >> (shift))) | \
223 ((u8)(source).e[_i] >> (shift))) \
/*
 * sli_8x8b: shift-left-and-insert (VSLI) — preserves dest's bottom
 * `shift` bits.
 */
225 #define sli_8x8b(dest, source, shift) \
226 foreach_element(8, (dest).e[_i] = ((dest).e[_i] & ~(0xFF << (shift))) | \
227 ((source).e[_i] << (shift))) \
/*
 * mov_narrow_*: element-wise copy into a narrower vector (truncating);
 * mov_wide_*: element-wise copy into a wider vector (value-converting);
 * mvn: bitwise-NOT copy.
 */
231 #define mov_narrow_8x16b(dest, source) \
232 foreach_element(8, (dest).e[_i] = (source).e[_i]) \
234 #define mov_narrow_4x32b(dest, source) \
235 foreach_element(4, (dest).e[_i] = (source).e[_i]) \
237 #define mov_narrow_2x64b(dest, source) \
238 foreach_element(2, (dest).e[_i] = (source).e[_i]) \
240 #define mov_wide_8x8b(dest, source) \
241 foreach_element(8, (dest).e[_i] = (source).e[_i]) \
243 #define mov_wide_2x32b(dest, source) \
244 foreach_element(2, (dest).e[_i] = (source).e[_i]) \
246 #define mvn_4x16b(dest, source) \
247 foreach_element(4, (dest).e[_i] = ~((source).e[_i])) \
/*
 * Element-wise addition.  "_wide" variants add into a wider destination
 * (operands widen before the add).  Results wrap in the destination
 * element's width; see addq_* below for the saturating forms.
 */
249 #define add_4x16b(dest, source_a, source_b) \
250 foreach_element(4, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i]) \
252 #define add_4x32b(dest, source_a, source_b) \
253 foreach_element(4, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i]) \
255 #define add_2x32b(dest, source_a, source_b) \
256 foreach_element(2, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i]) \
258 #define add_8x16b(dest, source_a, source_b) \
259 foreach_element(8, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i]) \
261 #define add_16x8b(dest, source_a, source_b) \
262 foreach_element(16, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i]) \
264 #define add_8x8b(dest, source_a, source_b) \
265 foreach_element(8, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i]) \
267 #define add_1x64b(dest, source_a, source_b) \
268 (dest).e[0] = (source_a).e[0] + (source_b).e[0] \
270 #define add_2x64b(dest, source_a, source_b) \
271 foreach_element(2, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i]) \
/*
 * add_high_narrow_2x64b: dest.e[i] = high 32 bits of the 64-bit sum
 * source_a.e[i] + source_b.e[i] (add-high-narrow, VADDHN style; matches
 * add_high_narrow_4x32b below).
 * Fix: the ">> 32" now applies to the sum before it is stored.
 * Previously the shift was applied to the value of the assignment
 * expression and discarded, so dest received the truncated low bits of
 * the sum instead of the high half.
 */
#define add_high_narrow_2x64b(dest, source_a, source_b)                       \
  foreach_element(2,                                                          \
    (dest).e[_i] = ((source_a).e[_i] + (source_b).e[_i]) >> 32)               \
/*
 * add_high_narrow_4x32b: dest.e[i] = high 16 bits of the 32-bit sum of
 * the corresponding elements (add-high-narrow, VADDHN style).
 */
277 #define add_high_narrow_4x32b(dest, source_a, source_b) \
279 ((dest).e[_i] = ((source_a).e[_i] + (source_b).e[_i]) >> 16)) \
/*
 * Element-wise subtraction and the saturating add/sub forms.  addq_*/
 * subq_*/subs_* compute in a wider intermediate (u32/s32) and clamp the
 * result before the narrowing store; the clamp statements sit between
 * the visible lines of those macros.
 */
281 #define sub_4x16b(dest, source_a, source_b) \
282 foreach_element(4, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i]) \
284 #define sub_4x32b(dest, source_a, source_b) \
285 foreach_element(4, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i]) \
287 #define sub_2x32b(dest, source_a, source_b) \
288 foreach_element(2, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i]) \
290 #define sub_wide_8x8b(dest, source_a, source_b) \
291 foreach_element(8, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i]) \
293 #define add_wide_8x8b(dest, source_a, source_b) \
294 foreach_element(8, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i]) \
296 #define add_wide_2x32b(dest, source_a, source_b) \
297 foreach_element(2, (dest).e[_i] = (source_a).e[_i] + (source_b).e[_i]) \
/* Saturating unsigned add, 8-bit elements. */
299 #define addq_8x8b(dest, source_a, source_b) \
302 u32 result = (source_a).e[_i] + (source_b).e[_i]; \
305 (dest).e[_i] = result; \
/* Saturating unsigned subtract, 8-bit elements. */
308 #define subq_8x8b(dest, source_a, source_b) \
311 u32 result = (source_a).e[_i] - (source_b).e[_i]; \
314 (dest).e[_i] = result; \
/* Widening saturating subtract — delegates to subs_8x8b.
   NOTE(review): subs_8x8b itself is not visible in this chunk. */
317 #define subs_long_8x8b(dest, source_a, source_b) \
318 subs_8x8b(dest, source_a, source_b) \
320 #define subs_16x8b(dest, source_a, source_b) \
321 foreach_element(16, \
323 u32 result = (source_a).e[_i] - (source_b).e[_i]; \
326 (dest).e[_i] = result; \
/* Saturating signed subtract, 16-bit elements. */
329 #define subs_8x16b(dest, source_a, source_b) \
332 s32 result = (source_a).e[_i] - (source_b).e[_i]; \
336 (dest).e[_i] = result; \
339 #define sub_8x16b(dest, source_a, source_b) \
340 foreach_element(8, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i]) \
342 #define sub_16x8b(dest, source_a, source_b) \
343 foreach_element(16, (dest).e[_i] = (source_a).e[_i] - (source_b).e[_i]) \
/*
 * Element-wise bitwise operations: orn = a | ~b, and, bic = a & ~b
 * (bit clear), or, eor = xor.  "_immediate" variants fold a constant
 * into dest in place.
 */
345 #define orn_8x16b(dest, source_a, source_b) \
346 foreach_element(8, (dest).e[_i] = (source_a).e[_i] | ~((source_b).e[_i])) \
348 #define and_4x16b(dest, source_a, source_b) \
349 foreach_element(4, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i]) \
351 #define and_8x16b(dest, source_a, source_b) \
352 foreach_element(8, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i]) \
354 #define and_4x32b(dest, source_a, source_b) \
355 foreach_element(4, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i]) \
357 #define and_16x8b(dest, source_a, source_b) \
358 foreach_element(16, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i]) \
360 #define and_8x8b(dest, source_a, source_b) \
361 foreach_element(8, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i]) \
363 #define and_2x32b(dest, source_a, source_b) \
364 foreach_element(2, (dest).e[_i] = (source_a).e[_i] & (source_b).e[_i]) \
366 #define bic_8x8b(dest, source_a, source_b) \
367 foreach_element(8, (dest).e[_i] = (source_a).e[_i] & ~((source_b).e[_i])) \
369 #define bic_8x16b(dest, source_a, source_b) \
370 foreach_element(8, (dest).e[_i] = (source_a).e[_i] & ~((source_b).e[_i])) \
372 #define bic_immediate_4x16b(dest, value) \
373 foreach_element(4, (dest).e[_i] = (dest).e[_i] & ~(value)) \
375 #define bic_immediate_8x16b(dest, value) \
376 foreach_element(8, (dest).e[_i] = (dest).e[_i] & ~(value)) \
378 #define or_8x16b(dest, source_a, source_b) \
379 foreach_element(8, (dest).e[_i] = (source_a).e[_i] | (source_b).e[_i]) \
381 #define or_immediate_8x16b(dest, source_a, value) \
382 foreach_element(8, (dest).e[_i] = (source_a).e[_i] | (value)) \
384 #define eor_8x16b(dest, source_a, source_b) \
385 foreach_element(8, (dest).e[_i] = (source_a).e[_i] ^ (source_b).e[_i]) \
387 #define eor_4x32b(dest, source_a, source_b) \
388 foreach_element(4, (dest).e[_i] = (source_a).e[_i] ^ (source_b).e[_i]) \
390 #define eor_2x32b(dest, source_a, source_b) \
391 foreach_element(2, (dest).e[_i] = (source_a).e[_i] ^ (source_b).e[_i]) \
/*
 * zip_*: interleave two narrow vectors into one wide vector, source_a in
 * the low half of each element, source_b in the high half.
 * unzip_*: the inverse — split each wide element into its low byte
 * (dest_a) and high byte (dest_b).
 */
393 #define zip_8x16b(dest, source_a, source_b) \
394 foreach_element(8, (dest).e[_i] = \
395 (u8)(source_a).e[_i] | ((u8)(source_b).e[_i] << 8)) \
397 #define zip_2x64b(dest, source_a, source_b) \
398 foreach_element(2, (dest).e[_i] = \
399 (u64)(source_a).e[_i] | ((u64)(source_b).e[_i] << 32)) \
401 #define unzip_8x8b(dest_a, dest_b, source) \
404 (dest_a).e[_i] = (source).e[_i]; \
405 (dest_b).e[_i] = ((source).e[_i]) >> 8; \
/* Two-source form: source_a fills elements 0-7, source_b fills 8-15. */
408 #define unzip_16x8b(dest_a, dest_b, source_a, source_b) \
411 (dest_a).e[_i] = (source_a).e[_i]; \
412 (dest_b).e[_i] = (source_a).e[_i] >> 8; \
416 (dest_a).e[_i + 8] = (source_b).e[_i]; \
417 (dest_b).e[_i + 8] = (source_b).e[_i] >> 8; \
/*
 * tbl_16(dest, indexes, table): VTBL-style table lookup — each element of
 * `indexes` selects an element of `table` into dest.  NOTE(review): the
 * handling of out-of-range indexes sits between these lines and is not
 * visible here — confirm it matches VTBL's zero-on-out-of-range rule.
 * `indexes` and `table` are expanded unparenthesized; pass simple names.
 */
420 #define tbl_16(dest, indexes, table) \
423 u32 index = indexes.e[_i]; \
425 (dest).e[_i] = table.e[index]; \
/*
 * Element-wise compares producing all-ones (condition true) or all-zero
 * masks.  Two idioms: ~((cond) - 1) turns a 0/1 comparison result into
 * 0/~0; an arithmetic right shift by (width-1) smears the sign bit for
 * the less-than-zero tests.
 */
430 #define cmpeqz_8x16b(dest, source) \
431 foreach_element(8, (dest).e[_i] = ~(((source).e[_i] == 0) - 1)) \
433 #define cmpltz_8x16b(dest, source) \
434 foreach_element(8, (dest).e[_i] = ((s16)(source).e[_i] >> 15)) \
436 #define cmpltz_4x32b(dest, source) \
437 foreach_element(4, (dest).e[_i] = ((s32)(source).e[_i] >> 31)) \
439 #define cmpltz_2x32b(dest, source) \
440 foreach_element(2, (dest).e[_i] = ((s32)(source).e[_i] >> 31)) \
/* The ordered compares use the elements' own type, so signedness follows
   the vector type passed in. */
442 #define cmplte_4x16b(dest, source_a, source_b) \
443 foreach_element(4, (dest).e[_i] = ~((source_a.e[_i] <= source_b.e[_i]) - 1)) \
445 #define cmplt_4x16b(dest, source_a, source_b) \
446 foreach_element(4, (dest).e[_i] = ~((source_a.e[_i] < source_b.e[_i]) - 1)) \
448 #define cmpgt_4x16b(dest, source_a, source_b) \
449 foreach_element(4, (dest).e[_i] = ~((source_a.e[_i] > source_b.e[_i]) - 1)) \
/* tst: mask of whether (a & b) has any bit set (VTST). */
451 #define tst_8x16b(dest, source_a, source_b) \
453 (dest).e[_i] = ~(((source_a.e[_i] & source_b.e[_i]) != 0) - 1)) \
/* andi: AND each element with a scalar constant. */
455 #define andi_8x8b(dest, source_a, value) \
456 foreach_element(8, (dest).e[_i] = (source_a).e[_i] & value) \
/* average: (a + b) / 2, truncating (VHADD-style halving add). */
458 #define average_8x16b(dest, source_a, source_b) \
460 (dest).e[_i] = ((source_a).e[_i] + (source_b).e[_i]) >> 1) \
/*
 * Element-wise multiply family.  mul = a * b; "_long" = widening multiply
 * into a wider destination; "_scalar" = multiply every element by one
 * scalar `value`; mla/mls = multiply-accumulate/subtract into dest
 * (dest += / -= a * b).  NOTE(review): `value` is expanded
 * unparenthesized in the scalar forms — pass a simple expression.
 */
463 #define mul_8x8b(dest, source_a, source_b) \
464 foreach_element(8, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i]) \
466 #define mul_8x16b(dest, source_a, source_b) \
467 foreach_element(8, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i]) \
469 #define mul_2x32b(dest, source_a, source_b) \
470 foreach_element(2, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i]) \
472 #define mul_4x32b(dest, source_a, source_b) \
473 foreach_element(4, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i]) \
475 #define mul_long_8x8b(dest, source_a, source_b) \
476 foreach_element(8, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i]) \
478 #define mul_long_4x16b(dest, source_a, source_b) \
479 foreach_element(4, (dest).e[_i] = (source_a).e[_i] * (source_b).e[_i]) \
/* 32x32->64: the s64 cast forces a full 64-bit product. */
481 #define mul_long_2x32b(dest, source_a, source_b) \
483 (dest).e[_i] = (source_a).e[_i] * (s64)((source_b).e[_i])) \
485 #define mul_scalar_2x32b(dest, source, value) \
486 foreach_element(2, (dest).e[_i] = (source).e[_i] * value) \
488 #define mul_scalar_long_8x16b(dest, source, value) \
489 foreach_element(8, (dest).e[_i] = (source).e[_i] * value) \
491 #define mul_scalar_long_2x32b(dest, source, value) \
492 foreach_element(2, (dest).e[_i] = (source).e[_i] * value) \
494 #define mla_2x32b(dest, source_a, source_b) \
495 foreach_element(2, (dest).e[_i] += (source_a).e[_i] * (source_b).e[_i]) \
497 #define mla_4x32b(dest, source_a, source_b) \
498 foreach_element(4, (dest).e[_i] += (source_a).e[_i] * (source_b).e[_i]) \
500 #define mla_scalar_long_2x32b(dest, source, value) \
501 foreach_element(2, (dest).e[_i] += (source).e[_i] * value) \
503 #define mla_long_8x8b(dest, source_a, source_b) \
504 foreach_element(8, (dest).e[_i] += (source_a).e[_i] * (source_b).e[_i]) \
506 #define mla_long_2x32b(dest, source_a, source_b) \
507 foreach_element(2, (dest).e[_i] += (source_a).e[_i] * (s64)(source_b).e[_i]) \
509 #define mla_scalar_4x32b(dest, source, value) \
510 foreach_element(4, (dest).e[_i] += (source).e[_i] * value) \
512 #define mla_scalar_2x32b(dest, source, value) \
513 foreach_element(2, (dest).e[_i] += (source).e[_i] * value) \
515 #define mls_scalar_4x32b(dest, source, value) \
516 foreach_element(4, (dest).e[_i] -= (source).e[_i] * value) \
518 #define mls_scalar_2x32b(dest, source, value) \
519 foreach_element(2, (dest).e[_i] -= (source).e[_i] * value) \
521 #define mls_scalar_long_2x32b(dest, source, value) \
522 foreach_element(2, (dest).e[_i] -= (source).e[_i] * value) \
/*
 * rev_2x32b: swap the two 32-bit elements; goes through `tmp` so dest
 * may alias source.
 */
524 #define rev_2x32b(dest, source) \
526 u32 tmp = source.e[1]; \
527 (dest).e[1] = source.e[0]; \
/* abs_*: element-wise absolute value via abs() from <stdlib.h>. */
531 #define abs_4x32b(dest, source) \
532 foreach_element(4, (dest).e[_i] = abs(source.e[_i])) \
534 #define abs_2x32b(dest, source) \
535 foreach_element(2, (dest).e[_i] = abs(source.e[_i])) \
/* neg_2x32b: element-wise negation. */
537 #define neg_2x32b(dest, source) \
538 foreach_element(2, (dest).e[_i] = -((source).e[_i])) \
/*
 * shrq_narrow_8x16b: unsigned shift right, then saturate before the
 * narrowing store (the clamp sits between these two statements).
 * NOTE(review): `shift` is expanded unparenthesized — pass a simple
 * expression.
 */
541 #define shrq_narrow_8x16b(dest, source, shift) \
544 u32 result = ((source).e[_i]) >> shift; \
547 (dest).e[_i] = result; \
/* min/max: element-wise minimum/maximum; the s32/u32 intermediate sets
   the comparison's signedness. */
550 #define min_8x16b(dest, source_a, source_b) \
553 s32 result = (source_a).e[_i]; \
554 if((source_b).e[_i] < result) \
555 result = (source_b).e[_i]; \
556 (dest).e[_i] = result; \
559 #define min_8x8b(dest, source_a, source_b) \
562 u32 result = (source_a).e[_i]; \
563 if((source_b).e[_i] < result) \
564 result = (source_b).e[_i]; \
565 (dest).e[_i] = result; \
568 #define min_16x8b(dest, source_a, source_b) \
569 foreach_element(16, \
571 u32 result = (source_a).e[_i]; \
572 if((source_b).e[_i] < result) \
573 result = (source_b).e[_i]; \
574 (dest).e[_i] = result; \
577 #define max_8x16b(dest, source_a, source_b) \
580 s32 result = (source_a).e[_i]; \
581 if((source_b).e[_i] > result) \
582 result = (source_b).e[_i]; \
583 (dest).e[_i] = result; \
/* bsl (bitwise select): dest_mask selects source_a bits where the mask
   is 1, source_b bits where it is 0; the mask is overwritten in place. */
586 #define bsl_8x16b(dest_mask, source_a, source_b) \
587 foreach_element(8, dest_mask.e[_i] = ((source_a).e[_i] & dest_mask.e[_i]) | \
588 ((source_b).e[_i] & ~(dest_mask.e[_i]))) \
/* bif (bitwise insert if false): copy source bits into dest where mask
   bits are 0, keep dest bits where mask bits are 1. */
590 #define bif_8x16b(dest, source, mask) \
591 foreach_element(8, dest.e[_i] = ((source).e[_i] & ~(mask.e[_i])) | \
592 ((dest).e[_i] & mask.e[_i])) \
594 #define bsl_4x32b(dest_mask, source_a, source_b) \
595 foreach_element(4, dest_mask.e[_i] = ((source_a).e[_i] & dest_mask.e[_i]) | \
596 ((source_b).e[_i] & ~(dest_mask.e[_i]))) \
/* bit (bitwise insert if true): copy source bits into dest where mask
   bits are 1, keep dest bits where mask bits are 0. */
598 #define bit_4x16b(dest, source, mask) \
599 foreach_element(4, dest.e[_i] = ((source).e[_i] & mask.e[_i]) | \
600 ((dest).e[_i] & ~(mask.e[_i]))) \