/* Copyright (C) 2010-2020 The RetroArch team
 *
 * ---------------------------------------------------------------------------------------
 * The following license statement only applies to this file (pixconv.c).
 * ---------------------------------------------------------------------------------------
 *
 * Permission is hereby granted, free of charge,
 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
22
23#include <stdio.h>
24#include <stdint.h>
25#include <stdlib.h>
26#include <string.h>
27
28#include <retro_inline.h>
29
30#include <gfx/scaler/pixconv.h>
31
32#if _MSC_VER && _MSC_VER <= 1800
33#define SCALER_NO_SIMD
34#endif
35
36#ifdef SCALER_NO_SIMD
37#undef __SSE2__
38#endif
39
40#if defined(__SSE2__)
41#include <emmintrin.h>
42#elif defined(__MMX__)
43#include <mmintrin.h>
44#elif (defined(__ARM_NEON__) || defined(__ARM_NEON))
45#include <arm_neon.h>
46#endif
47
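/* Every converter below follows the same pattern: strides are given in
 * bytes, so each row advance shifts the pointer by the element size
 * (>> 1 for 16-bit pixels, >> 2 for 32-bit pixels).  Where SIMD is
 * available, a vector loop handles groups of pixels and a scalar loop
 * finishes the remainder, so width need not be a multiple of the
 * vector size. */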
void conv_rgb565_0rgb1555(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input = (const uint16_t*)input_;
   uint16_t *output = (uint16_t*)output_;

#if defined(__SSE2__)
   int max_width = width - 7;
   const __m128i hi_mask = _mm_set1_epi16(0x7fe0);
   const __m128i lo_mask = _mm_set1_epi16(0x1f);
#endif

   for (h = 0; h < height;
         h++, output += out_stride >> 1, input += in_stride >> 1)
   {
      int w = 0;
#if defined(__SSE2__)
      for (; w < max_width; w += 8)
      {
         const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
         __m128i hi = _mm_and_si128(_mm_slli_epi16(in, 1), hi_mask);
         __m128i lo = _mm_and_si128(in, lo_mask);
         _mm_storeu_si128((__m128i*)(output + w), _mm_or_si128(hi, lo));
      }
#endif

      for (; w < width; w++)
      {
         uint16_t col = input[w];
         uint16_t hi = (col >> 1) & 0x7fe0;
         uint16_t lo = col & 0x1f;
         output[w] = hi | lo;
      }
   }
}

void conv_0rgb1555_rgb565(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input = (const uint16_t*)input_;
   uint16_t *output = (uint16_t*)output_;

#if defined(__SSE2__)
   int max_width = width - 7;

   const __m128i hi_mask = _mm_set1_epi16(
         (int16_t)((0x1f << 11) | (0x1f << 6)));
   const __m128i lo_mask = _mm_set1_epi16(0x1f);
   const __m128i glow_mask = _mm_set1_epi16(1 << 5);
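   /* Widening 5-bit green to 6 bits: the "glow" bit copies the old top
    * green bit (bit 9) into the new low green bit (bit 5), so 0x1f maps
    * to 0x3f rather than 0x3e. */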
#endif

   for (h = 0; h < height;
         h++, output += out_stride >> 1, input += in_stride >> 1)
   {
      int w = 0;
#if defined(__SSE2__)
      for (; w < max_width; w += 8)
      {
         const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
         __m128i rg = _mm_and_si128(_mm_slli_epi16(in, 1), hi_mask);
         __m128i b = _mm_and_si128(in, lo_mask);
         __m128i glow = _mm_and_si128(_mm_srli_epi16(in, 4), glow_mask);
         _mm_storeu_si128((__m128i*)(output + w),
               _mm_or_si128(rg, _mm_or_si128(b, glow)));
      }
#endif

      for (; w < width; w++)
      {
         uint16_t col = input[w];
         uint16_t rg = (col << 1) & ((0x1f << 11) | (0x1f << 6));
         uint16_t b = col & 0x1f;
         uint16_t glow = (col >> 4) & (1 << 5);
         output[w] = rg | b | glow;
      }
   }
}

void conv_0rgb1555_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input = (const uint16_t*)input_;
   uint32_t *output = (uint32_t*)output_;

#ifdef __SSE2__
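   /* 5-bit to 8-bit expansion, i.e. (c << 3) | (c >> 2), done with one
    * multiply-high per channel: each channel is masked in place, then
    * _mm_mulhi_epi16 against a constant chosen so the high 16 bits of
    * the product equal the replicated 8-bit value.  E.g. red sits at
    * bits 10-14, and ((r5 << 10) * 0x0210) >> 16 == (r5 << 3) | (r5 >> 2). */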
   const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
   const __m128i pix_mask_gb = _mm_set1_epi16(0x1f << 5);
   const __m128i mul15_mid = _mm_set1_epi16(0x4200);
   const __m128i mul15_hi = _mm_set1_epi16(0x0210);
   const __m128i a = _mm_set1_epi16(0x00ff);

   int max_width = width - 7;
#endif

   for (h = 0; h < height;
         h++, output += out_stride >> 2, input += in_stride >> 1)
   {
      int w = 0;
#ifdef __SSE2__
      for (; w < max_width; w += 8)
      {
         __m128i res_lo_bg, res_hi_bg;
         __m128i res_lo_ra, res_hi_ra;
         __m128i res_lo, res_hi;
         const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
         __m128i r = _mm_and_si128(in, pix_mask_r);
         __m128i g = _mm_and_si128(in, pix_mask_gb);
         __m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_gb);

         r = _mm_mulhi_epi16(r, mul15_hi);
         g = _mm_mulhi_epi16(g, mul15_mid);
         b = _mm_mulhi_epi16(b, mul15_mid);

         res_lo_bg = _mm_unpacklo_epi8(b, g);
         res_hi_bg = _mm_unpackhi_epi8(b, g);
         res_lo_ra = _mm_unpacklo_epi8(r, a);
         res_hi_ra = _mm_unpackhi_epi8(r, a);

         res_lo = _mm_or_si128(res_lo_bg,
               _mm_slli_si128(res_lo_ra, 2));
         res_hi = _mm_or_si128(res_hi_bg,
               _mm_slli_si128(res_hi_ra, 2));

         _mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
         _mm_storeu_si128((__m128i*)(output + w + 4), res_hi);
      }
#endif

      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r = (col >> 10) & 0x1f;
         uint32_t g = (col >> 5) & 0x1f;
         uint32_t b = (col >> 0) & 0x1f;
         r = (r << 3) | (r >> 2);
         g = (g << 3) | (g >> 2);
         b = (b << 3) | (b >> 2);

         output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
      }
   }
}

void conv_rgb565_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input = (const uint16_t*)input_;
   uint32_t *output = (uint32_t*)output_;

#if defined(__SSE2__)
   const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
   const __m128i pix_mask_g = _mm_set1_epi16(0x3f << 5);
   const __m128i pix_mask_b = _mm_set1_epi16(0x1f << 5);
   const __m128i mul16_r = _mm_set1_epi16(0x0210);
   const __m128i mul16_g = _mm_set1_epi16(0x2080);
   const __m128i mul16_b = _mm_set1_epi16(0x4200);
   const __m128i a = _mm_set1_epi16(0x00ff);

   int max_width = width - 7;
#elif defined(__MMX__)
   const __m64 pix_mask_r = _mm_set1_pi16(0x1f << 10);
   const __m64 pix_mask_g = _mm_set1_pi16(0x3f << 5);
   const __m64 pix_mask_b = _mm_set1_pi16(0x1f << 5);
   const __m64 mul16_r = _mm_set1_pi16(0x0210);
   const __m64 mul16_g = _mm_set1_pi16(0x2080);
   const __m64 mul16_b = _mm_set1_pi16(0x4200);
   const __m64 a = _mm_set1_pi16(0x00ff);

   int max_width = width - 3;
#elif (defined(__ARM_NEON__) || defined(__ARM_NEON))
   int max_width = width - 7;
#endif

   for (h = 0; h < height;
         h++, output += out_stride >> 2, input += in_stride >> 1)
   {
      int w = 0;
#if defined(__SSE2__)
      for (; w < max_width; w += 8)
      {
         __m128i res_lo, res_hi;
         __m128i res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
         const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
         __m128i r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r);
         __m128i g = _mm_and_si128(in, pix_mask_g);
         __m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b);

         r = _mm_mulhi_epi16(r, mul16_r);
         g = _mm_mulhi_epi16(g, mul16_g);
         b = _mm_mulhi_epi16(b, mul16_b);

         res_lo_bg = _mm_unpacklo_epi8(b, g);
         res_hi_bg = _mm_unpackhi_epi8(b, g);
         res_lo_ra = _mm_unpacklo_epi8(r, a);
         res_hi_ra = _mm_unpackhi_epi8(r, a);

         res_lo = _mm_or_si128(res_lo_bg,
               _mm_slli_si128(res_lo_ra, 2));
         res_hi = _mm_or_si128(res_hi_bg,
               _mm_slli_si128(res_hi_ra, 2));

         _mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
         _mm_storeu_si128((__m128i*)(output + w + 4), res_hi);
      }
#elif defined(__MMX__)
      for (; w < max_width; w += 4)
      {
         __m64 res_lo, res_hi;
         __m64 res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
         const __m64 in = *((__m64*)(input + w));
         __m64 r = _mm_and_si64(_mm_srli_pi16(in, 1), pix_mask_r);
         __m64 g = _mm_and_si64(in, pix_mask_g);
         __m64 b = _mm_and_si64(_mm_slli_pi16(in, 5), pix_mask_b);

         r = _mm_mulhi_pi16(r, mul16_r);
         g = _mm_mulhi_pi16(g, mul16_g);
         b = _mm_mulhi_pi16(b, mul16_b);

         res_lo_bg = _mm_unpacklo_pi8(b, g);
         res_hi_bg = _mm_unpackhi_pi8(b, g);
         res_lo_ra = _mm_unpacklo_pi8(r, a);
         res_hi_ra = _mm_unpackhi_pi8(r, a);

         res_lo = _mm_or_si64(res_lo_bg,
               _mm_slli_si64(res_lo_ra, 16));
         res_hi = _mm_or_si64(res_hi_bg,
               _mm_slli_si64(res_hi_ra, 16));

         *((__m64*)(output + w + 0)) = res_lo;
         *((__m64*)(output + w + 2)) = res_hi;
      }

      _mm_empty();
#elif (defined(__ARM_NEON__) || defined(__ARM_NEON))
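      /* NEON: the vsri/vsli shift-and-insert ops replicate each
       * channel's top bits directly below itself, so one narrowing
       * shift then yields the 8-bit value; e.g. the top byte of r
       * becomes (r5 << 3) | (r5 >> 2). */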
      for (; w < max_width; w += 8)
      {
         uint16x8_t in = vld1q_u16(input + w);

         uint16x8_t r = vsriq_n_u16(in, in, 5);
         uint16x8_t b = vsliq_n_u16(in, in, 5);
         uint16x8_t g = vsriq_n_u16(b, b, 6);

         uint8x8x4_t res;
         res.val[3] = vdup_n_u8(0xffu);
         res.val[2] = vshrn_n_u16(r, 8);
         res.val[1] = vshrn_n_u16(g, 8);
         res.val[0] = vshrn_n_u16(b, 2);

         vst4_u8((uint8_t*)(output + w), res);
      }
#endif

      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r = (col >> 11) & 0x1f;
         uint32_t g = (col >> 5) & 0x3f;
         uint32_t b = (col >> 0) & 0x1f;
         r = (r << 3) | (r >> 2);
         g = (g << 2) | (g >> 4);
         b = (b << 3) | (b >> 2);

         output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
      }
   }
}

void conv_rgb565_abgr8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input = (const uint16_t*)input_;
   uint32_t *output = (uint32_t*)output_;

#if defined(__SSE2__)
   const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
   const __m128i pix_mask_g = _mm_set1_epi16(0x3f << 5);
   const __m128i pix_mask_b = _mm_set1_epi16(0x1f << 5);
   const __m128i mul16_r = _mm_set1_epi16(0x0210);
   const __m128i mul16_g = _mm_set1_epi16(0x2080);
   const __m128i mul16_b = _mm_set1_epi16(0x4200);
   const __m128i a = _mm_set1_epi16(0x00ff);

   int max_width = width - 7;
#elif (defined(__ARM_NEON__) || defined(__ARM_NEON))
   int max_width = width - 7;
#endif

   for (h = 0; h < height;
         h++, output += out_stride >> 2, input += in_stride >> 1)
   {
      int w = 0;
#if defined(__SSE2__)
      for (; w < max_width; w += 8)
      {
         __m128i res_lo, res_hi;
         __m128i res_lo_rg, res_hi_rg, res_lo_ba, res_hi_ba;
         const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
         __m128i r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r);
         __m128i g = _mm_and_si128(in, pix_mask_g);
         __m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b);
         r = _mm_mulhi_epi16(r, mul16_r);
         g = _mm_mulhi_epi16(g, mul16_g);
         b = _mm_mulhi_epi16(b, mul16_b);
         /* Interleave as R, G, B, A bytes (ABGR8888 in little-endian
          * memory); the ARGB variant above interleaves B, G, R, A.
          * The previous revision reused the B, G, R, A order here,
          * which contradicted this function's scalar tail. */
         res_lo_rg = _mm_unpacklo_epi8(r, g);
         res_hi_rg = _mm_unpackhi_epi8(r, g);
         res_lo_ba = _mm_unpacklo_epi8(b, a);
         res_hi_ba = _mm_unpackhi_epi8(b, a);
         res_lo = _mm_or_si128(res_lo_rg,
               _mm_slli_si128(res_lo_ba, 2));
         res_hi = _mm_or_si128(res_hi_rg,
               _mm_slli_si128(res_hi_ba, 2));
         _mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
         _mm_storeu_si128((__m128i*)(output + w + 4), res_hi);
      }
#elif (defined(__ARM_NEON__) || defined(__ARM_NEON))
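      /* Same NEON trick as conv_rgb565_argb8888, but with the narrowed
       * red and blue lanes swapped so the interleaved store emits
       * R, G, B, A bytes. */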
      for (; w < max_width; w += 8)
      {
         uint16x8_t in = vld1q_u16(input + w);

         uint16x8_t r = vsriq_n_u16(in, in, 5);
         uint16x8_t b = vsliq_n_u16(in, in, 5);
         uint16x8_t g = vsriq_n_u16(b, b, 6);

         uint8x8x4_t res;
         res.val[3] = vdup_n_u8(0xffu);
         res.val[2] = vshrn_n_u16(b, 2);
         res.val[1] = vshrn_n_u16(g, 8);
         res.val[0] = vshrn_n_u16(r, 8);

         vst4_u8((uint8_t*)(output + w), res);
      }
#endif

      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r = (col >> 11) & 0x1f;
         uint32_t g = (col >> 5) & 0x3f;
         uint32_t b = (col >> 0) & 0x1f;
         r = (r << 3) | (r >> 2);
         g = (g << 2) | (g >> 4);
         b = (b << 3) | (b >> 2);
         output[w] = (0xffu << 24) | (b << 16) | (g << 8) | (r << 0);
      }
   }
}

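/* 8-bit to 4-bit packing keeps the top nibble of each channel;
 * RGBA4444 stores R in bits 12-15 down through A in bits 0-3. */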
void conv_argb8888_rgba4444(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h, w;
   const uint32_t *input = (const uint32_t*)input_;
   uint16_t *output = (uint16_t*)output_;

   for (h = 0; h < height;
         h++, output += out_stride >> 1, input += in_stride >> 2)
   {
      for (w = 0; w < width; w++)
      {
         uint32_t col = input[w];
         /* Keep the top nibble of each 8-bit channel. */
         uint32_t r = (col >> 20) & 0xf;
         uint32_t g = (col >> 12) & 0xf;
         uint32_t b = (col >> 4) & 0xf;
         uint32_t a = (col >> 28) & 0xf;

         output[w] = (r << 12) | (g << 8) | (b << 4) | a;
      }
   }
}

void conv_rgba4444_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input = (const uint16_t*)input_;
   uint32_t *output = (uint32_t*)output_;

#if defined(__MMX__)
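   /* Same multiply-high expansion as the 15/16-bpp converters above: a
    * 4-bit channel c widens to (c << 4) | c, i.e. c * 17, via constants
    * such as 0x0440, since ((c << 10) * 0x0440) >> 16 == c * 17. */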
   const __m64 pix_mask_r = _mm_set1_pi16(0xf << 10);
   const __m64 pix_mask_g = _mm_set1_pi16(0xf << 8);
   const __m64 pix_mask_b = _mm_set1_pi16(0xf << 8);
   const __m64 mul16_r = _mm_set1_pi16(0x0440);
   const __m64 mul16_g = _mm_set1_pi16(0x1100);
   const __m64 mul16_b = _mm_set1_pi16(0x1100);
   const __m64 a = _mm_set1_pi16(0x00ff);

   int max_width = width - 3;
#endif

   for (h = 0; h < height;
         h++, output += out_stride >> 2, input += in_stride >> 1)
   {
      int w = 0;
#if defined(__MMX__)
      for (; w < max_width; w += 4)
      {
         __m64 res_lo, res_hi;
         __m64 res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
         const __m64 in = *((__m64*)(input + w));
         __m64 r = _mm_and_si64(_mm_srli_pi16(in, 2), pix_mask_r);
         __m64 g = _mm_and_si64(in, pix_mask_g);
         __m64 b = _mm_and_si64(_mm_slli_pi16(in, 4), pix_mask_b);

         r = _mm_mulhi_pi16(r, mul16_r);
         g = _mm_mulhi_pi16(g, mul16_g);
         b = _mm_mulhi_pi16(b, mul16_b);

         res_lo_bg = _mm_unpacklo_pi8(b, g);
         res_hi_bg = _mm_unpackhi_pi8(b, g);
         res_lo_ra = _mm_unpacklo_pi8(r, a);
         res_hi_ra = _mm_unpackhi_pi8(r, a);

         res_lo = _mm_or_si64(res_lo_bg,
               _mm_slli_si64(res_lo_ra, 16));
         res_hi = _mm_or_si64(res_hi_bg,
               _mm_slli_si64(res_hi_ra, 16));

         *((__m64*)(output + w + 0)) = res_lo;
         *((__m64*)(output + w + 2)) = res_hi;
      }

      _mm_empty();
#endif

      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r = (col >> 12) & 0xf;
         uint32_t g = (col >> 8) & 0xf;
         uint32_t b = (col >> 4) & 0xf;
         uint32_t a = (col >> 0) & 0xf;
         r = (r << 4) | r;
         g = (g << 4) | g;
         b = (b << 4) | b;
         a = (a << 4) | a;

         output[w] = (a << 24) | (r << 16) | (g << 8) | (b << 0);
      }
   }
}

void conv_rgba4444_rgb565(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h, w;
   const uint16_t *input = (const uint16_t*)input_;
   uint16_t *output = (uint16_t*)output_;

   for (h = 0; h < height;
         h++, output += out_stride >> 1, input += in_stride >> 1)
   {
      for (w = 0; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r = (col >> 12) & 0xf;
         uint32_t g = (col >> 8) & 0xf;
         uint32_t b = (col >> 4) & 0xf;

         output[w] = (r << 12) | (g << 7) | (b << 1);
      }
   }
}

#if defined(__SSE2__)
/* :( TODO: Make this saner. */
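/* Packs four XRGB8888 vectors (16 pixels) into 48 bytes of BGR24: the
 * low three bytes of each 32-bit pixel are isolated with a mask,
 * shifted to their packed byte position, and OR-ed into three output
 * vectors written with unaligned stores. */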
static INLINE void store_bgr24_sse2(void *output, __m128i a,
      __m128i b, __m128i c, __m128i d)
{
   const __m128i mask_0 = _mm_set_epi32(0, 0, 0, 0x00ffffff);
   const __m128i mask_1 = _mm_set_epi32(0, 0, 0x00ffffff, 0);
   const __m128i mask_2 = _mm_set_epi32(0, 0x00ffffff, 0, 0);
   const __m128i mask_3 = _mm_set_epi32(0x00ffffff, 0, 0, 0);

   __m128i a0 = _mm_and_si128(a, mask_0);
   __m128i a1 = _mm_srli_si128(_mm_and_si128(a, mask_1), 1);
   __m128i a2 = _mm_srli_si128(_mm_and_si128(a, mask_2), 2);
   __m128i a3 = _mm_srli_si128(_mm_and_si128(a, mask_3), 3);
   __m128i a4 = _mm_slli_si128(_mm_and_si128(b, mask_0), 12);
   __m128i a5 = _mm_slli_si128(_mm_and_si128(b, mask_1), 11);

   __m128i b0 = _mm_srli_si128(_mm_and_si128(b, mask_1), 5);
   __m128i b1 = _mm_srli_si128(_mm_and_si128(b, mask_2), 6);
   __m128i b2 = _mm_srli_si128(_mm_and_si128(b, mask_3), 7);
   __m128i b3 = _mm_slli_si128(_mm_and_si128(c, mask_0), 8);
   __m128i b4 = _mm_slli_si128(_mm_and_si128(c, mask_1), 7);
   __m128i b5 = _mm_slli_si128(_mm_and_si128(c, mask_2), 6);

   __m128i c0 = _mm_srli_si128(_mm_and_si128(c, mask_2), 10);
   __m128i c1 = _mm_srli_si128(_mm_and_si128(c, mask_3), 11);
   __m128i c2 = _mm_slli_si128(_mm_and_si128(d, mask_0), 4);
   __m128i c3 = _mm_slli_si128(_mm_and_si128(d, mask_1), 3);
   __m128i c4 = _mm_slli_si128(_mm_and_si128(d, mask_2), 2);
   __m128i c5 = _mm_slli_si128(_mm_and_si128(d, mask_3), 1);

   __m128i *out = (__m128i*)output;

   _mm_storeu_si128(out + 0,
         _mm_or_si128(a0, _mm_or_si128(a1, _mm_or_si128(a2,
               _mm_or_si128(a3, _mm_or_si128(a4, a5))))));

   _mm_storeu_si128(out + 1,
         _mm_or_si128(b0, _mm_or_si128(b1, _mm_or_si128(b2,
               _mm_or_si128(b3, _mm_or_si128(b4, b5))))));

   _mm_storeu_si128(out + 2,
         _mm_or_si128(c0, _mm_or_si128(c1, _mm_or_si128(c2,
               _mm_or_si128(c3, _mm_or_si128(c4, c5))))));
}
#endif

void conv_0rgb1555_bgr24(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input = (const uint16_t*)input_;
   uint8_t *output = (uint8_t*)output_;

#if defined(__SSE2__)
   const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
   const __m128i pix_mask_gb = _mm_set1_epi16(0x1f << 5);
   const __m128i mul15_mid = _mm_set1_epi16(0x4200);
   const __m128i mul15_hi = _mm_set1_epi16(0x0210);
   const __m128i a = _mm_set1_epi16(0x00ff);

   int max_width = width - 15;
#endif

   for (h = 0; h < height;
         h++, output += out_stride, input += in_stride >> 1)
   {
      uint8_t *out = output;
      int w = 0;

#if defined(__SSE2__)
      for (; w < max_width; w += 16, out += 48)
      {
         __m128i res_lo_bg0, res_lo_bg1, res_hi_bg0, res_hi_bg1,
                 res_lo_ra0, res_lo_ra1, res_hi_ra0, res_hi_ra1,
                 res_lo0, res_lo1, res_hi0, res_hi1;
         const __m128i in0 = _mm_loadu_si128((const __m128i*)(input + w + 0));
         const __m128i in1 = _mm_loadu_si128((const __m128i*)(input + w + 8));
         __m128i r0 = _mm_and_si128(in0, pix_mask_r);
         __m128i r1 = _mm_and_si128(in1, pix_mask_r);
         __m128i g0 = _mm_and_si128(in0, pix_mask_gb);
         __m128i g1 = _mm_and_si128(in1, pix_mask_gb);
         __m128i b0 = _mm_and_si128(_mm_slli_epi16(in0, 5), pix_mask_gb);
         __m128i b1 = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_gb);

         r0 = _mm_mulhi_epi16(r0, mul15_hi);
         r1 = _mm_mulhi_epi16(r1, mul15_hi);
         g0 = _mm_mulhi_epi16(g0, mul15_mid);
         g1 = _mm_mulhi_epi16(g1, mul15_mid);
         b0 = _mm_mulhi_epi16(b0, mul15_mid);
         b1 = _mm_mulhi_epi16(b1, mul15_mid);

         res_lo_bg0 = _mm_unpacklo_epi8(b0, g0);
         res_lo_bg1 = _mm_unpacklo_epi8(b1, g1);
         res_hi_bg0 = _mm_unpackhi_epi8(b0, g0);
         res_hi_bg1 = _mm_unpackhi_epi8(b1, g1);
         res_lo_ra0 = _mm_unpacklo_epi8(r0, a);
         res_lo_ra1 = _mm_unpacklo_epi8(r1, a);
         res_hi_ra0 = _mm_unpackhi_epi8(r0, a);
         res_hi_ra1 = _mm_unpackhi_epi8(r1, a);

         res_lo0 = _mm_or_si128(res_lo_bg0,
               _mm_slli_si128(res_lo_ra0, 2));
         res_lo1 = _mm_or_si128(res_lo_bg1,
               _mm_slli_si128(res_lo_ra1, 2));
         res_hi0 = _mm_or_si128(res_hi_bg0,
               _mm_slli_si128(res_hi_ra0, 2));
         res_hi1 = _mm_or_si128(res_hi_bg1,
               _mm_slli_si128(res_hi_ra1, 2));

         /* Non-POT pixel sizes for the loss */
         store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
      }
#endif

      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t b = (col >> 0) & 0x1f;
         uint32_t g = (col >> 5) & 0x1f;
         uint32_t r = (col >> 10) & 0x1f;
         b = (b << 3) | (b >> 2);
         g = (g << 3) | (g >> 2);
         r = (r << 3) | (r >> 2);

         *out++ = b;
         *out++ = g;
         *out++ = r;
      }
   }
}

void conv_rgb565_bgr24(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input = (const uint16_t*)input_;
   uint8_t *output = (uint8_t*)output_;

#if defined(__SSE2__)
   const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
   const __m128i pix_mask_g = _mm_set1_epi16(0x3f << 5);
   const __m128i pix_mask_b = _mm_set1_epi16(0x1f << 5);
   const __m128i mul16_r = _mm_set1_epi16(0x0210);
   const __m128i mul16_g = _mm_set1_epi16(0x2080);
   const __m128i mul16_b = _mm_set1_epi16(0x4200);
   const __m128i a = _mm_set1_epi16(0x00ff);

   int max_width = width - 15;
#endif

   for (h = 0; h < height; h++, output += out_stride, input += in_stride >> 1)
   {
      uint8_t *out = output;
      int w = 0;
#if defined(__SSE2__)
      for (; w < max_width; w += 16, out += 48)
      {
         __m128i res_lo_bg0, res_hi_bg0, res_lo_ra0, res_hi_ra0;
         __m128i res_lo_bg1, res_hi_bg1, res_lo_ra1, res_hi_ra1;
         __m128i res_lo0, res_hi0, res_lo1, res_hi1;
         const __m128i in0 = _mm_loadu_si128((const __m128i*)(input + w));
         const __m128i in1 = _mm_loadu_si128((const __m128i*)(input + w + 8));
         __m128i r0 = _mm_and_si128(_mm_srli_epi16(in0, 1), pix_mask_r);
         __m128i g0 = _mm_and_si128(in0, pix_mask_g);
         __m128i b0 = _mm_and_si128(_mm_slli_epi16(in0, 5), pix_mask_b);
         __m128i r1 = _mm_and_si128(_mm_srli_epi16(in1, 1), pix_mask_r);
         __m128i g1 = _mm_and_si128(in1, pix_mask_g);
         __m128i b1 = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_b);

         r0 = _mm_mulhi_epi16(r0, mul16_r);
         g0 = _mm_mulhi_epi16(g0, mul16_g);
         b0 = _mm_mulhi_epi16(b0, mul16_b);
         r1 = _mm_mulhi_epi16(r1, mul16_r);
         g1 = _mm_mulhi_epi16(g1, mul16_g);
         b1 = _mm_mulhi_epi16(b1, mul16_b);

         res_lo_bg0 = _mm_unpacklo_epi8(b0, g0);
         res_hi_bg0 = _mm_unpackhi_epi8(b0, g0);
         res_lo_ra0 = _mm_unpacklo_epi8(r0, a);
         res_hi_ra0 = _mm_unpackhi_epi8(r0, a);
         res_lo_bg1 = _mm_unpacklo_epi8(b1, g1);
         res_hi_bg1 = _mm_unpackhi_epi8(b1, g1);
         res_lo_ra1 = _mm_unpacklo_epi8(r1, a);
         res_hi_ra1 = _mm_unpackhi_epi8(r1, a);

         res_lo0 = _mm_or_si128(res_lo_bg0,
               _mm_slli_si128(res_lo_ra0, 2));
         res_hi0 = _mm_or_si128(res_hi_bg0,
               _mm_slli_si128(res_hi_ra0, 2));
         res_lo1 = _mm_or_si128(res_lo_bg1,
               _mm_slli_si128(res_lo_ra1, 2));
         res_hi1 = _mm_or_si128(res_hi_bg1,
               _mm_slli_si128(res_hi_ra1, 2));

         store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
      }
#endif

      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r = (col >> 11) & 0x1f;
         uint32_t g = (col >> 5) & 0x3f;
         uint32_t b = (col >> 0) & 0x1f;
         r = (r << 3) | (r >> 2);
         g = (g << 2) | (g >> 4);
         b = (b << 3) | (b >> 2);

         *out++ = b;
         *out++ = g;
         *out++ = r;
      }
   }
}

void conv_bgr24_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h, w;
   const uint8_t *input = (const uint8_t*)input_;
   uint32_t *output = (uint32_t*)output_;

   for (h = 0; h < height;
         h++, output += out_stride >> 2, input += in_stride)
   {
      const uint8_t *inp = input;
      for (w = 0; w < width; w++)
      {
         uint32_t b = *inp++;
         uint32_t g = *inp++;
         uint32_t r = *inp++;
         output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
      }
   }
}

void conv_bgr24_rgb565(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h, w;
   const uint8_t *input = (const uint8_t*)input_;
   uint16_t *output = (uint16_t*)output_;

   /* out_stride is in bytes, like every other stride in this file,
    * so halve it for the 16-bit output pointer. */
   for (h = 0; h < height;
         h++, output += out_stride >> 1, input += in_stride)
   {
      const uint8_t *inp = input;
      for (w = 0; w < width; w++)
      {
         uint16_t b = *inp++;
         uint16_t g = *inp++;
         uint16_t r = *inp++;

         output[w] = ((r & 0x00F8) << 8) | ((g & 0x00FC) << 3) | ((b & 0x00F8) >> 3);
      }
   }
}

void conv_argb8888_0rgb1555(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h, w;
   const uint32_t *input = (const uint32_t*)input_;
   uint16_t *output = (uint16_t*)output_;

   for (h = 0; h < height;
         h++, output += out_stride >> 1, input += in_stride >> 2)
   {
      for (w = 0; w < width; w++)
      {
         uint32_t col = input[w];
         uint16_t r = (col >> 19) & 0x1f;
         uint16_t g = (col >> 11) & 0x1f;
         uint16_t b = (col >> 3) & 0x1f;
         output[w] = (r << 10) | (g << 5) | (b << 0);
      }
   }
}

void conv_argb8888_bgr24(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint32_t *input = (const uint32_t*)input_;
   uint8_t *output = (uint8_t*)output_;

#if defined(__SSE2__)
   int max_width = width - 15;
#endif

   for (h = 0; h < height;
         h++, output += out_stride, input += in_stride >> 2)
   {
      uint8_t *out = output;
      int w = 0;
#if defined(__SSE2__)
      for (; w < max_width; w += 16, out += 48)
      {
         __m128i l0 = _mm_loadu_si128((const __m128i*)(input + w + 0));
         __m128i l1 = _mm_loadu_si128((const __m128i*)(input + w + 4));
         __m128i l2 = _mm_loadu_si128((const __m128i*)(input + w + 8));
         __m128i l3 = _mm_loadu_si128((const __m128i*)(input + w + 12));
         store_bgr24_sse2(out, l0, l1, l2, l3);
      }
#endif

      for (; w < width; w++)
      {
         uint32_t col = input[w];
         *out++ = (uint8_t)(col >> 0);
         *out++ = (uint8_t)(col >> 8);
         *out++ = (uint8_t)(col >> 16);
      }
   }
}

#if defined(__SSE2__)
static INLINE __m128i conv_shuffle_rb_epi32(__m128i c)
{
   /* SSSE3 plz */
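   /* Swap the R and B bytes of each 32-bit pixel using shifts and
    * masks; with SSSE3 a single _mm_shuffle_epi8 would do the job. */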
   const __m128i b_mask = _mm_set1_epi32(0x000000ff);
   const __m128i g_mask = _mm_set1_epi32(0x0000ff00);
   const __m128i r_mask = _mm_set1_epi32(0x00ff0000);
   __m128i sl = _mm_and_si128(_mm_slli_epi32(c, 16), r_mask);
   __m128i sr = _mm_and_si128(_mm_srli_epi32(c, 16), b_mask);
   __m128i g = _mm_and_si128(c, g_mask);
   __m128i rb = _mm_or_si128(sl, sr);
   return _mm_or_si128(g, rb);
}
#endif

void conv_abgr8888_bgr24(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint32_t *input = (const uint32_t*)input_;
   uint8_t *output = (uint8_t*)output_;

#if defined(__SSE2__)
   int max_width = width - 15;
#endif

   for (h = 0; h < height;
         h++, output += out_stride, input += in_stride >> 2)
   {
      uint8_t *out = output;
      int w = 0;
#if defined(__SSE2__)
      for (; w < max_width; w += 16, out += 48)
      {
         __m128i a = _mm_loadu_si128((const __m128i*)(input + w + 0));
         __m128i b = _mm_loadu_si128((const __m128i*)(input + w + 4));
         __m128i c = _mm_loadu_si128((const __m128i*)(input + w + 8));
         __m128i d = _mm_loadu_si128((const __m128i*)(input + w + 12));
         a = conv_shuffle_rb_epi32(a);
         b = conv_shuffle_rb_epi32(b);
         c = conv_shuffle_rb_epi32(c);
         d = conv_shuffle_rb_epi32(d);
         store_bgr24_sse2(out, a, b, c, d);
      }
#endif

      for (; w < width; w++)
      {
         uint32_t col = input[w];
         *out++ = (uint8_t)(col >> 16);
         *out++ = (uint8_t)(col >> 8);
         *out++ = (uint8_t)(col >> 0);
      }
   }
}

void conv_argb8888_abgr8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h, w;
   const uint32_t *input = (const uint32_t*)input_;
   uint32_t *output = (uint32_t*)output_;

   for (h = 0; h < height;
         h++, output += out_stride >> 2, input += in_stride >> 2)
   {
      for (w = 0; w < width; w++)
      {
         uint32_t col = input[w];
         output[w] = ((col << 16) & 0xff0000) |
               ((col >> 16) & 0xff) | (col & 0xff00ff00);
      }
   }
}

#define YUV_SHIFT 6
#define YUV_OFFSET (1 << (YUV_SHIFT - 1))
#define YUV_MAT_Y (1 << 6)
#define YUV_MAT_U_G (-22)
#define YUV_MAT_U_B (113)
#define YUV_MAT_V_R (90)
#define YUV_MAT_V_G (-46)
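
/* The YUV_MAT_* factors above match the full-range BT.601 YUV-to-RGB
 * coefficients scaled by 2^YUV_SHIFT to within rounding, e.g.
 * 1.402 * 64 is roughly 90 (YUV_MAT_V_R) and 1.772 * 64 is roughly
 * 113 (YUV_MAT_U_B). */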

void conv_yuyv_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint8_t *input = (const uint8_t*)input_;
   uint32_t *output = (uint32_t*)output_;

#if defined(__SSE2__)
   const __m128i mask_y = _mm_set1_epi16(0xffu);
   const __m128i mask_u = _mm_set1_epi32(0xffu << 8);
   const __m128i mask_v = _mm_set1_epi32(0xffu << 24);
   const __m128i chroma_offset = _mm_set1_epi16(128);
   const __m128i round_offset = _mm_set1_epi16(YUV_OFFSET);

   const __m128i yuv_mul = _mm_set1_epi16(YUV_MAT_Y);
   const __m128i u_g_mul = _mm_set1_epi16(YUV_MAT_U_G);
   const __m128i u_b_mul = _mm_set1_epi16(YUV_MAT_U_B);
   const __m128i v_r_mul = _mm_set1_epi16(YUV_MAT_V_R);
   const __m128i v_g_mul = _mm_set1_epi16(YUV_MAT_V_G);
   const __m128i a = _mm_cmpeq_epi16(
         _mm_setzero_si128(), _mm_setzero_si128());
#endif

   for (h = 0; h < height; h++, output += out_stride >> 2, input += in_stride)
   {
      const uint8_t *src = input;
      uint32_t *dst = output;
      int w = 0;

#if defined(__SSE2__)
      /* Each loop processes 16 pixels. */
      for (; w + 16 <= width; w += 16, src += 32, dst += 16)
      {
         __m128i u, v, u0_g, u1_g, u0_b, u1_b, v0_r, v1_r, v0_g, v1_g,
                 r0, g0, b0, r1, g1, b1;
         __m128i res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
         __m128i res0, res1, res2, res3;
         __m128i yuv0 = _mm_loadu_si128((const __m128i*)(src + 0));  /* [Y0, U0, Y1, V0, Y2, U1, Y3, V1, ...] */
         __m128i yuv1 = _mm_loadu_si128((const __m128i*)(src + 16)); /* [Y0, U0, Y1, V0, Y2, U1, Y3, V1, ...] */

         __m128i _y0 = _mm_and_si128(yuv0, mask_y); /* [Y0, Y1, Y2, ...] (16-bit) */
         __m128i u0 = _mm_and_si128(yuv0, mask_u); /* [0, U0, 0, 0, 0, U1, 0, 0, ...] */
         __m128i v0 = _mm_and_si128(yuv0, mask_v); /* [0, 0, 0, V0, 0, 0, 0, V1, ...] */
         __m128i _y1 = _mm_and_si128(yuv1, mask_y); /* [Y0, Y1, Y2, ...] (16-bit) */
         __m128i u1 = _mm_and_si128(yuv1, mask_u); /* [0, U0, 0, 0, 0, U1, 0, 0, ...] */
         __m128i v1 = _mm_and_si128(yuv1, mask_v); /* [0, 0, 0, V0, 0, 0, 0, V1, ...] */

         /* Juggle around to get U and V in the same 16-bit format as Y. */
         u0 = _mm_srli_si128(u0, 1);
         v0 = _mm_srli_si128(v0, 3);
         u1 = _mm_srli_si128(u1, 1);
         v1 = _mm_srli_si128(v1, 3);
         u = _mm_packs_epi32(u0, u1);
         v = _mm_packs_epi32(v0, v1);

         /* Apply chroma offset: (U, V) -= (128, 128). */
         u = _mm_sub_epi16(u, chroma_offset);
         v = _mm_sub_epi16(v, chroma_offset);

         /* Upscale chroma horizontally (nearest). */
         u0 = _mm_unpacklo_epi16(u, u);
         u1 = _mm_unpackhi_epi16(u, u);
         v0 = _mm_unpacklo_epi16(v, v);
         v1 = _mm_unpackhi_epi16(v, v);

         /* Apply transformations. */
         _y0 = _mm_mullo_epi16(_y0, yuv_mul);
         _y1 = _mm_mullo_epi16(_y1, yuv_mul);
         u0_g = _mm_mullo_epi16(u0, u_g_mul);
         u1_g = _mm_mullo_epi16(u1, u_g_mul);
         u0_b = _mm_mullo_epi16(u0, u_b_mul);
         u1_b = _mm_mullo_epi16(u1, u_b_mul);
         v0_r = _mm_mullo_epi16(v0, v_r_mul);
         v1_r = _mm_mullo_epi16(v1, v_r_mul);
         v0_g = _mm_mullo_epi16(v0, v_g_mul);
         v1_g = _mm_mullo_epi16(v1, v_g_mul);

         /* Add contributions from the transformed components. */
         r0 = _mm_srai_epi16(_mm_adds_epi16(_mm_adds_epi16(_y0, v0_r),
               round_offset), YUV_SHIFT);
         g0 = _mm_srai_epi16(_mm_adds_epi16(
               _mm_adds_epi16(_mm_adds_epi16(_y0, v0_g), u0_g), round_offset), YUV_SHIFT);
         b0 = _mm_srai_epi16(_mm_adds_epi16(
               _mm_adds_epi16(_y0, u0_b), round_offset), YUV_SHIFT);

         r1 = _mm_srai_epi16(_mm_adds_epi16(
               _mm_adds_epi16(_y1, v1_r), round_offset), YUV_SHIFT);
         g1 = _mm_srai_epi16(_mm_adds_epi16(
               _mm_adds_epi16(_mm_adds_epi16(_y1, v1_g), u1_g), round_offset), YUV_SHIFT);
         b1 = _mm_srai_epi16(_mm_adds_epi16(
               _mm_adds_epi16(_y1, u1_b), round_offset), YUV_SHIFT);

         /* Saturate into 8-bit. */
         r0 = _mm_packus_epi16(r0, r1);
         g0 = _mm_packus_epi16(g0, g1);
         b0 = _mm_packus_epi16(b0, b1);

         /* Interleave into ARGB. */
         res_lo_bg = _mm_unpacklo_epi8(b0, g0);
         res_hi_bg = _mm_unpackhi_epi8(b0, g0);
         res_lo_ra = _mm_unpacklo_epi8(r0, a);
         res_hi_ra = _mm_unpackhi_epi8(r0, a);
         res0 = _mm_unpacklo_epi16(res_lo_bg, res_lo_ra);
         res1 = _mm_unpackhi_epi16(res_lo_bg, res_lo_ra);
         res2 = _mm_unpacklo_epi16(res_hi_bg, res_hi_ra);
         res3 = _mm_unpackhi_epi16(res_hi_bg, res_hi_ra);

         _mm_storeu_si128((__m128i*)(dst + 0), res0);
         _mm_storeu_si128((__m128i*)(dst + 4), res1);
         _mm_storeu_si128((__m128i*)(dst + 8), res2);
         _mm_storeu_si128((__m128i*)(dst + 12), res3);
      }
#endif

      /* Finish off the rest (if any) in C. */
      for (; w < width; w += 2, src += 4, dst += 2)
      {
         int _y0 = src[0];
         int u = src[1] - 128;
         int _y1 = src[2];
         int v = src[3] - 128;

         uint8_t r0 = clamp_8bit((YUV_MAT_Y * _y0 + YUV_MAT_V_R * v + YUV_OFFSET) >> YUV_SHIFT);
         uint8_t g0 = clamp_8bit((YUV_MAT_Y * _y0 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
         uint8_t b0 = clamp_8bit((YUV_MAT_Y * _y0 + YUV_MAT_U_B * u + YUV_OFFSET) >> YUV_SHIFT);

         uint8_t r1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_V_R * v + YUV_OFFSET) >> YUV_SHIFT);
         uint8_t g1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
         uint8_t b1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_B * u + YUV_OFFSET) >> YUV_SHIFT);

         dst[0] = 0xff000000u | (r0 << 16) | (g0 << 8) | (b0 << 0);
         dst[1] = 0xff000000u | (r1 << 16) | (g1 << 8) | (b1 << 0);
      }
   }
}

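/* Plain row-by-row copy.  Strides may be negative (vertically flipped
 * surfaces), so the per-row copy length is the smaller magnitude of
 * the two strides while the pointers still step by the signed values. */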
void conv_copy(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   int copy_len = abs(out_stride);
   const uint8_t *input = (const uint8_t*)input_;
   uint8_t *output = (uint8_t*)output_;

   if (abs(in_stride) < copy_len)
      copy_len = abs(in_stride);

   for (h = 0; h < height;
         h++, output += out_stride, input += in_stride)
      memcpy(output, input, copy_len);
}