deps/libretro-common/gfx/scaler/pixconv.c

   1 /* Copyright  (C) 2010-2020 The RetroArch team
   2  *
   3  * ---------------------------------------------------------------------------------------
   4  * The following license statement only applies to this file (pixconv.c).
   5  * ---------------------------------------------------------------------------------------
   6  *
   7  * Permission is hereby granted, free of charge,
   8  * to any person obtaining a copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation the rights to
  10  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
  11  * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
  16  * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  18  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  19  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  21  */
  22
  23 #include <stdio.h>
  24 #include <stdint.h>
  25 #include <stdlib.h>
  26 #include <string.h>
  27
  28 #include <retro_inline.h>
  29
  30 #include <gfx/scaler/pixconv.h>
  31
  32 #if _MSC_VER && _MSC_VER <= 1800
  33 #define SCALER_NO_SIMD
  34 #endif
  35
  36 #ifdef SCALER_NO_SIMD
  37 #undef __SSE2__
  38 #endif
  39
  40 #if defined(__SSE2__)
  41 #include <emmintrin.h>
  42 #elif defined(__MMX__)
  43 #include <mmintrin.h>
  44 #elif (defined(__ARM_NEON__) || defined(__ARM_NEON))
  45 #include <arm_neon.h>
  46 #endif
  47
  48 void conv_rgb565_0rgb1555(void *output_, const void *input_,
  49       int width, int height,
  50       int out_stride, int in_stride)
  51 {
  52    int h;
  53    const uint16_t *input = (const uint16_t*)input_;
  54    uint16_t *output = (uint16_t*)output_;
  55
  56 #if defined(__SSE2__)
  57    int max_width           = width - 7;
  58    const __m128i hi_mask   = _mm_set1_epi16(0x7fe0);
  59    const __m128i lo_mask   = _mm_set1_epi16(0x1f);
  60 #endif
  61
  62    for (h = 0; h < height;
  63          h++, output += out_stride >> 1, input += in_stride >> 1)
  64    {
  65       int w = 0;
  66 #if defined(__SSE2__)
  67       for (; w < max_width; w += 8)
  68       {
  69          const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
  70          __m128i hi = _mm_and_si128(_mm_slli_epi16(in, 1), hi_mask);
  71          __m128i lo = _mm_and_si128(in, lo_mask);
  72          _mm_storeu_si128((__m128i*)(output + w), _mm_or_si128(hi, lo));
  73       }
  74 #endif
  75
  76       for (; w < width; w++)
  77       {
  78          uint16_t col = input[w];
  79          uint16_t hi  = (col >> 1) & 0x7fe0;
  80          uint16_t lo  = col & 0x1f;
  81          output[w]    = hi | lo;
  82       }
  83    }
  84 }
  85
  86 void conv_0rgb1555_rgb565(void *output_, const void *input_,
  87       int width, int height,
  88       int out_stride, int in_stride)
  89 {
  90    int h;
  91    const uint16_t *input   = (const uint16_t*)input_;
  92    uint16_t *output        = (uint16_t*)output_;
  93
  94 #if defined(__SSE2__)
  95    int max_width           = width - 7;
  96
  97    const __m128i hi_mask   = _mm_set1_epi16(
  98          (int16_t)((0x1f << 11) | (0x1f << 6)));
  99    const __m128i lo_mask   = _mm_set1_epi16(0x1f);
 100    const __m128i glow_mask = _mm_set1_epi16(1 << 5);
 101 #endif
 102
 103    for (h = 0; h < height;
 104          h++, output += out_stride >> 1, input += in_stride >> 1)
 105    {
 106       int w = 0;
 107 #if defined(__SSE2__)
 108       for (; w < max_width; w += 8)
 109       {
 110          const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
 111          __m128i rg   = _mm_and_si128(_mm_slli_epi16(in, 1), hi_mask);
 112          __m128i b    = _mm_and_si128(in, lo_mask);
 113          __m128i glow = _mm_and_si128(_mm_srli_epi16(in, 4), glow_mask);
 114          _mm_storeu_si128((__m128i*)(output + w),
 115                _mm_or_si128(rg, _mm_or_si128(b, glow)));
 116       }
 117 #endif
 118
 119       for (; w < width; w++)
 120       {
 121          uint16_t col  = input[w];
 122          uint16_t rg   = (col << 1) & ((0x1f << 11) | (0x1f << 6));
 123          uint16_t b    = col & 0x1f;
 124          uint16_t glow = (col >> 4) & (1 << 5);
 125          output[w]     = rg | b | glow;
 126       }
 127    }
 128 }
 129
 130 void conv_0rgb1555_argb8888(void *output_, const void *input_,
 131       int width, int height,
 132       int out_stride, int in_stride)
 133 {
 134    int h;
 135    const uint16_t *input = (const uint16_t*)input_;
 136    uint32_t *output      = (uint32_t*)output_;
 137
 138 #ifdef __SSE2__
 139    const __m128i pix_mask_r  = _mm_set1_epi16(0x1f << 10);
 140    const __m128i pix_mask_gb = _mm_set1_epi16(0x1f <<  5);
 141    const __m128i mul15_mid   = _mm_set1_epi16(0x4200);
 142    const __m128i mul15_hi    = _mm_set1_epi16(0x0210);
 143    const __m128i a           = _mm_set1_epi16(0x00ff);
 144
 145    int max_width = width - 7;
 146 #endif
 147
 148    for (h = 0; h < height;
 149          h++, output += out_stride >> 2, input += in_stride >> 1)
 150    {
 151       int w = 0;
 152 #ifdef __SSE2__
 153       for (; w < max_width; w += 8)
 154       {
 155          __m128i res_lo_bg, res_hi_bg;
 156          __m128i res_lo_ra, res_hi_ra;
 157          __m128i res_lo, res_hi;
 158          const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
 159          __m128i r = _mm_and_si128(in, pix_mask_r);
 160          __m128i g = _mm_and_si128(in, pix_mask_gb);
 161          __m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_gb);
 162
 163          r = _mm_mulhi_epi16(r, mul15_hi);
 164          g = _mm_mulhi_epi16(g, mul15_mid);
 165          b = _mm_mulhi_epi16(b, mul15_mid);
 166
 167          res_lo_bg = _mm_unpacklo_epi8(b, g);
 168          res_hi_bg = _mm_unpackhi_epi8(b, g);
 169          res_lo_ra = _mm_unpacklo_epi8(r, a);
 170          res_hi_ra = _mm_unpackhi_epi8(r, a);
 171
 172          res_lo = _mm_or_si128(res_lo_bg,
 173                _mm_slli_si128(res_lo_ra, 2));
 174          res_hi = _mm_or_si128(res_hi_bg,
 175                _mm_slli_si128(res_hi_ra, 2));
 176
 177          _mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
 178          _mm_storeu_si128((__m128i*)(output + w + 4), res_hi);
 179       }
 180 #endif
 181
 182       for (; w < width; w++)
 183       {
 184          uint32_t col = input[w];
 185          uint32_t r   = (col >> 10) & 0x1f;
 186          uint32_t g   = (col >>  5) & 0x1f;
 187          uint32_t b   = (col >>  0) & 0x1f;
 188          r            = (r << 3) | (r >> 2);
 189          g            = (g << 3) | (g >> 2);
 190          b            = (b << 3) | (b >> 2);
 191
 192          output[w]    = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
 193       }
 194    }
 195 }
 196
 197 void conv_rgb565_argb8888(void *output_, const void *input_,
 198       int width, int height,
 199       int out_stride, int in_stride)
 200 {
 201    int h;
 202    const uint16_t *input    = (const uint16_t*)input_;
 203    uint32_t *output         = (uint32_t*)output_;
 204
 205 #if defined(__SSE2__)
 206    const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
 207    const __m128i pix_mask_g = _mm_set1_epi16(0x3f <<  5);
 208    const __m128i pix_mask_b = _mm_set1_epi16(0x1f <<  5);
 209    const __m128i mul16_r    = _mm_set1_epi16(0x0210);
 210    const __m128i mul16_g    = _mm_set1_epi16(0x2080);
 211    const __m128i mul16_b    = _mm_set1_epi16(0x4200);
 212    const __m128i a          = _mm_set1_epi16(0x00ff);
 213
 214    int max_width            = width - 7;
 215 #elif defined(__MMX__)
 216    const __m64 pix_mask_r = _mm_set1_pi16(0x1f << 10);
 217    const __m64 pix_mask_g = _mm_set1_pi16(0x3f << 5);
 218    const __m64 pix_mask_b = _mm_set1_pi16(0x1f << 5);
 219    const __m64 mul16_r    = _mm_set1_pi16(0x0210);
 220    const __m64 mul16_g    = _mm_set1_pi16(0x2080);
 221    const __m64 mul16_b    = _mm_set1_pi16(0x4200);
 222    const __m64 a          = _mm_set1_pi16(0x00ff);
 223
 224    int max_width            = width - 3;
 225 #elif (defined(__ARM_NEON__) || defined(__ARM_NEON))
 226    int max_width            = width - 7;
 227 #endif
 228
 229    for (h = 0; h < height;
 230          h++, output += out_stride >> 2, input += in_stride >> 1)
 231    {
 232       int w = 0;
 233 #if defined(__SSE2__)
 234       for (; w < max_width; w += 8)
 235       {
 236          __m128i res_lo, res_hi;
 237          __m128i res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
 238          const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
 239          __m128i        r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r);
 240          __m128i        g = _mm_and_si128(in, pix_mask_g);
 241          __m128i        b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b);
 242
 243          r                = _mm_mulhi_epi16(r, mul16_r);
 244          g                = _mm_mulhi_epi16(g, mul16_g);
 245          b                = _mm_mulhi_epi16(b, mul16_b);
 246
 247          res_lo_bg        = _mm_unpacklo_epi8(b, g);
 248          res_hi_bg        = _mm_unpackhi_epi8(b, g);
 249          res_lo_ra        = _mm_unpacklo_epi8(r, a);
 250          res_hi_ra        = _mm_unpackhi_epi8(r, a);
 251
 252          res_lo           = _mm_or_si128(res_lo_bg,
 253                _mm_slli_si128(res_lo_ra, 2));
 254          res_hi           = _mm_or_si128(res_hi_bg,
 255                _mm_slli_si128(res_hi_ra, 2));
 256
 257          _mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
 258          _mm_storeu_si128((__m128i*)(output + w + 4), res_hi);
 259       }
 260 #elif defined(__MMX__)
 261       for (; w < max_width; w += 4)
 262       {
 263          __m64 res_lo, res_hi;
 264          __m64 res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
 265          const __m64 in = *((__m64*)(input + w));
 266          __m64          r = _mm_and_si64(_mm_srli_pi16(in, 1), pix_mask_r);
 267          __m64          g = _mm_and_si64(in, pix_mask_g);
 268          __m64          b = _mm_and_si64(_mm_slli_pi16(in, 5), pix_mask_b);
 269
 270          r                = _mm_mulhi_pi16(r, mul16_r);
 271          g                = _mm_mulhi_pi16(g, mul16_g);
 272          b                = _mm_mulhi_pi16(b, mul16_b);
 273
 274          res_lo_bg        = _mm_unpacklo_pi8(b, g);
 275          res_hi_bg        = _mm_unpackhi_pi8(b, g);
 276          res_lo_ra        = _mm_unpacklo_pi8(r, a);
 277          res_hi_ra        = _mm_unpackhi_pi8(r, a);
 278
 279          res_lo           = _mm_or_si64(res_lo_bg,
 280                _mm_slli_si64(res_lo_ra, 16));
 281          res_hi           = _mm_or_si64(res_hi_bg,
 282                _mm_slli_si64(res_hi_ra, 16));
 283
 284          *((__m64*)(output + w + 0)) = res_lo;
 285          *((__m64*)(output + w + 2)) = res_hi;
 286       }
 287
 288       _mm_empty();
 289 #elif (defined(__ARM_NEON__) || defined(__ARM_NEON))
 290       for (; w < max_width; w += 8)
 291       {
 292          uint16x8_t in = vld1q_u16(input + w);
 293
 294          uint16x8_t r = vsriq_n_u16(in, in, 5);
 295          uint16x8_t b = vsliq_n_u16(in, in, 5);
 296          uint16x8_t g = vsriq_n_u16(b,  b,  6);
 297
 298          uint8x8x4_t res;
 299          res.val[3] = vdup_n_u8(0xffu);
 300          res.val[2] = vshrn_n_u16(r, 8);
 301          res.val[1] = vshrn_n_u16(g, 8);
 302          res.val[0] = vshrn_n_u16(b, 2);
 303
 304          vst4_u8((uint8_t*)(output + w), res);
 305       }
 306 #endif
 307
 308       for (; w < width; w++)
 309       {
 310          uint32_t col = input[w];
 311          uint32_t r   = (col >> 11) & 0x1f;
 312          uint32_t g   = (col >>  5) & 0x3f;
 313          uint32_t b   = (col >>  0) & 0x1f;
 314          r            = (r << 3) | (r >> 2);
 315          g            = (g << 2) | (g >> 4);
 316          b            = (b << 3) | (b >> 2);
 317
 318          output[w]    = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
 319       }
 320    }
 321 }
 322
 323 void conv_rgb565_abgr8888(void *output_, const void *input_,
 324       int width, int height,
 325       int out_stride, int in_stride)
 326 {
 327    int h;
 328    const uint16_t *input    = (const uint16_t*)input_;
 329    uint32_t *output         = (uint32_t*)output_;
 330  #if defined(__SSE2__)
 331    const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
 332    const __m128i pix_mask_g = _mm_set1_epi16(0x3f <<  5);
 333    const __m128i pix_mask_b = _mm_set1_epi16(0x1f <<  5);
 334    const __m128i mul16_r    = _mm_set1_epi16(0x0210);
 335    const __m128i mul16_g    = _mm_set1_epi16(0x2080);
 336    const __m128i mul16_b    = _mm_set1_epi16(0x4200);
 337    const __m128i a          = _mm_set1_epi16(0x00ff);
 338     int max_width            = width - 7;
 339 #elif (defined(__ARM_NEON__) || defined(__ARM_NEON))
 340    int max_width            = width - 7;
 341 #endif
 342     for (h = 0; h < height;
 343          h++, output += out_stride >> 2, input += in_stride >> 1)
 344    {
 345       int w = 0;
 346 #if defined(__SSE2__)
 347       for (; w < max_width; w += 8)
 348       {
 349          __m128i res_lo, res_hi;
 350          __m128i res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
 351          const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
 352          __m128i        r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r);
 353          __m128i        g = _mm_and_si128(in, pix_mask_g);
 354          __m128i        b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b);
 355          r                = _mm_mulhi_epi16(r, mul16_r);
 356          g                = _mm_mulhi_epi16(g, mul16_g);
 357          b                = _mm_mulhi_epi16(b, mul16_b);
 358          res_lo_bg        = _mm_unpacklo_epi8(b, g);
 359          res_hi_bg        = _mm_unpackhi_epi8(b, g);
 360          res_lo_ra        = _mm_unpacklo_epi8(r, a);
 361          res_hi_ra        = _mm_unpackhi_epi8(r, a);
 362          res_lo           = _mm_or_si128(res_lo_bg,
 363                _mm_slli_si128(res_lo_ra, 2));
 364          res_hi           = _mm_or_si128(res_hi_bg,
 365                _mm_slli_si128(res_hi_ra, 2));
 366          _mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
 367          _mm_storeu_si128((__m128i*)(output + w + 4), res_hi);
 368       }
 369 #elif (defined(__ARM_NEON__) || defined(__ARM_NEON))
 370       for (; w < max_width; w += 8)
 371       {
 372          uint16x8_t in = vld1q_u16(input + w);
 373
 374          uint16x8_t r = vsriq_n_u16(in, in, 5);
 375          uint16x8_t b = vsliq_n_u16(in, in, 5);
 376          uint16x8_t g = vsriq_n_u16(b,  b,  6);
 377
 378          uint8x8x4_t res;
 379          res.val[3] = vdup_n_u8(0xffu);
 380          res.val[2] = vshrn_n_u16(b, 2);
 381          res.val[1] = vshrn_n_u16(g, 8);
 382          res.val[0] = vshrn_n_u16(r, 8);
 383
 384          vst4_u8((uint8_t*)(output + w), res);
 385       }
 386 #endif
 387        for (; w < width; w++)
 388       {
 389          uint32_t col = input[w];
 390          uint32_t r   = (col >> 11) & 0x1f;
 391          uint32_t g   = (col >>  5) & 0x3f;
 392          uint32_t b   = (col >>  0) & 0x1f;
 393          r            = (r << 3) | (r >> 2);
 394          g            = (g << 2) | (g >> 4);
 395          b            = (b << 3) | (b >> 2);
 396          output[w]    = (0xffu << 24) | (b << 16) | (g << 8) | (r << 0);
 397       }
 398    }
 399 }
 400
 401 void conv_argb8888_rgba4444(void *output_, const void *input_,
 402       int width, int height,
 403       int out_stride, int in_stride)
 404 {
 405    int h, w;
 406    const uint32_t *input = (const uint32_t*)input_;
 407    uint16_t *output      = (uint16_t*)output_;
 408
 409    for (h = 0; h < height;
 410          h++, output += out_stride >> 2, input += in_stride >> 1)
 411    {
 412       for (w = 0; w < width; w++)
 413       {
 414          uint32_t col = input[w];
 415          uint32_t r   = (col >> 16) & 0xf;
 416          uint32_t g   = (col >>  8) & 0xf;
 417          uint32_t b   = (col) & 0xf;
 418          uint32_t a   = (col >>  24) & 0xf;
 419          r            = (r >> 4) | r;
 420          g            = (g >> 4) | g;
 421          b            = (b >> 4) | b;
 422          a            = (a >> 4) | a;
 423
 424          output[w]    = (r << 12) | (g << 8) | (b << 4) | a;
 425       }
 426    }
 427 }
 428
 429 void conv_rgba4444_argb8888(void *output_, const void *input_,
 430       int width, int height,
 431       int out_stride, int in_stride)
 432 {
 433    int h;
 434    const uint16_t *input = (const uint16_t*)input_;
 435    uint32_t *output      = (uint32_t*)output_;
 436
 437 #if defined(__MMX__)
 438    const __m64 pix_mask_r = _mm_set1_pi16(0xf << 10);
 439    const __m64 pix_mask_g = _mm_set1_pi16(0xf << 8);
 440    const __m64 pix_mask_b = _mm_set1_pi16(0xf << 8);
 441    const __m64 mul16_r    = _mm_set1_pi16(0x0440);
 442    const __m64 mul16_g    = _mm_set1_pi16(0x1100);
 443    const __m64 mul16_b    = _mm_set1_pi16(0x1100);
 444    const __m64 a          = _mm_set1_pi16(0x00ff);
 445
 446    int max_width            = width - 3;
 447 #endif
 448
 449    for (h = 0; h < height;
 450          h++, output += out_stride >> 2, input += in_stride >> 1)
 451    {
 452       int w = 0;
 453 #if defined(__MMX__)
 454       for (; w < max_width; w += 4)
 455       {
 456          __m64 res_lo, res_hi;
 457          __m64 res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
 458          const __m64 in = *((__m64*)(input + w));
 459          __m64          r = _mm_and_si64(_mm_srli_pi16(in, 2), pix_mask_r);
 460          __m64          g = _mm_and_si64(in, pix_mask_g);
 461          __m64          b = _mm_and_si64(_mm_slli_pi16(in, 4), pix_mask_b);
 462
 463          r                = _mm_mulhi_pi16(r, mul16_r);
 464          g                = _mm_mulhi_pi16(g, mul16_g);
 465          b                = _mm_mulhi_pi16(b, mul16_b);
 466
 467          res_lo_bg        = _mm_unpacklo_pi8(b, g);
 468          res_hi_bg        = _mm_unpackhi_pi8(b, g);
 469          res_lo_ra        = _mm_unpacklo_pi8(r, a);
 470          res_hi_ra        = _mm_unpackhi_pi8(r, a);
 471
 472          res_lo           = _mm_or_si64(res_lo_bg,
 473                _mm_slli_si64(res_lo_ra, 16));
 474          res_hi           = _mm_or_si64(res_hi_bg,
 475                _mm_slli_si64(res_hi_ra, 16));
 476
 477          *((__m64*)(output + w + 0)) = res_lo;
 478          *((__m64*)(output + w + 2)) = res_hi;
 479       }
 480
 481       _mm_empty();
 482 #endif
 483
 484       for (; w < width; w++)
 485       {
 486          uint32_t col = input[w];
 487          uint32_t r   = (col >> 12) & 0xf;
 488          uint32_t g   = (col >>  8) & 0xf;
 489          uint32_t b   = (col >>  4) & 0xf;
 490          uint32_t a   = (col >>  0) & 0xf;
 491          r            = (r << 4) | r;
 492          g            = (g << 4) | g;
 493          b            = (b << 4) | b;
 494          a            = (a << 4) | a;
 495
 496          output[w]    = (a << 24) | (r << 16) | (g << 8) | (b << 0);
 497       }
 498    }
 499 }
 500
 501 void conv_rgba4444_rgb565(void *output_, const void *input_,
 502       int width, int height,
 503       int out_stride, int in_stride)
 504 {
 505    int h, w;
 506    const uint16_t *input = (const uint16_t*)input_;
 507    uint16_t *output      = (uint16_t*)output_;
 508
 509    for (h = 0; h < height;
 510          h++, output += out_stride >> 1, input += in_stride >> 1)
 511    {
 512       for (w = 0; w < width; w++)
 513       {
 514          uint32_t col = input[w];
 515          uint32_t r   = (col >> 12) & 0xf;
 516          uint32_t g   = (col >>  8) & 0xf;
 517          uint32_t b   = (col >>  4) & 0xf;
 518
 519          output[w]    = (r << 12) | (g << 7) | (b << 1);
 520       }
 521    }
 522 }
 523
 524 #if defined(__SSE2__)
 525 /* :( TODO: Make this saner. */
 526 static INLINE void store_bgr24_sse2(void *output, __m128i a,
 527       __m128i b, __m128i c, __m128i d)
 528 {
 529    const __m128i mask_0 = _mm_set_epi32(0, 0, 0, 0x00ffffff);
 530    const __m128i mask_1 = _mm_set_epi32(0, 0, 0x00ffffff, 0);
 531    const __m128i mask_2 = _mm_set_epi32(0, 0x00ffffff, 0, 0);
 532    const __m128i mask_3 = _mm_set_epi32(0x00ffffff, 0, 0, 0);
 533
 534    __m128i a0 = _mm_and_si128(a, mask_0);
 535    __m128i a1 = _mm_srli_si128(_mm_and_si128(a, mask_1),  1);
 536    __m128i a2 = _mm_srli_si128(_mm_and_si128(a, mask_2),  2);
 537    __m128i a3 = _mm_srli_si128(_mm_and_si128(a, mask_3),  3);
 538    __m128i a4 = _mm_slli_si128(_mm_and_si128(b, mask_0), 12);
 539    __m128i a5 = _mm_slli_si128(_mm_and_si128(b, mask_1), 11);
 540
 541    __m128i b0 = _mm_srli_si128(_mm_and_si128(b, mask_1), 5);
 542    __m128i b1 = _mm_srli_si128(_mm_and_si128(b, mask_2), 6);
 543    __m128i b2 = _mm_srli_si128(_mm_and_si128(b, mask_3), 7);
 544    __m128i b3 = _mm_slli_si128(_mm_and_si128(c, mask_0), 8);
 545    __m128i b4 = _mm_slli_si128(_mm_and_si128(c, mask_1), 7);
 546    __m128i b5 = _mm_slli_si128(_mm_and_si128(c, mask_2), 6);
 547
 548    __m128i c0 = _mm_srli_si128(_mm_and_si128(c, mask_2), 10);
 549    __m128i c1 = _mm_srli_si128(_mm_and_si128(c, mask_3), 11);
 550    __m128i c2 = _mm_slli_si128(_mm_and_si128(d, mask_0),  4);
 551    __m128i c3 = _mm_slli_si128(_mm_and_si128(d, mask_1),  3);
 552    __m128i c4 = _mm_slli_si128(_mm_and_si128(d, mask_2),  2);
 553    __m128i c5 = _mm_slli_si128(_mm_and_si128(d, mask_3),  1);
 554
 555    __m128i *out = (__m128i*)output;
 556
 557    _mm_storeu_si128(out + 0,
 558          _mm_or_si128(a0, _mm_or_si128(a1, _mm_or_si128(a2,
 559                   _mm_or_si128(a3, _mm_or_si128(a4, a5))))));
 560
 561    _mm_storeu_si128(out + 1,
 562          _mm_or_si128(b0, _mm_or_si128(b1, _mm_or_si128(b2,
 563                   _mm_or_si128(b3, _mm_or_si128(b4, b5))))));
 564
 565    _mm_storeu_si128(out + 2,
 566          _mm_or_si128(c0, _mm_or_si128(c1, _mm_or_si128(c2,
 567                   _mm_or_si128(c3, _mm_or_si128(c4, c5))))));
 568 }
 569 #endif
 570
 571 void conv_0rgb1555_bgr24(void *output_, const void *input_,
 572       int width, int height,
 573       int out_stride, int in_stride)
 574 {
 575    int h;
 576    const uint16_t *input     = (const uint16_t*)input_;
 577    uint8_t *output           = (uint8_t*)output_;
 578
 579 #if defined(__SSE2__)
 580    const __m128i pix_mask_r  = _mm_set1_epi16(0x1f << 10);
 581    const __m128i pix_mask_gb = _mm_set1_epi16(0x1f <<  5);
 582    const __m128i mul15_mid   = _mm_set1_epi16(0x4200);
 583    const __m128i mul15_hi    = _mm_set1_epi16(0x0210);
 584    const __m128i a           = _mm_set1_epi16(0x00ff);
 585
 586    int max_width             = width - 15;
 587 #endif
 588
 589    for (h = 0; h < height;
 590          h++, output += out_stride, input += in_stride >> 1)
 591    {
 592       uint8_t *out = output;
 593       int   w = 0;
 594
 595 #if defined(__SSE2__)
 596       for (; w < max_width; w += 16, out += 48)
 597       {
 598          __m128i res_lo_bg0, res_lo_bg1, res_hi_bg0, res_hi_bg1,
 599                  res_lo_ra0, res_lo_ra1, res_hi_ra0, res_hi_ra1,
 600                  res_lo0, res_lo1, res_hi0, res_hi1;
 601          const __m128i in0 = _mm_loadu_si128((const __m128i*)(input + w + 0));
 602          const __m128i in1 = _mm_loadu_si128((const __m128i*)(input + w + 8));
 603          __m128i r0        = _mm_and_si128(in0, pix_mask_r);
 604          __m128i r1        = _mm_and_si128(in1, pix_mask_r);
 605          __m128i g0        = _mm_and_si128(in0, pix_mask_gb);
 606          __m128i g1        = _mm_and_si128(in1, pix_mask_gb);
 607          __m128i b0        = _mm_and_si128(_mm_slli_epi16(in0, 5), pix_mask_gb);
 608          __m128i b1        = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_gb);
 609
 610          r0                = _mm_mulhi_epi16(r0, mul15_hi);
 611          r1                = _mm_mulhi_epi16(r1, mul15_hi);
 612          g0                = _mm_mulhi_epi16(g0, mul15_mid);
 613          g1                = _mm_mulhi_epi16(g1, mul15_mid);
 614          b0                = _mm_mulhi_epi16(b0, mul15_mid);
 615          b1                = _mm_mulhi_epi16(b1, mul15_mid);
 616
 617          res_lo_bg0        = _mm_unpacklo_epi8(b0, g0);
 618          res_lo_bg1        = _mm_unpacklo_epi8(b1, g1);
 619          res_hi_bg0        = _mm_unpackhi_epi8(b0, g0);
 620          res_hi_bg1        = _mm_unpackhi_epi8(b1, g1);
 621          res_lo_ra0        = _mm_unpacklo_epi8(r0, a);
 622          res_lo_ra1        = _mm_unpacklo_epi8(r1, a);
 623          res_hi_ra0        = _mm_unpackhi_epi8(r0, a);
 624          res_hi_ra1        = _mm_unpackhi_epi8(r1, a);
 625
 626          res_lo0           = _mm_or_si128(res_lo_bg0,
 627                _mm_slli_si128(res_lo_ra0, 2));
 628          res_lo1           = _mm_or_si128(res_lo_bg1,
 629                _mm_slli_si128(res_lo_ra1, 2));
 630          res_hi0           = _mm_or_si128(res_hi_bg0,
 631                _mm_slli_si128(res_hi_ra0, 2));
 632          res_hi1           = _mm_or_si128(res_hi_bg1,
 633                _mm_slli_si128(res_hi_ra1, 2));
 634
 635          /* Non-POT pixel sizes for the loss */
 636          store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
 637       }
 638 #endif
 639
 640       for (; w < width; w++)
 641       {
 642          uint32_t col = input[w];
 643          uint32_t b   = (col >>  0) & 0x1f;
 644          uint32_t g   = (col >>  5) & 0x1f;
 645          uint32_t r   = (col >> 10) & 0x1f;
 646          b            = (b << 3) | (b >> 2);
 647          g            = (g << 3) | (g >> 2);
 648          r            = (r << 3) | (r >> 2);
 649
 650          *out++       = b;
 651          *out++       = g;
 652          *out++       = r;
 653       }
 654    }
 655 }
 656
 657 void conv_rgb565_bgr24(void *output_, const void *input_,
 658       int width, int height,
 659       int out_stride, int in_stride)
 660 {
 661    int h;
 662    const uint16_t *input    = (const uint16_t*)input_;
 663    uint8_t *output          = (uint8_t*)output_;
 664
 665 #if defined(__SSE2__)
 666    const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
 667    const __m128i pix_mask_g = _mm_set1_epi16(0x3f <<  5);
 668    const __m128i pix_mask_b = _mm_set1_epi16(0x1f <<  5);
 669    const __m128i mul16_r    = _mm_set1_epi16(0x0210);
 670    const __m128i mul16_g    = _mm_set1_epi16(0x2080);
 671    const __m128i mul16_b    = _mm_set1_epi16(0x4200);
 672    const __m128i a          = _mm_set1_epi16(0x00ff);
 673
 674    int max_width            = width - 15;
 675 #endif
 676
 677    for (h = 0; h < height; h++, output += out_stride, input += in_stride >> 1)
 678    {
 679       uint8_t *out = output;
 680       int        w = 0;
 681 #if defined(__SSE2__)
 682       for (; w < max_width; w += 16, out += 48)
 683       {
 684          __m128i res_lo_bg0, res_hi_bg0, res_lo_ra0, res_hi_ra0;
 685          __m128i res_lo_bg1, res_hi_bg1, res_lo_ra1, res_hi_ra1;
 686          __m128i res_lo0, res_hi0, res_lo1, res_hi1;
 687          const __m128i in0 = _mm_loadu_si128((const __m128i*)(input + w));
 688          const __m128i in1 = _mm_loadu_si128((const __m128i*)(input + w + 8));
 689          __m128i r0 = _mm_and_si128(_mm_srli_epi16(in0, 1), pix_mask_r);
 690          __m128i g0 = _mm_and_si128(in0, pix_mask_g);
 691          __m128i b0 = _mm_and_si128(_mm_slli_epi16(in0, 5), pix_mask_b);
 692          __m128i r1 = _mm_and_si128(_mm_srli_epi16(in1, 1), pix_mask_r);
 693          __m128i g1 = _mm_and_si128(in1, pix_mask_g);
 694          __m128i b1 = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_b);
 695
 696          r0         = _mm_mulhi_epi16(r0, mul16_r);
 697          g0         = _mm_mulhi_epi16(g0, mul16_g);
 698          b0         = _mm_mulhi_epi16(b0, mul16_b);
 699          r1         = _mm_mulhi_epi16(r1, mul16_r);
 700          g1         = _mm_mulhi_epi16(g1, mul16_g);
 701          b1         = _mm_mulhi_epi16(b1, mul16_b);
 702
 703          res_lo_bg0 = _mm_unpacklo_epi8(b0, g0);
 704          res_hi_bg0 = _mm_unpackhi_epi8(b0, g0);
 705          res_lo_ra0 = _mm_unpacklo_epi8(r0, a);
 706          res_hi_ra0 = _mm_unpackhi_epi8(r0, a);
 707          res_lo_bg1 = _mm_unpacklo_epi8(b1, g1);
 708          res_hi_bg1 = _mm_unpackhi_epi8(b1, g1);
 709          res_lo_ra1 = _mm_unpacklo_epi8(r1, a);
 710          res_hi_ra1 = _mm_unpackhi_epi8(r1, a);
 711
 712          res_lo0    = _mm_or_si128(res_lo_bg0,
 713                _mm_slli_si128(res_lo_ra0, 2));
 714          res_hi0    = _mm_or_si128(res_hi_bg0,
 715                _mm_slli_si128(res_hi_ra0, 2));
 716          res_lo1    = _mm_or_si128(res_lo_bg1,
 717                _mm_slli_si128(res_lo_ra1, 2));
 718          res_hi1    = _mm_or_si128(res_hi_bg1,
 719                _mm_slli_si128(res_hi_ra1, 2));
 720
 721          store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
 722       }
 723 #endif
 724
 725       for (; w < width; w++)
 726       {
 727          uint32_t col = input[w];
 728          uint32_t r   = (col >> 11) & 0x1f;
 729          uint32_t g   = (col >>  5) & 0x3f;
 730          uint32_t b   = (col >>  0) & 0x1f;
 731          r = (r << 3) | (r >> 2);
 732          g = (g << 2) | (g >> 4);
 733          b = (b << 3) | (b >> 2);
 734
 735          *out++ = b;
 736          *out++ = g;
 737          *out++ = r;
 738       }
 739    }
 740 }
 741
 742 void conv_bgr24_argb8888(void *output_, const void *input_,
 743       int width, int height,
 744       int out_stride, int in_stride)
 745 {
 746    int h, w;
 747    const uint8_t *input = (const uint8_t*)input_;
 748    uint32_t *output     = (uint32_t*)output_;
 749
 750    for (h = 0; h < height;
 751          h++, output += out_stride >> 2, input += in_stride)
 752    {
 753       const uint8_t *inp = input;
 754       for (w = 0; w < width; w++)
 755       {
 756          uint32_t b = *inp++;
 757          uint32_t g = *inp++;
 758          uint32_t r = *inp++;
 759          output[w]  = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
 760       }
 761    }
 762 }
 763
 764 void conv_bgr24_rgb565(void *output_, const void *input_,
 765       int width, int height,
 766       int out_stride, int in_stride)
 767 {
 768    int h, w;
 769    const uint8_t *input = (const uint8_t*)input_;
 770    uint16_t *output     = (uint16_t*)output_;
 771    for (h = 0; h < height;
 772          h++, output += out_stride, input += in_stride)
 773    {
 774       const uint8_t *inp = input;
 775       for (w = 0; w < width; w++)
 776       {
 777          uint16_t b = *inp++;
 778          uint16_t g = *inp++;
 779          uint16_t r = *inp++;
 780
 781          output[w] = ((r & 0x00F8) << 8) | ((g&0x00FC) << 3) | ((b&0x00F8) >> 3);
 782       }
 783    }
 784 }
 785
 786 void conv_argb8888_0rgb1555(void *output_, const void *input_,
 787       int width, int height,
 788       int out_stride, int in_stride)
 789 {
 790    int h, w;
 791    const uint32_t *input = (const uint32_t*)input_;
 792    uint16_t *output      = (uint16_t*)output_;
 793
 794    for (h = 0; h < height;
 795          h++, output += out_stride >> 1, input += in_stride >> 2)
 796    {
 797       for (w = 0; w < width; w++)
 798       {
 799          uint32_t col = input[w];
 800          uint16_t r   = (col >> 19) & 0x1f;
 801          uint16_t g   = (col >> 11) & 0x1f;
 802          uint16_t b   = (col >>  3) & 0x1f;
 803          output[w]    = (r << 10) | (g << 5) | (b << 0);
 804       }
 805    }
 806 }
 807
 808 void conv_argb8888_bgr24(void *output_, const void *input_,
 809       int width, int height,
 810       int out_stride, int in_stride)
 811 {
 812    int h;
 813    const uint32_t *input = (const uint32_t*)input_;
 814    uint8_t *output       = (uint8_t*)output_;
 815
 816 #if defined(__SSE2__)
 817    int max_width = width - 15;
 818 #endif
 819
 820    for (h = 0; h < height;
 821          h++, output += out_stride, input += in_stride >> 2)
 822    {
 823       uint8_t *out = output;
 824       int        w = 0;
 825 #if defined(__SSE2__)
 826       for (; w < max_width; w += 16, out += 48)
 827       {
 828          __m128i l0 = _mm_loadu_si128((const __m128i*)(input + w +  0));
 829          __m128i l1 = _mm_loadu_si128((const __m128i*)(input + w +  4));
 830          __m128i l2 = _mm_loadu_si128((const __m128i*)(input + w +  8));
 831          __m128i l3 = _mm_loadu_si128((const __m128i*)(input + w + 12));
 832          store_bgr24_sse2(out, l0, l1, l2, l3);
 833       }
 834 #endif
 835
 836       for (; w < width; w++)
 837       {
 838          uint32_t col = input[w];
 839          *out++       = (uint8_t)(col >>  0);
 840          *out++       = (uint8_t)(col >>  8);
 841          *out++       = (uint8_t)(col >> 16);
 842       }
 843    }
 844 }
 845
 846 #if defined(__SSE2__)
 847 static INLINE __m128i conv_shuffle_rb_epi32(__m128i c)
 848 {
 849    /* SSSE3 plz */
 850    const __m128i b_mask = _mm_set1_epi32(0x000000ff);
 851    const __m128i g_mask = _mm_set1_epi32(0x0000ff00);
 852    const __m128i r_mask = _mm_set1_epi32(0x00ff0000);
 853    __m128i sl = _mm_and_si128(_mm_slli_epi32(c, 16), r_mask);
 854    __m128i sr = _mm_and_si128(_mm_srli_epi32(c, 16), b_mask);
 855    __m128i g  = _mm_and_si128(c, g_mask);
 856    __m128i rb = _mm_or_si128(sl, sr);
 857    return _mm_or_si128(g, rb);
 858 }
 859 #endif
 860
 861 void conv_abgr8888_bgr24(void *output_, const void *input_,
 862       int width, int height,
 863       int out_stride, int in_stride)
 864 {
 865    int h;
 866    const uint32_t *input = (const uint32_t*)input_;
 867    uint8_t *output       = (uint8_t*)output_;
 868
 869 #if defined(__SSE2__)
 870    int max_width = width - 15;
 871 #endif
 872
 873    for (h = 0; h < height;
 874          h++, output += out_stride, input += in_stride >> 2)
 875    {
 876       uint8_t *out = output;
 877       int        w = 0;
 878 #if defined(__SSE2__)
 879       for (; w < max_width; w += 16, out += 48)
 880       {
 881                  __m128i a = _mm_loadu_si128((const __m128i*)(input + w +  0));
 882                  __m128i b = _mm_loadu_si128((const __m128i*)(input + w +  4));
 883                  __m128i c = _mm_loadu_si128((const __m128i*)(input + w +  8));
 884                  __m128i d = _mm_loadu_si128((const __m128i*)(input + w + 12));
 885          a = conv_shuffle_rb_epi32(a);
 886          b = conv_shuffle_rb_epi32(b);
 887          c = conv_shuffle_rb_epi32(c);
 888          d = conv_shuffle_rb_epi32(d);
 889          store_bgr24_sse2(out, a, b, c, d);
 890       }
 891 #endif
 892
 893       for (; w < width; w++)
 894       {
 895          uint32_t col = input[w];
 896          *out++       = (uint8_t)(col >> 16);
 897          *out++       = (uint8_t)(col >>  8);
 898          *out++       = (uint8_t)(col >>  0);
 899       }
 900    }
 901 }
 902
 903 void conv_argb8888_abgr8888(void *output_, const void *input_,
 904       int width, int height,
 905       int out_stride, int in_stride)
 906 {
 907    int h, w;
 908    const uint32_t *input = (const uint32_t*)input_;
 909    uint32_t *output      = (uint32_t*)output_;
 910
 911    for (h = 0; h < height;
 912          h++, output += out_stride >> 2, input += in_stride >> 2)
 913    {
 914       for (w = 0; w < width; w++)
 915       {
 916          uint32_t col = input[w];
 917          output[w]    = ((col << 16) & 0xff0000) |
 918             ((col >> 16) & 0xff) | (col & 0xff00ff00);
 919       }
 920    }
 921 }
 922
/* Fixed-point coefficients for the YUV -> RGB conversion in
 * conv_yuyv_argb8888(). All weights are scaled by (1 << YUV_SHIFT) = 64.
 * NOTE(review): the values look like a full-range BT.601 matrix
 * (about 1.402, -0.344, -0.714, 1.772) - confirm. */
#define YUV_SHIFT 6
/* Half of one output unit; added before the right shift to round. */
#define YUV_OFFSET (1 << (YUV_SHIFT - 1))
/* Luma weight for all three channels: 64 / 64 = 1.0 */
#define YUV_MAT_Y (1 << 6)
/* U contribution to green: -22 / 64 */
#define YUV_MAT_U_G (-22)
/* U contribution to blue: 113 / 64 */
#define YUV_MAT_U_B (113)
/* V contribution to red: 90 / 64 */
#define YUV_MAT_V_R (90)
/* V contribution to green: -46 / 64 */
#define YUV_MAT_V_G (-46)
 930
 931 void conv_yuyv_argb8888(void *output_, const void *input_,
 932       int width, int height,
 933       int out_stride, int in_stride)
 934 {
 935    int h;
 936    const uint8_t *input        = (const uint8_t*)input_;
 937    uint32_t *output            = (uint32_t*)output_;
 938
 939 #if defined(__SSE2__)
 940    const __m128i mask_y        = _mm_set1_epi16(0xffu);
 941    const __m128i mask_u        = _mm_set1_epi32(0xffu << 8);
 942    const __m128i mask_v        = _mm_set1_epi32(0xffu << 24);
 943    const __m128i chroma_offset = _mm_set1_epi16(128);
 944    const __m128i round_offset  = _mm_set1_epi16(YUV_OFFSET);
 945
 946    const __m128i yuv_mul       = _mm_set1_epi16(YUV_MAT_Y);
 947    const __m128i u_g_mul       = _mm_set1_epi16(YUV_MAT_U_G);
 948    const __m128i u_b_mul       = _mm_set1_epi16(YUV_MAT_U_B);
 949    const __m128i v_r_mul       = _mm_set1_epi16(YUV_MAT_V_R);
 950    const __m128i v_g_mul       = _mm_set1_epi16(YUV_MAT_V_G);
 951    const __m128i a             = _mm_cmpeq_epi16(
 952          _mm_setzero_si128(), _mm_setzero_si128());
 953 #endif
 954
 955    for (h = 0; h < height; h++, output += out_stride >> 2, input += in_stride)
 956    {
 957       const uint8_t *src = input;
 958       uint32_t      *dst = output;
 959       int              w = 0;
 960
 961 #if defined(__SSE2__)
 962       /* Each loop processes 16 pixels. */
 963       for (; w + 16 <= width; w += 16, src += 32, dst += 16)
 964       {
 965          __m128i u, v, u0_g, u1_g, u0_b, u1_b, v0_r, v1_r, v0_g, v1_g,
 966                  r0, g0, b0, r1, g1, b1;
 967          __m128i res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
 968          __m128i res0, res1, res2, res3;
 969          __m128i yuv0 = _mm_loadu_si128((const __m128i*)(src +  0)); /* [Y0, U0, Y1, V0, Y2, U1, Y3, V1, ...] */
 970          __m128i yuv1 = _mm_loadu_si128((const __m128i*)(src + 16)); /* [Y0, U0, Y1, V0, Y2, U1, Y3, V1, ...] */
 971
 972          __m128i _y0 = _mm_and_si128(yuv0, mask_y); /* [Y0, Y1, Y2, ...] (16-bit) */
 973          __m128i u0 = _mm_and_si128(yuv0, mask_u); /* [0, U0, 0, 0, 0, U1, 0, 0, ...] */
 974          __m128i v0 = _mm_and_si128(yuv0, mask_v); /* [0, 0, 0, V1, 0, , 0, V1, ...] */
 975          __m128i _y1 = _mm_and_si128(yuv1, mask_y); /* [Y0, Y1, Y2, ...] (16-bit) */
 976          __m128i u1 = _mm_and_si128(yuv1, mask_u); /* [0, U0, 0, 0, 0, U1, 0, 0, ...] */
 977          __m128i v1 = _mm_and_si128(yuv1, mask_v); /* [0, 0, 0, V1, 0, , 0, V1, ...] */
 978
 979          /* Juggle around to get U and V in the same 16-bit format as Y. */
 980          u0 = _mm_srli_si128(u0, 1);
 981          v0 = _mm_srli_si128(v0, 3);
 982          u1 = _mm_srli_si128(u1, 1);
 983          v1 = _mm_srli_si128(v1, 3);
 984          u = _mm_packs_epi32(u0, u1);
 985          v = _mm_packs_epi32(v0, v1);
 986
 987          /* Apply YUV offsets (U, V) -= (-128, -128). */
 988          u = _mm_sub_epi16(u, chroma_offset);
 989          v = _mm_sub_epi16(v, chroma_offset);
 990
 991          /* Upscale chroma horizontally (nearest). */
 992          u0 = _mm_unpacklo_epi16(u, u);
 993          u1 = _mm_unpackhi_epi16(u, u);
 994          v0 = _mm_unpacklo_epi16(v, v);
 995          v1 = _mm_unpackhi_epi16(v, v);
 996
 997          /* Apply transformations. */
 998          _y0 = _mm_mullo_epi16(_y0, yuv_mul);
 999          _y1 = _mm_mullo_epi16(_y1, yuv_mul);
1000          u0_g   = _mm_mullo_epi16(u0, u_g_mul);
1001          u1_g   = _mm_mullo_epi16(u1, u_g_mul);
1002          u0_b   = _mm_mullo_epi16(u0, u_b_mul);
1003          u1_b   = _mm_mullo_epi16(u1, u_b_mul);
1004          v0_r   = _mm_mullo_epi16(v0, v_r_mul);
1005          v1_r   = _mm_mullo_epi16(v1, v_r_mul);
1006          v0_g   = _mm_mullo_epi16(v0, v_g_mul);
1007          v1_g   = _mm_mullo_epi16(v1, v_g_mul);
1008
1009          /* Add contibutions from the transformed components. */
1010          r0 = _mm_srai_epi16(_mm_adds_epi16(_mm_adds_epi16(_y0, v0_r),
1011                   round_offset), YUV_SHIFT);
1012          g0 = _mm_srai_epi16(_mm_adds_epi16(
1013                   _mm_adds_epi16(_mm_adds_epi16(_y0, v0_g), u0_g), round_offset), YUV_SHIFT);
1014          b0 = _mm_srai_epi16(_mm_adds_epi16(
1015                   _mm_adds_epi16(_y0, u0_b), round_offset), YUV_SHIFT);
1016
1017          r1 = _mm_srai_epi16(_mm_adds_epi16(
1018                   _mm_adds_epi16(_y1, v1_r), round_offset), YUV_SHIFT);
1019          g1 = _mm_srai_epi16(_mm_adds_epi16(
1020                   _mm_adds_epi16(_mm_adds_epi16(_y1, v1_g), u1_g), round_offset), YUV_SHIFT);
1021          b1 = _mm_srai_epi16(_mm_adds_epi16(
1022                   _mm_adds_epi16(_y1, u1_b), round_offset), YUV_SHIFT);
1023
1024          /* Saturate into 8-bit. */
1025          r0 = _mm_packus_epi16(r0, r1);
1026          g0 = _mm_packus_epi16(g0, g1);
1027          b0 = _mm_packus_epi16(b0, b1);
1028
1029          /* Interleave into ARGB. */
1030          res_lo_bg = _mm_unpacklo_epi8(b0, g0);
1031          res_hi_bg = _mm_unpackhi_epi8(b0, g0);
1032          res_lo_ra = _mm_unpacklo_epi8(r0, a);
1033          res_hi_ra = _mm_unpackhi_epi8(r0, a);
1034          res0 = _mm_unpacklo_epi16(res_lo_bg, res_lo_ra);
1035          res1 = _mm_unpackhi_epi16(res_lo_bg, res_lo_ra);
1036          res2 = _mm_unpacklo_epi16(res_hi_bg, res_hi_ra);
1037          res3 = _mm_unpackhi_epi16(res_hi_bg, res_hi_ra);
1038
1039          _mm_storeu_si128((__m128i*)(dst +  0), res0);
1040          _mm_storeu_si128((__m128i*)(dst +  4), res1);
1041          _mm_storeu_si128((__m128i*)(dst +  8), res2);
1042          _mm_storeu_si128((__m128i*)(dst + 12), res3);
1043       }
1044 #endif
1045
1046       /* Finish off the rest (if any) in C. */
1047       for (; w < width; w += 2, src += 4, dst += 2)
1048       {
1049          int _y0    = src[0];
1050          int  u     = src[1] - 128;
1051          int _y1    = src[2];
1052          int  v     = src[3] - 128;
1053
1054          uint8_t r0 = clamp_8bit((YUV_MAT_Y * _y0 +                   YUV_MAT_V_R * v + YUV_OFFSET) >> YUV_SHIFT);
1055          uint8_t g0 = clamp_8bit((YUV_MAT_Y * _y0 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
1056          uint8_t b0 = clamp_8bit((YUV_MAT_Y * _y0 + YUV_MAT_U_B * u                   + YUV_OFFSET) >> YUV_SHIFT);
1057
1058          uint8_t r1 = clamp_8bit((YUV_MAT_Y * _y1 +                   YUV_MAT_V_R * v + YUV_OFFSET) >> YUV_SHIFT);
1059          uint8_t g1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
1060          uint8_t b1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_B * u                   + YUV_OFFSET) >> YUV_SHIFT);
1061
1062          dst[0]     = 0xff000000u | (r0 << 16) | (g0 << 8) | (b0 << 0);
1063          dst[1]     = 0xff000000u | (r1 << 16) | (g1 << 8) | (b1 << 0);
1064       }
1065    }
1066 }
1067
/* Copies height rows from input_ to output_ without converting pixels.
 *
 * Each row copies min(|out_stride|, |in_stride|) bytes; after every row
 * the destination advances by out_stride bytes and the source by
 * in_stride bytes. width is not used.
 */
void conv_copy(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   const uint8_t *src = (const uint8_t*)input_;
   uint8_t *dst       = (uint8_t*)output_;
   const int out_len  = abs(out_stride);
   const int in_len   = abs(in_stride);
   /* Never copy more than the shorter of the two row pitches. */
   const int row_len  = (in_len < out_len) ? in_len : out_len;
   int row;

   (void)width;

   for (row = 0; row < height; row++)
   {
      memcpy(dst, src, row_len);
      dst += out_stride;
      src += in_stride;
   }
}