1 /* Copyright (C) 2010-2020 The RetroArch team
3 * ---------------------------------------------------------------------------------------
4 * The following license statement only applies to this file (rjpeg.c).
5 * ---------------------------------------------------------------------------------------
7 * Permission is hereby granted, free of charge,
8 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation the rights to
10 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 /* Modified version of stb_image's JPEG sources. */
27 #include <stddef.h> /* ptrdiff_t on osx */
31 #include <retro_inline.h>
33 #include <formats/image.h>
34 #include <formats/rjpeg.h>
35 #include <features/features_cpu.h>
39 RJPEG_DEFAULT = 0, /* only used for req_comp */
53 typedef uint8_t *(*rjpeg_resample_row_func)(uint8_t *out, uint8_t *in0, uint8_t *in1,
58 rjpeg_resample_row_func resample;
61 int hs,vs; /* expansion factor in each axis */
62 int w_lores; /* horizontal pixels pre-expansion */
63 int ystep; /* how far through vertical expansion we are */
64 int ypos; /* which pre-expansion row we're on */
73 #define RJPEG_HAS_LROTL
76 #ifdef RJPEG_HAS_LROTL
77 #define RJPEG_LROT(x,y) _lrotl(x,y)
79 #define RJPEG_LROT(x,y) (((x) << (y)) | ((x) >> (32 - (y))))
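/* Worked example of the fallback rotate: RJPEG_LROT(0x80000001, 4) evaluates
 * to (0x80000001 << 4) | (0x80000001 >> 28) = 0x00000010 | 0x00000008 = 0x18,
 * i.e. the top four bits wrap around to the bottom of the 32-bit word. */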
82 /* x86/x64 detection */
83 #if defined(__x86_64__) || defined(_M_X64)
84 #define RJPEG_X64_TARGET
85 #elif defined(__i386) || defined(_M_IX86)
86 #define RJPEG_X86_TARGET
89 #if defined(__GNUC__) && (defined(RJPEG_X86_TARGET) || defined(RJPEG_X64_TARGET)) && !defined(__SSE2__) && !defined(RJPEG_NO_SIMD)
90 /* NOTE: it's not clear whether we actually need this for the 64-bit path.
91  * gcc doesn't support SSE2 intrinsics unless you compile with -msse2,
92  * (but compiling with -msse2 allows the compiler to use SSE2 everywhere;
93  * this is just broken and gcc hasn't fixed it properly:
94  * http://www.virtualdub.org/blog/pivot/entry.php?id=363 )
99 #if defined(__MINGW32__) && defined(RJPEG_X86_TARGET) && !defined(RJPEG_MINGW_ENABLE_SSE2) && !defined(RJPEG_NO_SIMD)
100 /* Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid RJPEG_X64_TARGET
102 * 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
103 * Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
104 * As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
105 * simultaneously enabling "-mstackrealign".
107 * See https://github.com/nothings/stb/issues/81 for more information.
109 * So default to no SSE2 on 32-bit MinGW. If you've read this far and added
110 * -mstackrealign to your build settings, feel free to #define RJPEG_MINGW_ENABLE_SSE2.
112 #define RJPEG_NO_SIMD
115 #if defined(__SSE2__)
116 #include <emmintrin.h>
119 #define RJPEG_SIMD_ALIGN(type, name) __declspec(align(16)) type name
121 #define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
127 #if defined(RJPEG_NO_SIMD) && defined(RJPEG_NEON)
132 #include <arm_neon.h>
133 /* assume GCC or Clang on ARM targets */
134 #define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
137 #ifndef RJPEG_SIMD_ALIGN
138 #define RJPEG_SIMD_ALIGN(type, name) type name
144 uint8_t *img_buffer_end;
145 uint8_t *img_buffer_original;
151 uint8_t buffer_start[128];
154 static INLINE uint8_t rjpeg_get8(rjpeg_context *s)
156 if (s->img_buffer < s->img_buffer_end)
157 return *s->img_buffer++;
162 #define RJPEG_AT_EOF(s) ((s)->img_buffer >= (s)->img_buffer_end)
164 #define RJPEG_GET16BE(s) ((rjpeg_get8((s)) << 8) + rjpeg_get8((s)))
166 /* huffman decoding acceleration */
167 #define FAST_BITS 9 /* larger handles more cases; smaller stomps less cache */
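/* With FAST_BITS == 9 the acceleration table below has 1 << 9 = 512 entries:
 * any Huffman code of 9 bits or fewer decodes with a single lookup on the top
 * 9 bits of the bit buffer, and only longer codes fall back to the slower
 * per-length maxcode[] search. */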
171 unsigned int maxcode[18];
172 int delta[17]; /* old 'firstsymbol' - old 'firstcode' */
173 /* weirdly, repacking this into AoS is a 10% speed loss, instead of a win */
175 uint8_t fast[1 << FAST_BITS];
184 void (*idct_block_kernel)(uint8_t *out, int out_stride, short data[64]);
185 void (*YCbCr_to_RGB_kernel)(uint8_t *out, const uint8_t *y, const uint8_t *pcb,
186 const uint8_t *pcr, int count, int step);
187 uint8_t *(*resample_row_hv_2_kernel)(uint8_t *out, uint8_t *in_near,
188 uint8_t *in_far, int w, int hs);
190 /* definition of jpeg image component */
194 void *raw_data, *raw_coeff;
196 short *coeff; /* progressive only */
204 int coeff_w; /* number of 8x8 coefficient blocks */
205 int coeff_h; /* number of 8x8 coefficient blocks */
208 /* sizes for components, interleaved MCUs */
209 int img_h_max, img_v_max;
210 int img_mcu_x, img_mcu_y;
211 int img_mcu_w, img_mcu_h;
213 int code_bits; /* number of valid bits */
214 int nomore; /* flag if we saw a marker so must stop */
221 int scan_n, order[4];
222 int restart_interval, todo;
223 uint32_t code_buffer; /* jpeg entropy-coded buffer */
224 rjpeg_huffman huff_dc[4]; /* unsigned int alignment */
225 rjpeg_huffman huff_ac[4]; /* unsigned int alignment */
226 int16_t fast_ac[4][1 << FAST_BITS];
227 unsigned char marker; /* marker seen while filling entropy buffer */
228 uint8_t dequant[4][64];
231 #define RJPEG_F2F(x) ((int) (((x) * 4096 + 0.5)))
232 #define RJPEG_FSH(x) ((x) << 12)
234 #define RJPEG_MARKER_NONE 0xff
235 /* if there's a pending marker from the entropy stream, return that;
236  * otherwise, fetch from the stream and get a marker. If there's no
237  * marker, return 0xff, which is never a valid marker value
240 /* in each scan, we'll have scan_n components, and the order
241 * of the components is specified by order[]
243 #define RJPEG_RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7)
245 #define JPEG_MARKER 0xFF
246 #define JPEG_MARKER_SOI 0xD8
247 #define JPEG_MARKER_SOS 0xDA
248 #define JPEG_MARKER_EOI 0xD9
249 #define JPEG_MARKER_APP1 0xE1
250 #define JPEG_MARKER_APP2 0xE2
252 /* use comparisons since in some cases we handle more than one case (e.g. SOF) */
253 #define RJPEG_SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
255 #define RJPEG_SOF_PROGRESSIVE(x) ((x) == 0xc2)
256 #define RJPEG_DIV4(x) ((uint8_t) ((x) >> 2))
257 #define RJPEG_DIV16(x) ((uint8_t) ((x) >> 4))
259 static int rjpeg_build_huffman(rjpeg_huffman *h, int *count)
263 /* build size list for each symbol (from JPEG spec) */
264 for (i = 0; i < 16; ++i)
265 for (j = 0; j < count[i]; ++j)
266 h->size[k++] = (uint8_t) (i+1);
269 /* compute actual symbols (from jpeg spec) */
273 for (j = 1; j <= 16; ++j)
275 /* compute delta to add to code to compute symbol id */
276 h->delta[j] = k - code;
279 while (h->size[k] == j)
280 h->code[k++] = (uint16_t) (code++);
282 /* Bad code lengths, corrupt JPEG? */
283 if (code-1 >= (1 << j))
286 /* compute largest code + 1 for this size, preshifted as needed later */
287 h->maxcode[j] = code << (16-j);
290 h->maxcode[j] = 0xffffffff;
292 /* build non-spec acceleration table; 255 is flag for not-accelerated */
293 memset(h->fast, 255, 1 << FAST_BITS);
294 for (i = 0; i < k; ++i)
299 int c = h->code[i] << (FAST_BITS-s);
300 int m = 1 << (FAST_BITS-s);
301 for (j = 0; j < m; ++j)
302 h->fast[c+j] = (uint8_t) i;
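/* Worked example of the fill above: a 3-bit code 0b101 is written into every
 * fast[] slot whose top 3 bits are 101, i.e. indices 0b101000000..0b101111111
 * (1 << (FAST_BITS - 3) = 64 consecutive entries), so any 9-bit window that
 * starts with that code maps straight to its symbol index. */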
308 /* build a table that decodes both magnitude and value of small ACs in one go */
310 static void rjpeg_build_fast_ac(int16_t *fast_ac, rjpeg_huffman *h)
314 for (i = 0; i < (1 << FAST_BITS); ++i)
316 uint8_t fast = h->fast[i];
322 int rs = h->values[fast];
323 int run = (rs >> 4) & 15;
324 int magbits = rs & 15;
325 int len = h->size[fast];
327 if (magbits && len + magbits <= FAST_BITS)
329 /* magnitude code followed by receive_extend code */
330 int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
331 int m = 1 << (magbits - 1);
333 k += (-1 << magbits) + 1;
335 /* if the result is small enough, we can fit it in fast_ac table */
336 if (k >= -128 && k <= 127)
337 fast_ac[i] = (int16_t) ((k << 8) + (run << 4) + (len + magbits));
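/* The packed layout mirrors how the decoder later unpacks it:
 *    value  = fast_ac[i] >> 8         (signed, hence the -128..127 check)
 *    run    = (fast_ac[i] >> 4) & 15
 *    length = fast_ac[i] & 15         (huffman bits + magnitude bits consumed)
 * e.g. k = 3, run = 2, len + magbits = 5 packs to (3 << 8) + (2 << 4) + 5 = 0x325. */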
343 static void rjpeg_grow_buffer_unsafe(rjpeg_jpeg *j)
347 int b = j->nomore ? 0 : rjpeg_get8(j->s);
350 int c = rjpeg_get8(j->s);
354 j->marker = (unsigned char) c;
359 j->code_buffer |= b << (24 - j->code_bits);
361 } while (j->code_bits <= 24);
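/* Bit-buffer invariant maintained by the refill above: code_buffer holds the
 * next code_bits bits of entropy-coded data left-justified (MSB first), so
 * peeking at the top N bits is just (code_buffer >> (32 - N)).  The loop
 * refills one byte at a time until more than 24 bits are valid, and a 0xFF
 * byte followed by a non-zero byte is stashed in j->marker instead of being
 * fed into the buffer. */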
365 static uint32_t rjpeg_bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
367 /* decode a JPEG huffman value from the bitstream */
368 static INLINE int rjpeg_jpeg_huff_decode(rjpeg_jpeg *j, rjpeg_huffman *h)
373 if (j->code_bits < 16)
374 rjpeg_grow_buffer_unsafe(j);
376 /* look at the top FAST_BITS and determine what symbol ID it is,
377 * if the code is <= FAST_BITS */
378 c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
384 if (s > j->code_bits)
386 j->code_buffer <<= s;
391 /* naive test is to shift the code_buffer down so k bits are
392 * valid, then test against maxcode. To speed this up, we've
393 * preshifted maxcode left so that it has (16-k) 0s at the
394 * end; in other words, regardless of the number of bits, it
395 * wants to be compared against something shifted to have 16;
396 * that way we don't need to shift inside the loop. */
397 temp = j->code_buffer >> 16;
398 for (k=FAST_BITS+1 ; ; ++k)
399 if (temp < h->maxcode[k])
404 /* error! code not found */
409 if (k > j->code_bits)
412 /* convert the huffman code to the symbol id */
413 c = ((j->code_buffer >> (32 - k)) & rjpeg_bmask[k]) + h->delta[k];
415 /* convert the id to a symbol */
417 j->code_buffer <<= k;
421 /* bias[n] = (-1<<n) + 1 */
422 static int const rjpeg_jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
424 /* combined JPEG 'receive' and JPEG 'extend', since baseline
425 * always extends everything it receives. */
426 static INLINE int rjpeg_extend_receive(rjpeg_jpeg *j, int n)
430 if (j->code_bits < n)
431 rjpeg_grow_buffer_unsafe(j);
433 sgn = (int32_t)j->code_buffer >> 31; /* sign bit is always in MSB */
434 k = RJPEG_LROT(j->code_buffer, n);
435 j->code_buffer = k & ~rjpeg_bmask[n];
438 return k + (rjpeg_jbias[n] & ~sgn);
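/* Worked example of the combined receive/extend above: for n = 3 the bias is
 * rjpeg_jbias[3] = -7.  If the next three bits are 0b101 (top bit set), sgn is
 * all ones, ~sgn masks the bias away and the result is 5; if they are 0b010
 * (top bit clear) the bias applies and the result is 2 + (-7) = -5, matching
 * the EXTEND procedure in the JPEG spec. */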
441 /* get some unsigned bits */
442 static INLINE int rjpeg_jpeg_get_bits(rjpeg_jpeg *j, int n)
445 if (j->code_bits < n)
446 rjpeg_grow_buffer_unsafe(j);
447 k = RJPEG_LROT(j->code_buffer, n);
448 j->code_buffer = k & ~rjpeg_bmask[n];
454 static INLINE int rjpeg_jpeg_get_bit(rjpeg_jpeg *j)
457 if (j->code_bits < 1)
458 rjpeg_grow_buffer_unsafe(j);
461 j->code_buffer <<= 1;
463 return k & 0x80000000;
466 /* given a value that's at position X in the zigzag stream,
467 * where does it appear in the 8x8 matrix coded as row-major? */
468 static uint8_t rjpeg_jpeg_dezigzag[64+15] =
470 0, 1, 8, 16, 9, 2, 3, 10,
471 17, 24, 32, 25, 18, 11, 4, 5,
472 12, 19, 26, 33, 40, 48, 41, 34,
473 27, 20, 13, 6, 7, 14, 21, 28,
474 35, 42, 49, 56, 57, 50, 43, 36,
475 29, 22, 15, 23, 30, 37, 44, 51,
476 58, 59, 52, 45, 38, 31, 39, 46,
477 53, 60, 61, 54, 47, 55, 62, 63,
478 /* let corrupt input sample past end */
479 63, 63, 63, 63, 63, 63, 63, 63,
480 63, 63, 63, 63, 63, 63, 63
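/* Example: the third coefficient of the zigzag scan (index 2) lands at
 * rjpeg_jpeg_dezigzag[2] = 8, i.e. row 1, column 0 of the row-major 8x8 block,
 * and index 3 maps to 16 (row 2, column 0) before the scan turns back up
 * through 9 and 2. */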
483 /* decode one 64-entry block-- */
484 static int rjpeg_jpeg_decode_block(
485 rjpeg_jpeg *j, short data[64],
496 if (j->code_bits < 16)
497 rjpeg_grow_buffer_unsafe(j);
498 t = rjpeg_jpeg_huff_decode(j, hdc);
500 /* Bad huffman code. Corrupt JPEG? */
504 /* 0 all the ac values now so we can do it 32-bits at a time */
505 memset(data,0,64*sizeof(data[0]));
508 diff = rjpeg_extend_receive(j, t);
509 dc = j->img_comp[b].dc_pred + diff;
510 j->img_comp[b].dc_pred = dc;
511 data[0] = (short) (dc * dequant[0]);
513 /* decode AC components, see JPEG spec */
519 if (j->code_bits < 16)
520 rjpeg_grow_buffer_unsafe(j);
521 c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
526 k += (r >> 4) & 15; /* run */
527 s = r & 15; /* combined length */
528 j->code_buffer <<= s;
530 /* decode into unzigzag'd location */
531 zig = rjpeg_jpeg_dezigzag[k++];
532 data[zig] = (short) ((r >> 8) * dequant[zig]);
536 int rs = rjpeg_jpeg_huff_decode(j, hac);
538 /* Bad huffman code. Corrupt JPEG? */
547 break; /* end block */
553 /* decode into unzigzag'd location */
554 zig = rjpeg_jpeg_dezigzag[k++];
555 data[zig] = (short) (rjpeg_extend_receive(j,s) * dequant[zig]);
562 static int rjpeg_jpeg_decode_block_prog_dc(
568 /* Can't merge DC and AC. Corrupt JPEG? */
569 if (j->spec_end != 0)
572 if (j->code_bits < 16)
573 rjpeg_grow_buffer_unsafe(j);
575 if (j->succ_high == 0)
581 /* first scan for DC coefficient, must be first */
582 memset(data,0,64*sizeof(data[0])); /* 0 all the ac values now */
583 t = rjpeg_jpeg_huff_decode(j, hdc);
585 diff = rjpeg_extend_receive(j, t);
587 dc = j->img_comp[b].dc_pred + diff;
588 j->img_comp[b].dc_pred = dc;
589 data[0] = (short) (dc << j->succ_low);
593 /* refinement scan for DC coefficient */
594 if (rjpeg_jpeg_get_bit(j))
595 data[0] += (short) (1 << j->succ_low);
600 static int rjpeg_jpeg_decode_block_prog_ac(
608 /* Can't merge DC and AC. Corrupt JPEG? */
609 if (j->spec_start == 0)
612 if (j->succ_high == 0)
614 int shift = j->succ_low;
627 if (j->code_bits < 16)
628 rjpeg_grow_buffer_unsafe(j);
629 c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
634 k += (r >> 4) & 15; /* run */
635 s = r & 15; /* combined length */
636 j->code_buffer <<= s;
638 zig = rjpeg_jpeg_dezigzag[k++];
639 data[zig] = (short) ((r >> 8) << shift);
643 int rs = rjpeg_jpeg_huff_decode(j, hac);
645 /* Bad huffman code. Corrupt JPEG? */
655 j->eob_run = (1 << r);
657 j->eob_run += rjpeg_jpeg_get_bits(j, r);
666 zig = rjpeg_jpeg_dezigzag[k++];
667 data[zig] = (short) (rjpeg_extend_receive(j,s) << shift);
670 } while (k <= j->spec_end);
674 /* refinement scan for these AC coefficients */
676 short bit = (short) (1 << j->succ_low);
681 for (k = j->spec_start; k <= j->spec_end; ++k)
683 short *p = &data[rjpeg_jpeg_dezigzag[k]];
685 if (rjpeg_jpeg_get_bit(j))
701 int rs = rjpeg_jpeg_huff_decode(j, hac);
703 /* Bad huffman code. Corrupt JPEG? */
713 j->eob_run = (1 << r) - 1;
715 j->eob_run += rjpeg_jpeg_get_bits(j, r);
716 r = 64; /* force end of block */
720 /* r=15 s=0 should write 16 0s, so we just do
721 * a run of 15 0s and then write s (which is 0),
722 * so we don't have to do anything special here */
727 /* Bad huffman code. Corrupt JPEG? */
732 if (rjpeg_jpeg_get_bit(j))
739 while (k <= j->spec_end)
741 short *p = &data[rjpeg_jpeg_dezigzag[k++]];
744 if (rjpeg_jpeg_get_bit(j))
763 } while (k <= j->spec_end);
769 /* take a -128..127 value, clamp it, and convert to 0..255 */
770 static INLINE uint8_t rjpeg_clamp(int x)
772 /* trick to use a single test to catch both cases */
773 if ((unsigned int) x > 255)
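/* The single unsigned comparison above catches both directions at once:
 * x = -3 becomes 0xFFFFFFFD when viewed as unsigned (> 255, so it clamps to
 * 0), and x = 300 is > 255 directly (so it clamps to 255); values already in
 * 0..255 fall through untouched. */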
778 /* derived from jidctint -- DCT_ISLOW */
779 #define RJPEG_IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
780 int t0,t1,p4,p5,x0,x1,x2,x3; \
783 int p1 = (p2+p3) * RJPEG_F2F(0.5411961f); \
784 int t2 = p1 + p3 * RJPEG_F2F(-1.847759065f);\
785 int t3 = p1 + p2 * RJPEG_F2F( 0.765366865f);\
788 t0 = RJPEG_FSH(p2+p3); \
789 t1 = RJPEG_FSH(p2-p3); \
802 p5 = (p3+p4) * RJPEG_F2F( 1.175875602f); \
803 t0 = t0 * RJPEG_F2F( 0.298631336f); \
804 t1 = t1 * RJPEG_F2F( 2.053119869f); \
805 t2 = t2 * RJPEG_F2F( 3.072711026f); \
806 t3 = t3 * RJPEG_F2F( 1.501321110f); \
807 p1 = p5 + p1 * RJPEG_F2F(-0.899976223f); \
808 p2 = p5 + p2 * RJPEG_F2F(-2.562915447f); \
809 p3 = p3 * RJPEG_F2F(-1.961570560f); \
810 p4 = p4 * RJPEG_F2F(-0.390180644f); \
816 static void rjpeg_idct_block(uint8_t *out, int out_stride, short data[64])
818 int i,val[64],*v=val;
823 for (i = 0; i < 8; ++i,++d, ++v)
825 /* if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing */
834 /* no shortcut 0 seconds
835 * (1|2|3|4|5|6|7)==0 0 seconds
836 * all separate -0.047 seconds
837 * 1 && 2|3 && 4|5 && 6|7: -0.047 seconds */
838 int dcterm = d[0] << 2;
839 v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
843 RJPEG_IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56]);
845 /* constants scaled things up by 1<<12; let's bring them back
846 * down, but keep 2 extra bits of precision */
852 v[ 0] = (x0+t3) >> 10;
853 v[56] = (x0-t3) >> 10;
854 v[ 8] = (x1+t2) >> 10;
855 v[48] = (x1-t2) >> 10;
856 v[16] = (x2+t1) >> 10;
857 v[40] = (x2-t1) >> 10;
858 v[24] = (x3+t0) >> 10;
859 v[32] = (x3-t0) >> 10;
863 for (i = 0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride)
865 /* no fast case since the first 1D IDCT spread components out */
866 RJPEG_IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7]);
868 /* constants scaled things up by 1<<12, plus we had 1<<2 from first
869 * loop, plus horizontal and vertical each scale by sqrt(8) so together
870 * we've got an extra 1<<3, so 1<<17 total we need to remove.
871 * so we want to round that, which means adding 0.5 * 1<<17,
872 * aka 65536. Also, we'll end up with -128 to 127 that we want
873 * to encode as 0..255 by adding 128, so we'll add that before the shift
875 x0 += 65536 + (128<<17);
876 x1 += 65536 + (128<<17);
877 x2 += 65536 + (128<<17);
878 x3 += 65536 + (128<<17);
880 /* Tried computing the shifts into temps, or'ing the temps to see
881 * if any were out of range, but that was slower */
882 o[0] = rjpeg_clamp((x0+t3) >> 17);
883 o[7] = rjpeg_clamp((x0-t3) >> 17);
884 o[1] = rjpeg_clamp((x1+t2) >> 17);
885 o[6] = rjpeg_clamp((x1-t2) >> 17);
886 o[2] = rjpeg_clamp((x2+t1) >> 17);
887 o[5] = rjpeg_clamp((x2-t1) >> 17);
888 o[3] = rjpeg_clamp((x3+t0) >> 17);
889 o[4] = rjpeg_clamp((x3-t0) >> 17);
893 #if defined(__SSE2__)
894 /* sse2 integer IDCT. not the fastest possible implementation but it
895 * produces bit-identical results to the generic C version so it's
896 * fully "transparent".
898 static void rjpeg_idct_simd(uint8_t *out, int out_stride, short data[64])
900 /* This is constructed to match our regular (generic) integer IDCT exactly. */
901 __m128i row0, row1, row2, row3, row4, row5, row6, row7;
904 /* dot product constant: even elems=x, odd elems=y */
905 #define dct_const(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
907 /* out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit)
908 * out(1) = c1[even]*x + c1[odd]*y
910 #define dct_rot(out0,out1, x,y,c0,c1) \
911 __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
912 __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
913 __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
914 __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
915 __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
916 __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
918 /* out = in << 12 (in 16-bit, out 32-bit) */
919 #define dct_widen(out, in) \
920 __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
921 __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
924 #define dct_wadd(out, a, b) \
925 __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
926 __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
929 #define dct_wsub(out, a, b) \
930 __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
931 __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
933 /* butterfly a/b, add bias, then shift by "s" and pack */
934 #define dct_bfly32o(out0, out1, a,b,bias,s) \
936 __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
937 __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
938 dct_wadd(sum, abiased, b); \
939 dct_wsub(dif, abiased, b); \
940 out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
941 out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
944 /* 8-bit interleave step (for transposes) */
945 #define dct_interleave8(a, b) \
947 a = _mm_unpacklo_epi8(a, b); \
948 b = _mm_unpackhi_epi8(tmp, b)
950 /* 16-bit interleave step (for transposes) */
951 #define dct_interleave16(a, b) \
953 a = _mm_unpacklo_epi16(a, b); \
954 b = _mm_unpackhi_epi16(tmp, b)
956 #define dct_pass(bias,shift) \
959 dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
960 __m128i sum04 = _mm_add_epi16(row0, row4); \
961 __m128i dif04 = _mm_sub_epi16(row0, row4); \
962 dct_widen(t0e, sum04); \
963 dct_widen(t1e, dif04); \
964 dct_wadd(x0, t0e, t3e); \
965 dct_wsub(x3, t0e, t3e); \
966 dct_wadd(x1, t1e, t2e); \
967 dct_wsub(x2, t1e, t2e); \
969 dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
970 dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
971 __m128i sum17 = _mm_add_epi16(row1, row7); \
972 __m128i sum35 = _mm_add_epi16(row3, row5); \
973 dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
974 dct_wadd(x4, y0o, y4o); \
975 dct_wadd(x5, y1o, y5o); \
976 dct_wadd(x6, y2o, y5o); \
977 dct_wadd(x7, y3o, y4o); \
978 dct_bfly32o(row0,row7, x0,x7,bias,shift); \
979 dct_bfly32o(row1,row6, x1,x6,bias,shift); \
980 dct_bfly32o(row2,row5, x2,x5,bias,shift); \
981 dct_bfly32o(row3,row4, x3,x4,bias,shift); \
984 __m128i rot0_0 = dct_const(RJPEG_F2F(0.5411961f), RJPEG_F2F(0.5411961f) + RJPEG_F2F(-1.847759065f));
985 __m128i rot0_1 = dct_const(RJPEG_F2F(0.5411961f) + RJPEG_F2F( 0.765366865f), RJPEG_F2F(0.5411961f));
986 __m128i rot1_0 = dct_const(RJPEG_F2F(1.175875602f) + RJPEG_F2F(-0.899976223f), RJPEG_F2F(1.175875602f));
987 __m128i rot1_1 = dct_const(RJPEG_F2F(1.175875602f), RJPEG_F2F(1.175875602f) + RJPEG_F2F(-2.562915447f));
988 __m128i rot2_0 = dct_const(RJPEG_F2F(-1.961570560f) + RJPEG_F2F( 0.298631336f), RJPEG_F2F(-1.961570560f));
989 __m128i rot2_1 = dct_const(RJPEG_F2F(-1.961570560f), RJPEG_F2F(-1.961570560f) + RJPEG_F2F( 3.072711026f));
990 __m128i rot3_0 = dct_const(RJPEG_F2F(-0.390180644f) + RJPEG_F2F( 2.053119869f), RJPEG_F2F(-0.390180644f));
991 __m128i rot3_1 = dct_const(RJPEG_F2F(-0.390180644f), RJPEG_F2F(-0.390180644f) + RJPEG_F2F( 1.501321110f));
993 /* rounding biases in column/row passes, see rjpeg_idct_block for explanation. */
994 __m128i bias_0 = _mm_set1_epi32(512);
995 __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
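/* These mirror the scalar path exactly: 512 is the 0.5 * 2^10 rounding term
 * for the >> 10 column pass, and 65536 + (128 << 17) is the rounding term plus
 * the +128 re-centering applied before the >> 17 in rjpeg_idct_block's row
 * pass. */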
998 row0 = _mm_load_si128((const __m128i *) (data + 0*8));
999 row1 = _mm_load_si128((const __m128i *) (data + 1*8));
1000 row2 = _mm_load_si128((const __m128i *) (data + 2*8));
1001 row3 = _mm_load_si128((const __m128i *) (data + 3*8));
1002 row4 = _mm_load_si128((const __m128i *) (data + 4*8));
1003 row5 = _mm_load_si128((const __m128i *) (data + 5*8));
1004 row6 = _mm_load_si128((const __m128i *) (data + 6*8));
1005 row7 = _mm_load_si128((const __m128i *) (data + 7*8));
1008 dct_pass(bias_0, 10);
1011 /* 16bit 8x8 transpose pass 1 */
1012 dct_interleave16(row0, row4);
1013 dct_interleave16(row1, row5);
1014 dct_interleave16(row2, row6);
1015 dct_interleave16(row3, row7);
1017 /* transpose pass 2 */
1018 dct_interleave16(row0, row2);
1019 dct_interleave16(row1, row3);
1020 dct_interleave16(row4, row6);
1021 dct_interleave16(row5, row7);
1023 /* transpose pass 3 */
1024 dct_interleave16(row0, row1);
1025 dct_interleave16(row2, row3);
1026 dct_interleave16(row4, row5);
1027 dct_interleave16(row6, row7);
1031 dct_pass(bias_1, 17);
1035 __m128i p0 = _mm_packus_epi16(row0, row1); /* a0a1a2a3...a7b0b1b2b3...b7 */
1036 __m128i p1 = _mm_packus_epi16(row2, row3);
1037 __m128i p2 = _mm_packus_epi16(row4, row5);
1038 __m128i p3 = _mm_packus_epi16(row6, row7);
1040 /* 8bit 8x8 transpose pass 1 */
1041 dct_interleave8(p0, p2); /* a0e0a1e1... */
1042 dct_interleave8(p1, p3); /* c0g0c1g1... */
1044 /* transpose pass 2 */
1045 dct_interleave8(p0, p1); /* a0c0e0g0... */
1046 dct_interleave8(p2, p3); /* b0d0f0h0... */
1048 /* transpose pass 3 */
1049 dct_interleave8(p0, p2); /* a0b0c0d0... */
1050 dct_interleave8(p1, p3); /* a4b4c4d4... */
1053 _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
1054 _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
1055 _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
1056 _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
1057 _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
1058 _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
1059 _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
1060 _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
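/* Each packed register holds two of the eight output scanlines in its low and
 * high 8 bytes; _mm_storel_epi64 writes the low half, and the 0x4e shuffle
 * swaps the two 64-bit halves so the row sitting in the upper half can be
 * written with the same 8-byte store. */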
1069 #undef dct_interleave8
1070 #undef dct_interleave16
1078 /* NEON integer IDCT. should produce bit-identical
1079 * results to the generic C version. */
1080 static void rjpeg_idct_simd(uint8_t *out, int out_stride, short data[64])
1082 int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
1084 int16x4_t rot0_0 = vdup_n_s16(RJPEG_F2F(0.5411961f));
1085 int16x4_t rot0_1 = vdup_n_s16(RJPEG_F2F(-1.847759065f));
1086 int16x4_t rot0_2 = vdup_n_s16(RJPEG_F2F( 0.765366865f));
1087 int16x4_t rot1_0 = vdup_n_s16(RJPEG_F2F( 1.175875602f));
1088 int16x4_t rot1_1 = vdup_n_s16(RJPEG_F2F(-0.899976223f));
1089 int16x4_t rot1_2 = vdup_n_s16(RJPEG_F2F(-2.562915447f));
1090 int16x4_t rot2_0 = vdup_n_s16(RJPEG_F2F(-1.961570560f));
1091 int16x4_t rot2_1 = vdup_n_s16(RJPEG_F2F(-0.390180644f));
1092 int16x4_t rot3_0 = vdup_n_s16(RJPEG_F2F( 0.298631336f));
1093 int16x4_t rot3_1 = vdup_n_s16(RJPEG_F2F( 2.053119869f));
1094 int16x4_t rot3_2 = vdup_n_s16(RJPEG_F2F( 3.072711026f));
1095 int16x4_t rot3_3 = vdup_n_s16(RJPEG_F2F( 1.501321110f));
1097 #define dct_long_mul(out, inq, coeff) \
1098 int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
1099 int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
1101 #define dct_long_mac(out, acc, inq, coeff) \
1102 int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
1103 int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
1105 #define dct_widen(out, inq) \
1106 int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
1107 int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
1110 #define dct_wadd(out, a, b) \
1111 int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
1112 int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
1115 #define dct_wsub(out, a, b) \
1116 int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
1117 int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
1119 /* butterfly a/b, then shift using "shiftop" by "s" and pack */
1120 #define dct_bfly32o(out0,out1, a,b,shiftop,s) \
1122 dct_wadd(sum, a, b); \
1123 dct_wsub(dif, a, b); \
1124 out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
1125 out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
1128 #define dct_pass(shiftop, shift) \
1131 int16x8_t sum26 = vaddq_s16(row2, row6); \
1132 dct_long_mul(p1e, sum26, rot0_0); \
1133 dct_long_mac(t2e, p1e, row6, rot0_1); \
1134 dct_long_mac(t3e, p1e, row2, rot0_2); \
1135 int16x8_t sum04 = vaddq_s16(row0, row4); \
1136 int16x8_t dif04 = vsubq_s16(row0, row4); \
1137 dct_widen(t0e, sum04); \
1138 dct_widen(t1e, dif04); \
1139 dct_wadd(x0, t0e, t3e); \
1140 dct_wsub(x3, t0e, t3e); \
1141 dct_wadd(x1, t1e, t2e); \
1142 dct_wsub(x2, t1e, t2e); \
1144 int16x8_t sum15 = vaddq_s16(row1, row5); \
1145 int16x8_t sum17 = vaddq_s16(row1, row7); \
1146 int16x8_t sum35 = vaddq_s16(row3, row5); \
1147 int16x8_t sum37 = vaddq_s16(row3, row7); \
1148 int16x8_t sumodd = vaddq_s16(sum17, sum35); \
1149 dct_long_mul(p5o, sumodd, rot1_0); \
1150 dct_long_mac(p1o, p5o, sum17, rot1_1); \
1151 dct_long_mac(p2o, p5o, sum35, rot1_2); \
1152 dct_long_mul(p3o, sum37, rot2_0); \
1153 dct_long_mul(p4o, sum15, rot2_1); \
1154 dct_wadd(sump13o, p1o, p3o); \
1155 dct_wadd(sump24o, p2o, p4o); \
1156 dct_wadd(sump23o, p2o, p3o); \
1157 dct_wadd(sump14o, p1o, p4o); \
1158 dct_long_mac(x4, sump13o, row7, rot3_0); \
1159 dct_long_mac(x5, sump24o, row5, rot3_1); \
1160 dct_long_mac(x6, sump23o, row3, rot3_2); \
1161 dct_long_mac(x7, sump14o, row1, rot3_3); \
1162 dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
1163 dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
1164 dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
1165 dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
1169 row0 = vld1q_s16(data + 0*8);
1170 row1 = vld1q_s16(data + 1*8);
1171 row2 = vld1q_s16(data + 2*8);
1172 row3 = vld1q_s16(data + 3*8);
1173 row4 = vld1q_s16(data + 4*8);
1174 row5 = vld1q_s16(data + 5*8);
1175 row6 = vld1q_s16(data + 6*8);
1176 row7 = vld1q_s16(data + 7*8);
1179 row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
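/* The 1024 added to the DC lane above is the +128 level shift folded into the
 * transform input: a pure DC value of d contributes d/8 to every output
 * sample, so 1024/8 = 128 ends up added to each pixel, taking the place of the
 * explicit "+ (128 << 17)" bias used by the scalar and SSE2 paths. */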
1182 dct_pass(vrshrn_n_s32, 10);
1184 /* 16bit 8x8 transpose */
1186 /* these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
1187 * whether compilers actually get this is another story, sadly. */
1188 #define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
1189 #define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
1190 #define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
1193 dct_trn16(row0, row1); /* a0b0a2b2a4b4a6b6 */
1194 dct_trn16(row2, row3);
1195 dct_trn16(row4, row5);
1196 dct_trn16(row6, row7);
1199 dct_trn32(row0, row2); /* a0b0c0d0a4b4c4d4 */
1200 dct_trn32(row1, row3);
1201 dct_trn32(row4, row6);
1202 dct_trn32(row5, row7);
1205 dct_trn64(row0, row4); /* a0b0c0d0e0f0g0h0 */
1206 dct_trn64(row1, row5);
1207 dct_trn64(row2, row6);
1208 dct_trn64(row3, row7);
1216 * vrshrn_n_s32 only supports shifts up to 16, but we need
1217 * 17, so do a non-rounding shift of 16 first and then follow
1218 * up with a rounding shift by 1. */
1219 dct_pass(vshrn_n_s32, 16);
1222 /* pack and round */
1223 uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
1224 uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
1225 uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
1226 uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
1227 uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
1228 uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
1229 uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
1230 uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
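/* Because dct_pass above used the non-rounding vshrn_n_s32 by 16, these
 * vqrshrun_n_s16(..., 1) calls finish the job: a rounding shift by one more
 * bit plus unsigned saturation, which together reproduce the scalar path's
 * rounded >> 17 followed by rjpeg_clamp. */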
1232 /* again, these can translate into one instruction, but often don't. */
1233 #define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
1234 #define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
1235 #define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
1237 /* sadly can't use interleaved stores here since we only write
1238 * 8 bytes to each scan line! */
1240 /* 8x8 8-bit transpose pass 1 */
1247 dct_trn8_16(p0, p2);
1248 dct_trn8_16(p1, p3);
1249 dct_trn8_16(p4, p6);
1250 dct_trn8_16(p5, p7);
1253 dct_trn8_32(p0, p4);
1254 dct_trn8_32(p1, p5);
1255 dct_trn8_32(p2, p6);
1256 dct_trn8_32(p3, p7);
1289 #endif /* RJPEG_NEON */
1291 static uint8_t rjpeg_get_marker(rjpeg_jpeg *j)
1295 if (j->marker != RJPEG_MARKER_NONE)
1298 j->marker = RJPEG_MARKER_NONE;
1302 x = rjpeg_get8(j->s);
1304 return RJPEG_MARKER_NONE;
1306 x = rjpeg_get8(j->s);
1310 /* after a restart interval, rjpeg_jpeg_reset the entropy decoder and the DC prediction */
1313 static void rjpeg_jpeg_reset(rjpeg_jpeg *j)
1318 j->img_comp[0].dc_pred = 0;
1319 j->img_comp[1].dc_pred = 0;
1320 j->img_comp[2].dc_pred = 0;
1321 j->marker = RJPEG_MARKER_NONE;
1322 j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
1325 /* no more than 1<<31 MCUs if no restart_interval? that's plenty safe,
1326 * since we don't even allow 1<<30 pixels */
1329 static int rjpeg_parse_entropy_coded_data(rjpeg_jpeg *z)
1331 rjpeg_jpeg_reset(z);
1336 int n = z->order[0];
1337 int w = (z->img_comp[n].x+7) >> 3;
1338 int h = (z->img_comp[n].y+7) >> 3;
1340 /* non-interleaved data, we just need to process one block at a time,
1341 * in trivial scanline order
1342 * number of blocks to do just depends on how many actual "pixels" this
1343 * component has, independent of interleaved MCU blocking and such */
1347 for (j = 0; j < h; ++j)
1349 for (i = 0; i < w; ++i)
1351 short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
1353 if (z->spec_start == 0)
1355 if (!rjpeg_jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
1360 int ha = z->img_comp[n].ha;
1361 if (!rjpeg_jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
1365 /* every data block is an MCU, so count down the restart interval */
1368 if (z->code_bits < 24)
1369 rjpeg_grow_buffer_unsafe(z);
1371 if (!RJPEG_RESTART(z->marker))
1373 rjpeg_jpeg_reset(z);
1380 RJPEG_SIMD_ALIGN(short, data[64]);
1382 for (j = 0; j < h; ++j)
1384 for (i = 0; i < w; ++i)
1386 int ha = z->img_comp[n].ha;
1387 if (!rjpeg_jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd,
1388 z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq]))
1391 z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8,
1392 z->img_comp[n].w2, data);
1394 /* every data block is an MCU, so count down the restart interval */
1397 if (z->code_bits < 24)
1398 rjpeg_grow_buffer_unsafe(z);
1400 /* if it's NOT a restart, then just bail,
1401 * so we get corrupt data rather than no data */
1402 if (!RJPEG_RESTART(z->marker))
1404 rjpeg_jpeg_reset(z);
1417 for (j = 0; j < z->img_mcu_y; ++j)
1419 for (i = 0; i < z->img_mcu_x; ++i)
1421 /* scan an interleaved MCU... process scan_n components in order */
1422 for (k = 0; k < z->scan_n; ++k)
1424 int n = z->order[k];
1425 /* scan out an MCU's worth of this component; that's just determined
1426 * by the basic H and V specified for the component */
1427 for (y = 0; y < z->img_comp[n].v; ++y)
1429 for (x = 0; x < z->img_comp[n].h; ++x)
1431 int x2 = (i*z->img_comp[n].h + x);
1432 int y2 = (j*z->img_comp[n].v + y);
1433 short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
1434 if (!rjpeg_jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
1440 /* after all interleaved components, that's an interleaved MCU,
1441 * so now count down the restart interval */
1444 if (z->code_bits < 24)
1445 rjpeg_grow_buffer_unsafe(z);
1446 if (!RJPEG_RESTART(z->marker))
1448 rjpeg_jpeg_reset(z);
1455 RJPEG_SIMD_ALIGN(short, data[64]);
1457 for (j = 0; j < z->img_mcu_y; ++j)
1459 for (i = 0; i < z->img_mcu_x; ++i)
1461 /* scan an interleaved MCU... process scan_n components in order */
1462 for (k = 0; k < z->scan_n; ++k)
1464 int n = z->order[k];
1465 /* scan out an MCU's worth of this component; that's just determined
1466 * by the basic H and V specified for the component */
1467 for (y = 0; y < z->img_comp[n].v; ++y)
1469 for (x = 0; x < z->img_comp[n].h; ++x)
1471 int x2 = (i*z->img_comp[n].h + x)*8;
1472 int y2 = (j*z->img_comp[n].v + y)*8;
1473 int ha = z->img_comp[n].ha;
1475 if (!rjpeg_jpeg_decode_block(z, data,
1476 z->huff_dc+z->img_comp[n].hd,
1477 z->huff_ac+ha, z->fast_ac[ha],
1478 n, z->dequant[z->img_comp[n].tq]))
1481 z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2,
1482 z->img_comp[n].w2, data);
1487 /* after all interleaved components, that's an interleaved MCU,
1488 * so now count down the restart interval */
1491 if (z->code_bits < 24)
1492 rjpeg_grow_buffer_unsafe(z);
1493 if (!RJPEG_RESTART(z->marker))
1495 rjpeg_jpeg_reset(z);
1505 static void rjpeg_jpeg_dequantize(short *data, uint8_t *dequant)
1508 for (i = 0; i < 64; ++i)
1509 data[i] *= dequant[i];
1512 static void rjpeg_jpeg_finish(rjpeg_jpeg *z)
1516 if (!z->progressive)
1519 /* dequantize and IDCT the data */
1520 for (n = 0; n < z->s->img_n; ++n)
1522 int w = (z->img_comp[n].x+7) >> 3;
1523 int h = (z->img_comp[n].y+7) >> 3;
1524 for (j = 0; j < h; ++j)
1526 for (i = 0; i < w; ++i)
1528 short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
1529 rjpeg_jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
1530 z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8,
1531 z->img_comp[n].w2, data);
1537 static int rjpeg_process_marker(rjpeg_jpeg *z, int m)
1542 case RJPEG_MARKER_NONE: /* no marker found */
1543 /* Expected marker. Corrupt JPEG? */
1546 case 0xDD: /* DRI - specify restart interval */
1548 /* Bad DRI length. Corrupt JPEG? */
1549 if (RJPEG_GET16BE(z->s) != 4)
1552 z->restart_interval = RJPEG_GET16BE(z->s);
1555 case 0xDB: /* DQT - define quantization table */
1556 L = RJPEG_GET16BE(z->s)-2;
1559 int q = rjpeg_get8(z->s);
1563 /* Bad DQT type. Corrupt JPEG? */
1567 /* Bad DQT table. Corrupt JPEG? */
1571 for (i = 0; i < 64; ++i)
1572 z->dequant[t][rjpeg_jpeg_dezigzag[i]] = rjpeg_get8(z->s);
1577 case 0xC4: /* DHT - define huffman table */
1578 L = RJPEG_GET16BE(z->s)-2;
1581 int sizes[16],i,n = 0;
1583 int q = rjpeg_get8(z->s);
1587 /* Bad DHT header. Corrupt JPEG? */
1588 if (tc > 1 || th > 3)
1591 for (i = 0; i < 16; ++i)
1593 sizes[i] = rjpeg_get8(z->s);
1600 if (!rjpeg_build_huffman(z->huff_dc+th, sizes))
1602 v = z->huff_dc[th].values;
1606 if (!rjpeg_build_huffman(z->huff_ac+th, sizes))
1608 v = z->huff_ac[th].values;
1610 for (i = 0; i < n; ++i)
1611 v[i] = rjpeg_get8(z->s);
1613 rjpeg_build_fast_ac(z->fast_ac[th], z->huff_ac + th);
1619 /* check for comment block or APP blocks */
1620 if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE)
1622 int n = RJPEG_GET16BE(z->s)-2;
1625 z->s->img_buffer = z->s->img_buffer_end;
1627 z->s->img_buffer += n;
1634 /* after we see SOS */
1635 static int rjpeg_process_scan_header(rjpeg_jpeg *z)
1639 int Ls = RJPEG_GET16BE(z->s);
1641 z->scan_n = rjpeg_get8(z->s);
1643 /* Bad SOS component count. Corrupt JPEG? */
1644 if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n)
1647 /* Bad SOS length. Corrupt JPEG? */
1648 if (Ls != 6+2*z->scan_n)
1651 for (i = 0; i < z->scan_n; ++i)
1654 int id = rjpeg_get8(z->s);
1655 int q = rjpeg_get8(z->s);
1657 for (which = 0; which < z->s->img_n; ++which)
1658 if (z->img_comp[which].id == id)
1660 if (which == z->s->img_n)
1661 return 0; /* no match */
1663 /* Bad DC huff. Corrupt JPEG? */
1664 z->img_comp[which].hd = q >> 4; if (z->img_comp[which].hd > 3)
1667 /* Bad AC huff. Corrupt JPEG? */
1668 z->img_comp[which].ha = q & 15; if (z->img_comp[which].ha > 3)
1671 z->order[i] = which;
1674 z->spec_start = rjpeg_get8(z->s);
1675 z->spec_end = rjpeg_get8(z->s); /* should be 63, but might be 0 */
1676 aa = rjpeg_get8(z->s);
1677 z->succ_high = (aa >> 4);
1678 z->succ_low = (aa & 15);
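/* For a baseline (sequential) scan the JPEG spec fixes these fields:
 * spec_start (Ss) = 0, spec_end (Se) = 63 and succ_high/succ_low (Ah/Al) = 0;
 * only progressive scans use other values, which is exactly what the two
 * validation branches below enforce. */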
1682 /* Bad SOS. Corrupt JPEG? */
1683 if ( z->spec_start > 63 ||
1685 z->spec_start > z->spec_end ||
1686 z->succ_high > 13 ||
1692 /* Bad SOS. Corrupt JPEG? */
1693 if (z->spec_start != 0)
1695 if (z->succ_high != 0 || z->succ_low != 0)
1704 static int rjpeg_process_frame_header(rjpeg_jpeg *z, int scan)
1706 rjpeg_context *s = z->s;
1707 int Lf,p,i,q, h_max=1,v_max=1,c;
1708 Lf = RJPEG_GET16BE(s);
1712 /* Bad SOF len. Corrupt JPEG? */
1720 /* Only 8-bit. JPEG format not supported? */
1724 s->img_y = RJPEG_GET16BE(s);
1726 /* Legal, but we don't handle it--but neither does IJG */
1728 /* No header height, JPEG format not supported? */
1732 s->img_x = RJPEG_GET16BE(s);
1734 /* No header width. Corrupt JPEG? */
1742 /* Bad component count. Corrupt JPEG? */
1743 if (c != 3 && c != 1)
1748 for (i = 0; i < c; ++i)
1750 z->img_comp[i].data = NULL;
1751 z->img_comp[i].linebuf = NULL;
1754 /* Bad SOF length. Corrupt JPEG? */
1755 if (Lf != 8+3*s->img_n)
1758 for (i = 0; i < s->img_n; ++i)
1760 z->img_comp[i].id = rjpeg_get8(s);
1761 if (z->img_comp[i].id != i+1) /* JFIF requires */
1762 if (z->img_comp[i].id != i) /* some version of jpegtran outputs non-JFIF-compliant files! */
1766 z->img_comp[i].h = (q >> 4);
1768 /* Bad H. Corrupt JPEG? */
1769 if (!z->img_comp[i].h || z->img_comp[i].h > 4)
1772 z->img_comp[i].v = q & 15;
1774 /* Bad V. Corrupt JPEG? */
1775 if (!z->img_comp[i].v || z->img_comp[i].v > 4)
1778 z->img_comp[i].tq = rjpeg_get8(s);
1780 /* Bad TQ. Corrupt JPEG? */
1781 if (z->img_comp[i].tq > 3)
1785 if (scan != RJPEG_SCAN_LOAD)
1788 /* Image too large to decode? */
1789 if ((1 << 30) / s->img_x / s->img_n < s->img_y)
1792 for (i = 0; i < s->img_n; ++i)
1794 if (z->img_comp[i].h > h_max)
1795 h_max = z->img_comp[i].h;
1796 if (z->img_comp[i].v > v_max)
1797 v_max = z->img_comp[i].v;
1800 /* compute interleaved MCU info */
1801 z->img_h_max = h_max;
1802 z->img_v_max = v_max;
1803 z->img_mcu_w = h_max * 8;
1804 z->img_mcu_h = v_max * 8;
1805 z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
1806 z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
1810 for (i = 0; i < s->img_n; ++i)
1812 /* number of effective pixels (e.g. for non-interleaved MCU) */
1813 z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
1814 z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
1816 /* to simplify generation, we'll allocate enough memory to decode
1817 * the bogus oversized data from using interleaved MCUs and their
1818 * big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
1819 * discard the extra data until colorspace conversion */
1820 z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
1821 z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
1822 z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
1824 /* Out of memory? */
1825 if (!z->img_comp[i].raw_data)
1827 for (--i; i >= 0; --i)
1829 free(z->img_comp[i].raw_data);
1830 z->img_comp[i].data = NULL;
1836 /* align blocks for IDCT using MMX/SSE */
1837 z->img_comp[i].data = (uint8_t*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
1838 z->img_comp[i].linebuf = NULL;
1839 z->img_comp[i].coeff_w = (z->img_comp[i].w2 + 7) >> 3;
1840 z->img_comp[i].coeff_h = (z->img_comp[i].h2 + 7) >> 3;
1841 z->img_comp[i].raw_coeff = malloc(z->img_comp[i].coeff_w *
1842 z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);
1843 z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
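/* Alignment sketch for the "+ 15 then & ~15" adjustments above: a raw pointer
 * value of, say, 0x1003 becomes 0x1012 after adding 15 and 0x1010 after
 * masking, i.e. the first 16-byte boundary at or past the allocation, which is
 * why 15 spare bytes are requested from malloc. */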
1848 for (i = 0; i < s->img_n; ++i)
1850 /* number of effective pixels (e.g. for non-interleaved MCU) */
1851 z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
1852 z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
1854 /* to simplify generation, we'll allocate enough memory to decode
1855 * the bogus oversized data from using interleaved MCUs and their
1856 * big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
1857 * discard the extra data until colorspace conversion */
1858 z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
1859 z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
1860 z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
1862 /* Out of memory? */
1863 if (!z->img_comp[i].raw_data)
1865 for (--i; i >= 0; --i)
1867 free(z->img_comp[i].raw_data);
1868 z->img_comp[i].data = NULL;
1872 /* align blocks for IDCT using MMX/SSE */
1873 z->img_comp[i].data = (uint8_t*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
1874 z->img_comp[i].linebuf = NULL;
1875 z->img_comp[i].coeff = 0;
1876 z->img_comp[i].raw_coeff = 0;
1883 static int rjpeg_decode_jpeg_header(rjpeg_jpeg *z, int scan)
1886 z->marker = RJPEG_MARKER_NONE; /* initialize cached marker to empty */
1887 m = rjpeg_get_marker(z);
1889 /* No SOI. Corrupt JPEG? */
1890 if (m != JPEG_MARKER_SOI)
1893 if (scan == RJPEG_SCAN_TYPE)
1896 m = rjpeg_get_marker(z);
1897 while (!RJPEG_SOF(m))
1899 if (!rjpeg_process_marker(z,m))
1901 m = rjpeg_get_marker(z);
1902 while (m == RJPEG_MARKER_NONE)
1904 /* some files have extra padding after their blocks, so ok, we'll scan */
1906 /* No SOF. Corrupt JPEG? */
1907 if (RJPEG_AT_EOF(z->s))
1910 m = rjpeg_get_marker(z);
1913 z->progressive = RJPEG_SOF_PROGRESSIVE(m);
1914 if (!rjpeg_process_frame_header(z, scan))
1919 /* decode image to YCbCr format */
1920 static int rjpeg_decode_jpeg_image(rjpeg_jpeg *j)
1923 for (m = 0; m < 4; m++)
1925 j->img_comp[m].raw_data = NULL;
1926 j->img_comp[m].raw_coeff = NULL;
1928 j->restart_interval = 0;
1929 if (!rjpeg_decode_jpeg_header(j, RJPEG_SCAN_LOAD))
1931 m = rjpeg_get_marker(j);
1933 while (m != JPEG_MARKER_EOI)
1935 if (m == JPEG_MARKER_SOS)
1937 if (!rjpeg_process_scan_header(j))
1939 if (!rjpeg_parse_entropy_coded_data(j))
1942 if (j->marker == RJPEG_MARKER_NONE )
1944 /* handle 0s at the end of image data from IP Kamera 9060 */
1946 while (!RJPEG_AT_EOF(j->s))
1948 int x = rjpeg_get8(j->s);
1951 j->marker = rjpeg_get8(j->s);
1954 else if (x != 0) /* Junk before marker. Corrupt JPEG? */
1958 /* if we reach eof without hitting a marker,
1959 * rjpeg_get_marker() below will fail and we'll eventually return 0 */
1964 if (!rjpeg_process_marker(j, m))
1967 m = rjpeg_get_marker(j);
1971 rjpeg_jpeg_finish(j);
1975 /* static jfif-centered resampling (across block boundaries) */
1977 static uint8_t *rjpeg_resample_row_1(uint8_t *out, uint8_t *in_near,
1978 uint8_t *in_far, int w, int hs)
1987 static uint8_t* rjpeg_resample_row_v_2(uint8_t *out, uint8_t *in_near,
1988 uint8_t *in_far, int w, int hs)
1990 /* need to generate two samples vertically for every one in input */
1993 for (i = 0; i < w; ++i)
1994 out[i] = RJPEG_DIV4(3*in_near[i] + in_far[i] + 2);
1998 static uint8_t* rjpeg_resample_row_h_2(uint8_t *out, uint8_t *in_near,
1999 uint8_t *in_far, int w, int hs)
2001 /* need to generate two samples horizontally for every one in input */
2003 uint8_t *input = in_near;
2007 /* if only one sample, can't do any interpolation */
2008 out[0] = out[1] = input[0];
2013 out[1] = RJPEG_DIV4(input[0]*3 + input[1] + 2);
2015 for (i=1; i < w-1; ++i)
2017 int n = 3 * input[i] + 2;
2018 out[i*2+0] = RJPEG_DIV4(n+input[i-1]);
2019 out[i*2+1] = RJPEG_DIV4(n+input[i+1]);
2021 out[i*2+0] = RJPEG_DIV4(input[w-2]*3 + input[w-1] + 2);
2022 out[i*2+1] = input[w-1];
2030 static uint8_t *rjpeg_resample_row_hv_2(uint8_t *out, uint8_t *in_near,
2031 uint8_t *in_far, int w, int hs)
2033 /* need to generate 2x2 samples for every one in input */
2037 out[0] = out[1] = RJPEG_DIV4(3*in_near[0] + in_far[0] + 2);
2041 t1 = 3*in_near[0] + in_far[0];
2042 out[0] = RJPEG_DIV4(t1+2);
2044 for (i = 1; i < w; ++i)
2047 t1 = 3*in_near[i]+in_far[i];
2048 out[i*2-1] = RJPEG_DIV16(3*t0 + t1 + 8);
2049 out[i*2 ] = RJPEG_DIV16(3*t1 + t0 + 8);
2051 out[w*2-1] = RJPEG_DIV4(t1+2);
2058 #if defined(__SSE2__) || defined(RJPEG_NEON)
2059 static uint8_t *rjpeg_resample_row_hv_2_simd(uint8_t *out, uint8_t *in_near,
2060 uint8_t *in_far, int w, int hs)
2062 /* need to generate 2x2 samples for every one in input */
2067 out[0] = out[1] = RJPEG_DIV4(3*in_near[0] + in_far[0] + 2);
2071 t1 = 3*in_near[0] + in_far[0];
2072 /* process groups of 8 pixels for as long as we can.
2073 * note we can't handle the last pixel in a row in this loop
2074 * because we need to handle the filter boundary conditions.
2076 for (; i < ((w-1) & ~7); i += 8)
2078 #if defined(__SSE2__)
2079 /* load and perform the vertical filtering pass
2080 * this uses 3*x + y = 4*x + (y - x) */
2081 __m128i zero = _mm_setzero_si128();
2082 __m128i farb = _mm_loadl_epi64((__m128i *) (in_far + i));
2083 __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
2084 __m128i farw = _mm_unpacklo_epi8(farb, zero);
2085 __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
2086 __m128i diff = _mm_sub_epi16(farw, nearw);
2087 __m128i nears = _mm_slli_epi16(nearw, 2);
2088 __m128i curr = _mm_add_epi16(nears, diff); /* current row */
2090 /* horizontal filter works the same based on shifted versions of the current
2091 * row. "prev" is current row shifted right by 1 pixel; we need to
2092 * insert the previous pixel value (from t1).
2093 * "next" is current row shifted left by 1 pixel, with first pixel
2094 * of next block of 8 pixels added in.
2096 __m128i prv0 = _mm_slli_si128(curr, 2);
2097 __m128i nxt0 = _mm_srli_si128(curr, 2);
2098 __m128i prev = _mm_insert_epi16(prv0, t1, 0);
2099 __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);
2101 /* horizontal filter, polyphase implementation since it's convenient:
2102 * even pixels = 3*cur + prev = cur*4 + (prev - cur)
2103 * odd pixels = 3*cur + next = cur*4 + (next - cur)
2104 * note the shared term. */
2105 __m128i bias = _mm_set1_epi16(8);
2106 __m128i curs = _mm_slli_epi16(curr, 2);
2107 __m128i prvd = _mm_sub_epi16(prev, curr);
2108 __m128i nxtd = _mm_sub_epi16(next, curr);
2109 __m128i curb = _mm_add_epi16(curs, bias);
2110 __m128i even = _mm_add_epi16(prvd, curb);
2111 __m128i odd = _mm_add_epi16(nxtd, curb);
2113 /* interleave even and odd pixels, then undo scaling. */
2114 __m128i int0 = _mm_unpacklo_epi16(even, odd);
2115 __m128i int1 = _mm_unpackhi_epi16(even, odd);
2116 __m128i de0 = _mm_srli_epi16(int0, 4);
2117 __m128i de1 = _mm_srli_epi16(int1, 4);
2119 /* pack and write output */
2120 __m128i outv = _mm_packus_epi16(de0, de1);
2121 _mm_storeu_si128((__m128i *) (out + i*2), outv);
2122 #elif defined(RJPEG_NEON)
2123 /* load and perform the vertical filtering pass
2124 * this uses 3*x + y = 4*x + (y - x) */
2125 uint8x8_t farb = vld1_u8(in_far + i);
2126 uint8x8_t nearb = vld1_u8(in_near + i);
2127 int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
2128 int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
2129 int16x8_t curr = vaddq_s16(nears, diff); /* current row */
2131 /* horizontal filter works the same based on shifted versions of the current
2132 * row. "prev" is current row shifted right by 1 pixel; we need to
2133 * insert the previous pixel value (from t1).
2134 * "next" is current row shifted left by 1 pixel, with first pixel
2135 * of next block of 8 pixels added in. */
2136 int16x8_t prv0 = vextq_s16(curr, curr, 7);
2137 int16x8_t nxt0 = vextq_s16(curr, curr, 1);
2138 int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
2139 int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);
2141 /* horizontal filter, polyphase implementation since it's convenient:
2142 * even pixels = 3*cur + prev = cur*4 + (prev - cur)
2143 * odd pixels = 3*cur + next = cur*4 + (next - cur)
2144 * note the shared term.
2146 int16x8_t curs = vshlq_n_s16(curr, 2);
2147 int16x8_t prvd = vsubq_s16(prev, curr);
2148 int16x8_t nxtd = vsubq_s16(next, curr);
2149 int16x8_t even = vaddq_s16(curs, prvd);
2150 int16x8_t odd = vaddq_s16(curs, nxtd);
2152 /* undo scaling and round, then store with even/odd phases interleaved */
2154 o.val[0] = vqrshrun_n_s16(even, 4);
2155 o.val[1] = vqrshrun_n_s16(odd, 4);
2156 vst2_u8(out + i*2, o);
2159 /* "previous" value for next iteration */
2160 t1 = 3*in_near[i+7] + in_far[i+7];
2164 t1 = 3*in_near[i] + in_far[i];
2165 out[i*2] = RJPEG_DIV16(3*t1 + t0 + 8);
2167 for (++i; i < w; ++i)
2170 t1 = 3*in_near[i]+in_far[i];
2171 out[i*2-1] = RJPEG_DIV16(3*t0 + t1 + 8);
2172 out[i*2 ] = RJPEG_DIV16(3*t1 + t0 + 8);
2174 out[w*2-1] = RJPEG_DIV4(t1+2);
2182 static uint8_t *rjpeg_resample_row_generic(uint8_t *out,
2183 uint8_t *in_near, uint8_t *in_far, int w, int hs)
2185 /* resample with nearest-neighbor */
2189 for (i = 0; i < w; ++i)
2190 for (j = 0; j < hs; ++j)
2191 out[i*hs+j] = in_near[i];
2195 /* this is a reduced-precision calculation of YCbCr-to-RGB introduced
2196 * to make sure the code produces the same results in both SIMD and scalar */
2198 #define FLOAT2FIXED(x) (((int) ((x) * 4096.0f + 0.5f)) << 8)
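/* Worked example of the fixed-point scale: FLOAT2FIXED(1.40200f) is
 * ((int)(1.402f * 4096.0f + 0.5f)) << 8 = 5743 << 8, i.e. a value with a
 * 20-bit fraction, matching the (y[i] << 20) + (1 << 19) term below so that
 * the chroma products and the shifted luma line up before the final shift
 * back down. */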
2201 static void rjpeg_YCbCr_to_RGB_row(uint8_t *out, const uint8_t *y,
2202 const uint8_t *pcb, const uint8_t *pcr, int count, int step)
2205 for (i = 0; i < count; ++i)
2207 int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
2208 int cr = pcr[i] - 128;
2209 int cb = pcb[i] - 128;
2210 int r = y_fixed + cr* FLOAT2FIXED(1.40200f);
2211 int g = y_fixed + (cr*-FLOAT2FIXED(0.71414f)) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
2212 int b = y_fixed + cb* FLOAT2FIXED(1.77200f);
2216 if ((unsigned) r > 255)
2218 if ((unsigned) g > 255)
2220 if ((unsigned) b > 255)
2222 out[0] = (uint8_t)r;
2223 out[1] = (uint8_t)g;
2224 out[2] = (uint8_t)b;
2230 #if defined(__SSE2__) || defined(RJPEG_NEON)
2231 static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
2232 const uint8_t *pcb, const uint8_t *pcr, int count, int step)
2236 #if defined(__SSE2__)
2237 /* step == 3 is pretty ugly on the final interleave, and i'm not convinced
2238 * it's useful in practice (you wouldn't use it for textures, for example).
2239 * so just accelerate step == 4 case.
2243 /* this is a fairly straightforward implementation and not super-optimized. */
2244 __m128i signflip = _mm_set1_epi8(-0x80);
2245 __m128i cr_const0 = _mm_set1_epi16( (short) ( 1.40200f*4096.0f+0.5f));
2246 __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
2247 __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
2248 __m128i cb_const1 = _mm_set1_epi16( (short) ( 1.77200f*4096.0f+0.5f));
2249 __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
2250 __m128i xw = _mm_set1_epi16(255); /* alpha channel */
2252 for (; i+7 < count; i += 8)
2255 __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
2256 __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
2257 __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
2258 __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); /* -128 */
2259 __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); /* -128 */
2261 /* unpack to short (and left-shift cr, cb by 8) */
2262 __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes);
2263 __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
2264 __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
2266 /* color transform */
2267 __m128i yws = _mm_srli_epi16(yw, 4);
2268 __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
2269 __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
2270 __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
2271 __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
2272 __m128i rws = _mm_add_epi16(cr0, yws);
2273 __m128i gwt = _mm_add_epi16(cb0, yws);
2274 __m128i bws = _mm_add_epi16(yws, cb1);
2275 __m128i gws = _mm_add_epi16(gwt, cr1);
2278 __m128i rw = _mm_srai_epi16(rws, 4);
2279 __m128i bw = _mm_srai_epi16(bws, 4);
2280 __m128i gw = _mm_srai_epi16(gws, 4);
2282 /* back to byte, set up for transpose */
2283 __m128i brb = _mm_packus_epi16(rw, bw);
2284 __m128i gxb = _mm_packus_epi16(gw, xw);
2286 /* transpose to interleave channels */
2287 __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
2288 __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
2289 __m128i o0 = _mm_unpacklo_epi16(t0, t1);
2290 __m128i o1 = _mm_unpackhi_epi16(t0, t1);
2293 _mm_storeu_si128((__m128i *) (out + 0), o0);
2294 _mm_storeu_si128((__m128i *) (out + 16), o1);
2301 /* in this version, step=3 support would be easy to add. but is there demand? */
2304 /* this is a fairly straightforward implementation and not super-optimized. */
2305 uint8x8_t signflip = vdup_n_u8(0x80);
2306 int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f));
2307 int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
2308 int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
2309 int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f));
2311 for (; i+7 < count; i += 8)
2316 uint8x8_t y_bytes = vld1_u8(y + i);
2317 uint8x8_t cr_bytes = vld1_u8(pcr + i);
2318 uint8x8_t cb_bytes = vld1_u8(pcb + i);
2319 int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
2320 int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
2323 int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
2324 int16x8_t crw = vshll_n_s8(cr_biased, 7);
2325 int16x8_t cbw = vshll_n_s8(cb_biased, 7);
2327 /* color transform */
2328 int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
2329 int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
2330 int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
2331 int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
2332 int16x8_t rws = vaddq_s16(yws, cr0);
2333 int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
2334 int16x8_t bws = vaddq_s16(yws, cb1);
2336 /* undo scaling, round, convert to byte */
2337 o.val[0] = vqrshrun_n_s16(rws, 4);
2338 o.val[1] = vqrshrun_n_s16(gws, 4);
2339 o.val[2] = vqrshrun_n_s16(bws, 4);
2340 o.val[3] = vdup_n_u8(255);
2342 /* store, interleaving r/g/b/a */
2349 for (; i < count; ++i)
2351 int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
2352 int cr = pcr[i] - 128;
2353 int cb = pcb[i] - 128;
2354 int r = y_fixed + cr* FLOAT2FIXED(1.40200f);
2355 int g = y_fixed + cr*-FLOAT2FIXED(0.71414f) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
2356 int b = y_fixed + cb* FLOAT2FIXED(1.77200f);
2360 if ((unsigned) r > 255)
2362 if ((unsigned) g > 255)
2364 if ((unsigned) b > 255)
2366 out[0] = (uint8_t)r;
2367 out[1] = (uint8_t)g;
2368 out[2] = (uint8_t)b;
2375 /* set up the kernels */
2376 static void rjpeg_setup_jpeg(rjpeg_jpeg *j)
2378 uint64_t mask = cpu_features_get();
2382 j->idct_block_kernel = rjpeg_idct_block;
2383 j->YCbCr_to_RGB_kernel = rjpeg_YCbCr_to_RGB_row;
2384 j->resample_row_hv_2_kernel = rjpeg_resample_row_hv_2;
2386 #if defined(__SSE2__)
2387 if (mask & RETRO_SIMD_SSE2)
2389 j->idct_block_kernel = rjpeg_idct_simd;
2390 j->YCbCr_to_RGB_kernel = rjpeg_YCbCr_to_RGB_simd;
2391 j->resample_row_hv_2_kernel = rjpeg_resample_row_hv_2_simd;
2396 j->idct_block_kernel = rjpeg_idct_simd;
2397 j->YCbCr_to_RGB_kernel = rjpeg_YCbCr_to_RGB_simd;
2398 j->resample_row_hv_2_kernel = rjpeg_resample_row_hv_2_simd;
2402 /* clean up the temporary component buffers */
2403 static void rjpeg_cleanup_jpeg(rjpeg_jpeg *j)
2406 for (i = 0; i < j->s->img_n; ++i)
2408 if (j->img_comp[i].raw_data)
2410 free(j->img_comp[i].raw_data);
2411 j->img_comp[i].raw_data = NULL;
2412 j->img_comp[i].data = NULL;
2415 if (j->img_comp[i].raw_coeff)
2417 free(j->img_comp[i].raw_coeff);
2418 j->img_comp[i].raw_coeff = 0;
2419 j->img_comp[i].coeff = 0;
2422 if (j->img_comp[i].linebuf)
2424 free(j->img_comp[i].linebuf);
2425 j->img_comp[i].linebuf = NULL;
2430 static uint8_t *rjpeg_load_jpeg_image(rjpeg_jpeg *z,
2431 unsigned *out_x, unsigned *out_y, int *comp, int req_comp)
2436 rjpeg_resample res_comp[4];
2437 uint8_t *coutput[4] = {0};
2438 uint8_t *output = NULL;
2441 /* load a jpeg image from whichever source, but leave in YCbCr format */
2442 if (!rjpeg_decode_jpeg_image(z))
2445 /* determine actual number of components to generate */
2446 n = req_comp ? req_comp : z->s->img_n;
2448 if (z->s->img_n == 3 && n < 3)
2451 decode_n = z->s->img_n;
2453 /* resample and color-convert */
2454 for (k = 0; k < decode_n; ++k)
2456 rjpeg_resample *r = &res_comp[k];
2458 /* allocate line buffer big enough for upsampling off the edges
2459 * with upsample factor of 4 */
2460 z->img_comp[k].linebuf = (uint8_t *) malloc(z->s->img_x + 3);
2461 if (!z->img_comp[k].linebuf)
2464 r->hs = z->img_h_max / z->img_comp[k].h;
2465 r->vs = z->img_v_max / z->img_comp[k].v;
2466 r->ystep = r->vs >> 1;
2467 r->w_lores = (z->s->img_x + r->hs-1) / r->hs;
2469 r->line0 = r->line1 = z->img_comp[k].data;
2470 r->resample = rjpeg_resample_row_generic;
2472 if (r->hs == 1 && r->vs == 1)
2473 r->resample = rjpeg_resample_row_1;
2474 else if (r->hs == 1 && r->vs == 2)
2475 r->resample = rjpeg_resample_row_v_2;
2476 else if (r->hs == 2 && r->vs == 1)
2477 r->resample = rjpeg_resample_row_h_2;
2478 else if (r->hs == 2 && r->vs == 2)
2479 r->resample = z->resample_row_hv_2_kernel;
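/* The hs/vs pairs follow directly from the sampling factors: in a typical
 * 4:2:0 file luma has h = v = 2 and chroma h = v = 1, so the chroma planes get
 * hs = vs = 2 and use the hv_2 kernel; 4:2:2 chroma gets hs = 2, vs = 1 (h_2);
 * 4:4:4 components all get hs = vs = 1 (row_1); anything else stays on the
 * generic nearest-neighbor resampler set above. */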
2482 /* can't error after this, so this is safe */
2483 output = (uint8_t *) malloc(n * z->s->img_x * z->s->img_y + 1);
2488 /* now go ahead and resample */
2489 for (j = 0; j < z->s->img_y; ++j)
2491 uint8_t *out = output + n * z->s->img_x * j;
2492 for (k = 0; k < decode_n; ++k)
2494 rjpeg_resample *r = &res_comp[k];
2495 int y_bot = r->ystep >= (r->vs >> 1);
2497 coutput[k] = r->resample(z->img_comp[k].linebuf,
2498 y_bot ? r->line1 : r->line0,
2499 y_bot ? r->line0 : r->line1,
2502 if (++r->ystep >= r->vs)
2505 r->line0 = r->line1;
2506 if (++r->ypos < z->img_comp[k].y)
2507 r->line1 += z->img_comp[k].w2;
2513 uint8_t *y = coutput[0];
2516 if (z->s->img_n == 3)
2517 z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
2519 for (i = 0; i < z->s->img_x; ++i)
2521 out[0] = out[1] = out[2] = y[i];
2522 out[3] = 255; /* not used if n==3 */
2529 uint8_t *y = coutput[0];
2531 for (i = 0; i < z->s->img_x; ++i)
2534 for (i = 0; i < z->s->img_x; ++i)
2542 rjpeg_cleanup_jpeg(z);
2543 *out_x = z->s->img_x;
2544 *out_y = z->s->img_y;
2547 *comp = z->s->img_n; /* report original components, not output */
2551 rjpeg_cleanup_jpeg(z);
2555 int rjpeg_process_image(rjpeg_t *rjpeg, void **buf_data,
2556 size_t size, unsigned *width, unsigned *height)
2561 uint32_t *img = NULL;
2562 uint32_t *pixels = NULL;
2563 unsigned size_tex = 0;
2566 return IMAGE_PROCESS_ERROR;
2568 s.img_buffer = (uint8_t*)rjpeg->buff_data;
2569 s.img_buffer_original = (uint8_t*)rjpeg->buff_data;
2570 s.img_buffer_end = (uint8_t*)rjpeg->buff_data + (int)size;
2574 rjpeg_setup_jpeg(&j);
2576 img = (uint32_t*)rjpeg_load_jpeg_image(&j, width, height, &comp, 4);
2579 return IMAGE_PROCESS_ERROR;
2581 size_tex = (*width) * (*height);
2582 pixels = (uint32_t*)malloc(size_tex * sizeof(uint32_t));
2587 return IMAGE_PROCESS_ERROR;
2592 /* Convert RGBA to ARGB */
2595 unsigned int texel = img[size_tex];
2596 unsigned int A = texel & 0xFF000000;
2597 unsigned int B = texel & 0x00FF0000;
2598 unsigned int G = texel & 0x0000FF00;
2599 unsigned int R = texel & 0x000000FF;
2600 ((unsigned int*)pixels)[size_tex] = A | (R << 16) | G | (B >> 16);
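/* Example of the repack above: a texel of 0xFF80C010 (A = FF, B = 80, G = C0,
 * R = 10 under the masks used here) becomes
 * 0xFF000000 | (0x10 << 16) | 0x0000C000 | (0x00800000 >> 16) = 0xFF10C080,
 * i.e. the R and B bytes swap places while A and G stay where they are. */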
2605 return IMAGE_PROCESS_END;
2608 bool rjpeg_set_buf_ptr(rjpeg_t *rjpeg, void *data)
2613 rjpeg->buff_data = (uint8_t*)data;
2618 void rjpeg_free(rjpeg_t *rjpeg)
2626 rjpeg_t *rjpeg_alloc(void)
2628 rjpeg_t *rjpeg = (rjpeg_t*)calloc(1, sizeof(*rjpeg));
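/* Minimal usage sketch of the public entry points, for illustration only.
 * decode_jpeg_to_argb() is a hypothetical caller; it assumes the JPEG file is
 * already in memory and that rjpeg_process_image() hands back the converted
 * ARGB32 pixels through its buf_data argument, as the conversion loop above
 * suggests. */
#if 0
static uint32_t *decode_jpeg_to_argb(const void *jpeg_bytes, size_t jpeg_len,
      unsigned *width, unsigned *height)
{
   void    *pixels = NULL;
   rjpeg_t *rjpeg  = rjpeg_alloc();

   if (!rjpeg)
      return NULL;

   /* Point the decoder at the in-memory file, then decode and convert. */
   rjpeg_set_buf_ptr(rjpeg, (void*)jpeg_bytes);

   if (rjpeg_process_image(rjpeg, &pixels, jpeg_len,
            width, height) != IMAGE_PROCESS_END)
   {
      rjpeg_free(rjpeg);
      return NULL;
   }

   rjpeg_free(rjpeg);
   return (uint32_t*)pixels; /* caller frees */
}
#endif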