1 /* Copyright (C) 2010-2020 The RetroArch team
3 * ---------------------------------------------------------------------------------------
4 * The following license statement only applies to this file (rjpeg.c).
5 * ---------------------------------------------------------------------------------------
7 * Permission is hereby granted, free of charge,
8 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation the rights to
10 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 /* Modified version of stb_image's JPEG sources. */
27 #include <stddef.h> /* ptrdiff_t on osx */
31 #include <retro_inline.h>
33 #include <formats/image.h>
34 #include <formats/rjpeg.h>
35 #include <features/features_cpu.h>
39 RJPEG_DEFAULT = 0, /* only used for req_comp */
53 typedef uint8_t *(*rjpeg_resample_row_func)(uint8_t *out, uint8_t *in0, uint8_t *in1,
58 rjpeg_resample_row_func resample;
61 int hs,vs; /* expansion factor in each axis */
62 int w_lores; /* horizontal pixels pre-expansion */
63 int ystep; /* how far through vertical expansion we are */
64 int ypos; /* which pre-expansion row we're on */
73 #define RJPEG_HAS_LROTL
76 #ifdef RJPEG_HAS_LROTL
77 #define RJPEG_LROT(x,y) _lrotl(x,y)
79 #define RJPEG_LROT(x,y) (((x) << (y)) | ((x) >> (32 - (y))))
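/* Worked example of the fallback rotate: RJPEG_LROT(0x80000001, 4) evaluates
 * to (0x80000001 << 4) | (0x80000001 >> 28) = 0x00000010 | 0x00000008 = 0x18,
 * i.e. the top four bits wrap around to the bottom of the 32-bit word. */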
82 /* x86/x64 detection */
83 #if defined(__x86_64__) || defined(_M_X64)
84 #define RJPEG_X64_TARGET
85 #elif defined(__i386) || defined(_M_IX86)
86 #define RJPEG_X86_TARGET
89 #if defined(__GNUC__) && (defined(RJPEG_X86_TARGET) || defined(RJPEG_X64_TARGET)) && !defined(__SSE2__) && !defined(RJPEG_NO_SIMD)
90 /* NOTE: it's not clear whether we actually need this for the 64-bit path.
91  * gcc doesn't support SSE2 intrinsics unless you compile with -msse2,
92  * (but compiling with -msse2 allows the compiler to use SSE2 everywhere;
93  * this is just broken and gcc hasn't fixed it properly:
94  * http://www.virtualdub.org/blog/pivot/entry.php?id=363 )
99 #if defined(__MINGW32__) && defined(RJPEG_X86_TARGET) && !defined(RJPEG_MINGW_ENABLE_SSE2) && !defined(RJPEG_NO_SIMD)
100 /* Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid RJPEG_X64_TARGET
102 * 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
103 * Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
104 * As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
105 * simultaneously enabling "-mstackrealign".
107 * See https://github.com/nothings/stb/issues/81 for more information.
109 * So default to no SSE2 on 32-bit MinGW. If you've read this far and added
110 * -mstackrealign to your build settings, feel free to #define RJPEG_MINGW_ENABLE_SSE2.
112 #define RJPEG_NO_SIMD
115 #if defined(__SSE2__)
116 #include <emmintrin.h>
119 #define RJPEG_SIMD_ALIGN(type, name) __declspec(align(16)) type name
121 #define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
127 #if defined(RJPEG_NO_SIMD) && defined(RJPEG_NEON)
132 #include <arm_neon.h>
133 /* assume GCC or Clang on ARM targets */
134 #define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
137 #ifndef RJPEG_SIMD_ALIGN
138 #define RJPEG_SIMD_ALIGN(type, name) type name
144 uint8_t *img_buffer_end;
145 uint8_t *img_buffer_original;
151 uint8_t buffer_start[128];
154 static INLINE uint8_t rjpeg_get8(rjpeg_context *s)
156 if (s->img_buffer < s->img_buffer_end)
157 return *s->img_buffer++;
162 #define RJPEG_AT_EOF(s) ((s)->img_buffer >= (s)->img_buffer_end)
164 #define RJPEG_GET16BE(s) ((rjpeg_get8((s)) << 8) + rjpeg_get8((s)))
166 /* huffman decoding acceleration */
167 #define FAST_BITS 9 /* larger handles more cases; smaller stomps less cache */
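/* With FAST_BITS == 9 the acceleration table below has 1 << 9 = 512 entries:
 * any Huffman code of 9 bits or fewer decodes with a single lookup on the top
 * 9 bits of the bit buffer, and only longer codes fall back to the slower
 * per-length maxcode[] search. */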
171 unsigned int maxcode[18];
172 int delta[17]; /* old 'firstsymbol' - old 'firstcode' */
173 /* weirdly, repacking this into AoS is a 10% speed loss, instead of a win */
175 uint8_t fast[1 << FAST_BITS];
184 void (*idct_block_kernel)(uint8_t *out, int out_stride, short data[64]);
185 void (*YCbCr_to_RGB_kernel)(uint8_t *out, const uint8_t *y, const uint8_t *pcb,
186 const uint8_t *pcr, int count, int step);
187 uint8_t *(*resample_row_hv_2_kernel)(uint8_t *out, uint8_t *in_near,
188 uint8_t *in_far, int w, int hs);
190 /* definition of jpeg image component */
194 void *raw_data, *raw_coeff;
196 short *coeff; /* progressive only */
204 int coeff_w; /* number of 8x8 coefficient blocks */
205 int coeff_h; /* number of 8x8 coefficient blocks */
208 /* sizes for components, interleaved MCUs */
209 int img_h_max, img_v_max;
210 int img_mcu_x, img_mcu_y;
211 int img_mcu_w, img_mcu_h;
213 int code_bits; /* number of valid bits */
214 int nomore; /* flag if we saw a marker so must stop */
221 int scan_n, order[4];
222 int restart_interval, todo;
223 uint32_t code_buffer; /* jpeg entropy-coded buffer */
224 rjpeg_huffman huff_dc[4]; /* unsigned int alignment */
225 rjpeg_huffman huff_ac[4]; /* unsigned int alignment */
226 int16_t fast_ac[4][1 << FAST_BITS];
227 unsigned char marker; /* marker seen while filling entropy buffer */
228 uint8_t dequant[4][64];
231 #define RJPEG_F2F(x) ((int) (((x) * 4096 + 0.5)))
232 #define RJPEG_FSH(x) ((x) << 12)
234 #define RJPEG_MARKER_NONE 0xff
235 /* if there's a pending marker from the entropy stream, return that;
236  * otherwise, fetch from the stream and get a marker. If there's no
237  * marker, return 0xff, which is never a valid marker value
240 /* in each scan, we'll have scan_n components, and the order
241 * of the components is specified by order[]
243 #define RJPEG_RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7)
245 #define JPEG_MARKER 0xFF
246 #define JPEG_MARKER_SOI 0xD8
247 #define JPEG_MARKER_SOS 0xDA
248 #define JPEG_MARKER_EOI 0xD9
249 #define JPEG_MARKER_APP1 0xE1
250 #define JPEG_MARKER_APP2 0xE2
252 /* use comparisons since in some cases we handle more than one case (e.g. SOF) */
253 #define RJPEG_SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
255 #define RJPEG_SOF_PROGRESSIVE(x) ((x) == 0xc2)
256 #define RJPEG_DIV4(x) ((uint8_t) ((x) >> 2))
257 #define RJPEG_DIV16(x) ((uint8_t) ((x) >> 4))
259 static int rjpeg_build_huffman(rjpeg_huffman *h, int *count)
263 /* build size list for each symbol (from JPEG spec) */
264 for (i = 0; i < 16; ++i)
265 for (j = 0; j < count[i]; ++j)
266 h->size[k++] = (uint8_t) (i+1);
269 /* compute actual symbols (from jpeg spec) */
273 for (j = 1; j <= 16; ++j)
275 /* compute delta to add to code to compute symbol id */
276 h->delta[j] = k - code;
279 while (h->size[k] == j)
280 h->code[k++] = (uint16_t) (code++);
282 /* Bad code lengths, corrupt JPEG? */
283 if (code-1 >= (1 << j))
286 /* compute largest code + 1 for this size, preshifted as needed later */
287 h->maxcode[j] = code << (16-j);
290 h->maxcode[j] = 0xffffffff;
292 /* build non-spec acceleration table; 255 is flag for not-accelerated */
293 memset(h->fast, 255, 1 << FAST_BITS);
294 for (i = 0; i < k; ++i)
299 int c = h->code[i] << (FAST_BITS-s);
300 int m = 1 << (FAST_BITS-s);
301 for (j = 0; j < m; ++j)
302 h->fast[c+j] = (uint8_t) i;
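/* Worked example of the fill above: a 3-bit code 0b101 is written into every
 * fast[] slot whose top 3 bits are 101, i.e. indices 0b101000000..0b101111111
 * (1 << (FAST_BITS - 3) = 64 consecutive entries), so any 9-bit window that
 * starts with that code maps straight to its symbol index. */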
308 /* build a table that decodes both magnitude and value of small ACs in one go */
310 static void rjpeg_build_fast_ac(int16_t *fast_ac, rjpeg_huffman *h)
314 for (i = 0; i < (1 << FAST_BITS); ++i)
316 uint8_t fast = h->fast[i];
322 int rs = h->values[fast];
323 int run = (rs >> 4) & 15;
324 int magbits = rs & 15;
325 int len = h->size[fast];
327 if (magbits && len + magbits <= FAST_BITS)
329 /* magnitude code followed by receive_extend code */
330 int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
331 int m = 1 << (magbits - 1);
333 k += (-1 << magbits) + 1;
335 /* if the result is small enough, we can fit it in fast_ac table */
336 if (k >= -128 && k <= 127)
337 fast_ac[i] = (int16_t) ((k << 8) + (run << 4) + (len + magbits));
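/* The packed layout mirrors how the decoder later unpacks it:
 *    value  = fast_ac[i] >> 8         (signed, hence the -128..127 check)
 *    run    = (fast_ac[i] >> 4) & 15
 *    length = fast_ac[i] & 15         (huffman bits + magnitude bits consumed)
 * e.g. k = 3, run = 2, len + magbits = 5 packs to (3 << 8) + (2 << 4) + 5 = 0x325. */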
343 static void rjpeg_grow_buffer_unsafe(rjpeg_jpeg *j)
347 int b = j->nomore ? 0 : rjpeg_get8(j->s);
350 int c = rjpeg_get8(j->s);
354 j->marker = (unsigned char) c;
359 j->code_buffer |= b << (24 - j->code_bits);
361 } while (j->code_bits <= 24);
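/* Bit-buffer invariant maintained by the refill above: code_buffer holds the
 * next code_bits bits of entropy-coded data left-justified (MSB first), so
 * peeking at the top N bits is just (code_buffer >> (32 - N)).  The loop
 * refills one byte at a time until more than 24 bits are valid, and a 0xFF
 * byte followed by a non-zero byte is stashed in j->marker instead of being
 * fed into the buffer. */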
365 static uint32_t rjpeg_bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
367 /* decode a JPEG huffman value from the bitstream */
368 static INLINE int rjpeg_jpeg_huff_decode(rjpeg_jpeg *j, rjpeg_huffman *h)
373 if (j->code_bits < 16)
374 rjpeg_grow_buffer_unsafe(j);
376 /* look at the top FAST_BITS and determine what symbol ID it is,
377 * if the code is <= FAST_BITS */
378 c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
384 if (s > j->code_bits)
386 j->code_buffer <<= s;
391 /* naive test is to shift the code_buffer down so k bits are
392 * valid, then test against maxcode. To speed this up, we've
393 * preshifted maxcode left so that it has (16-k) 0s at the
394 * end; in other words, regardless of the number of bits, it
395 * wants to be compared against something shifted to have 16;
396 * that way we don't need to shift inside the loop. */
397 temp = j->code_buffer >> 16;
398 for (k=FAST_BITS+1 ; ; ++k)
399 if (temp < h->maxcode[k])
404 /* error! code not found */
409 if (k > j->code_bits)
412 /* convert the huffman code to the symbol id */
413 c = ((j->code_buffer >> (32 - k)) & rjpeg_bmask[k]) + h->delta[k];
415 /* convert the id to a symbol */
417 j->code_buffer <<= k;
421 /* bias[n] = (-1<<n) + 1 */
422 static int const rjpeg_jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
424 /* combined JPEG 'receive' and JPEG 'extend', since baseline
425 * always extends everything it receives. */
426 static INLINE int rjpeg_extend_receive(rjpeg_jpeg *j, int n)
430 if (j->code_bits < n)
431 rjpeg_grow_buffer_unsafe(j);
433 sgn = (int32_t)j->code_buffer >> 31; /* sign bit is always in MSB */
434 k = RJPEG_LROT(j->code_buffer, n);
435 j->code_buffer = k & ~rjpeg_bmask[n];
438 return k + (rjpeg_jbias[n] & ~sgn);
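/* Worked example of the combined receive/extend above: for n = 3 the bias is
 * rjpeg_jbias[3] = -7.  If the next three bits are 0b101 (top bit set), sgn is
 * all ones, ~sgn masks the bias away and the result is 5; if they are 0b010
 * (top bit clear) the bias applies and the result is 2 + (-7) = -5, matching
 * the EXTEND procedure in the JPEG spec. */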
441 /* get some unsigned bits */
442 static INLINE int rjpeg_jpeg_get_bits(rjpeg_jpeg *j, int n)
445 if (j->code_bits < n)
446 rjpeg_grow_buffer_unsafe(j);
447 k = RJPEG_LROT(j->code_buffer, n);
448 j->code_buffer = k & ~rjpeg_bmask[n];
454 static INLINE int rjpeg_jpeg_get_bit(rjpeg_jpeg *j)
457 if (j->code_bits < 1)
458 rjpeg_grow_buffer_unsafe(j);
461 j->code_buffer <<= 1;
463 return k & 0x80000000;
466 /* given a value that's at position X in the zigzag stream,
467 * where does it appear in the 8x8 matrix coded as row-major? */
468 static uint8_t rjpeg_jpeg_dezigzag[64+15] =
470 0, 1, 8, 16, 9, 2, 3, 10,
471 17, 24, 32, 25, 18, 11, 4, 5,
472 12, 19, 26, 33, 40, 48, 41, 34,
473 27, 20, 13, 6, 7, 14, 21, 28,
474 35, 42, 49, 56, 57, 50, 43, 36,
475 29, 22, 15, 23, 30, 37, 44, 51,
476 58, 59, 52, 45, 38, 31, 39, 46,
477 53, 60, 61, 54, 47, 55, 62, 63,
478 /* let corrupt input sample past end */
479 63, 63, 63, 63, 63, 63, 63, 63,
480 63, 63, 63, 63, 63, 63, 63
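/* Example: the third coefficient of the zigzag scan (index 2) lands at
 * rjpeg_jpeg_dezigzag[2] = 8, i.e. row 1, column 0 of the row-major 8x8 block,
 * and index 3 maps to 16 (row 2, column 0) before the scan turns back up
 * through 9 and 2. */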
483 /* decode one 64-entry block-- */
484 static int rjpeg_jpeg_decode_block(
485 rjpeg_jpeg *j, short data[64],
496 if (j->code_bits < 16)
497 rjpeg_grow_buffer_unsafe(j);
498 t = rjpeg_jpeg_huff_decode(j, hdc);
500 /* Bad huffman code. Corrupt JPEG? */
504 /* 0 all the ac values now so we can do it 32-bits at a time */
505 memset(data,0,64*sizeof(data[0]));
508 diff = rjpeg_extend_receive(j, t);
509 dc = j->img_comp[b].dc_pred + diff;
510 j->img_comp[b].dc_pred = dc;
511 data[0] = (short) (dc * dequant[0]);
513 /* decode AC components, see JPEG spec */
519 if (j->code_bits < 16)
520 rjpeg_grow_buffer_unsafe(j);
521 c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
526 k += (r >> 4) & 15; /* run */
527 s = r & 15; /* combined length */
528 j->code_buffer <<= s;
530 /* decode into unzigzag'd location */
531 zig = rjpeg_jpeg_dezigzag[k++];
532 data[zig] = (short) ((r >> 8) * dequant[zig]);
536 int rs = rjpeg_jpeg_huff_decode(j, hac);
538 /* Bad huffman code. Corrupt JPEG? */
547 break; /* end block */
553 /* decode into unzigzag'd location */
554 zig = rjpeg_jpeg_dezigzag[k++];
555 data[zig] = (short) (rjpeg_extend_receive(j,s) * dequant[zig]);
562 static int rjpeg_jpeg_decode_block_prog_dc(
568 /* Can't merge DC and AC. Corrupt JPEG? */
569 if (j->spec_end != 0)
572 if (j->code_bits < 16)
573 rjpeg_grow_buffer_unsafe(j);
575 if (j->succ_high == 0)
581 /* first scan for DC coefficient, must be first */
582 memset(data,0,64*sizeof(data[0])); /* 0 all the ac values now */
583 t = rjpeg_jpeg_huff_decode(j, hdc);
585 diff = rjpeg_extend_receive(j, t);
587 dc = j->img_comp[b].dc_pred + diff;
588 j->img_comp[b].dc_pred = dc;
589 data[0] = (short) (dc << j->succ_low);
593 /* refinement scan for DC coefficient */
594 if (rjpeg_jpeg_get_bit(j))
595 data[0] += (short) (1 << j->succ_low);
600 static int rjpeg_jpeg_decode_block_prog_ac(
608 /* Can't merge DC and AC. Corrupt JPEG? */
609 if (j->spec_start == 0)
612 if (j->succ_high == 0)
614 int shift = j->succ_low;
627 if (j->code_bits < 16)
628 rjpeg_grow_buffer_unsafe(j);
629 c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
634 k += (r >> 4) & 15; /* run */
635 s = r & 15; /* combined length */
636 j->code_buffer <<= s;
638 zig = rjpeg_jpeg_dezigzag[k++];
639 data[zig] = (short) ((r >> 8) << shift);
643 int rs = rjpeg_jpeg_huff_decode(j, hac);
645 /* Bad huffman code. Corrupt JPEG? */
655 j->eob_run = (1 << r);
657 j->eob_run += rjpeg_jpeg_get_bits(j, r);
666 zig = rjpeg_jpeg_dezigzag[k++];
667 data[zig] = (short) (rjpeg_extend_receive(j,s) << shift);
670 } while (k <= j->spec_end);
674 /* refinement scan for these AC coefficients */
676 short bit = (short) (1 << j->succ_low);
681 for (k = j->spec_start; k <= j->spec_end; ++k)
683 short *p = &data[rjpeg_jpeg_dezigzag[k]];
685 if (rjpeg_jpeg_get_bit(j))
701 int rs = rjpeg_jpeg_huff_decode(j, hac);
703 /* Bad huffman code. Corrupt JPEG? */
713 j->eob_run = (1 << r) - 1;
715 j->eob_run += rjpeg_jpeg_get_bits(j, r);
716 r = 64; /* force end of block */
720 /* r=15 s=0 should write 16 0s, so we just do
721 * a run of 15 0s and then write s (which is 0),
722 * so we don't have to do anything special here */
727 /* Bad huffman code. Corrupt JPEG? */
732 if (rjpeg_jpeg_get_bit(j))
739 while (k <= j->spec_end)
741 short *p = &data[rjpeg_jpeg_dezigzag[k++]];
744 if (rjpeg_jpeg_get_bit(j))
763 } while (k <= j->spec_end);
769 /* take a -128..127 value, clamp it, and convert to 0..255 */
770 static INLINE uint8_t rjpeg_clamp(int x)
772 /* trick to use a single test to catch both cases */
773 if ((unsigned int) x > 255)
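/* The single unsigned comparison above catches both directions at once:
 * x = -3 becomes 0xFFFFFFFD when viewed as unsigned (> 255, so it clamps to
 * 0), and x = 300 is > 255 directly (so it clamps to 255); values already in
 * 0..255 fall through untouched. */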
778 /* derived from jidctint -- DCT_ISLOW */
779 #define RJPEG_IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
780 int t0,t1,p4,p5,x0,x1,x2,x3; \
783 int p1 = (p2+p3) * RJPEG_F2F(0.5411961f); \
784 int t2 = p1 + p3 * RJPEG_F2F(-1.847759065f);\
785 int t3 = p1 + p2 * RJPEG_F2F( 0.765366865f);\
788 t0 = RJPEG_FSH(p2+p3); \
789 t1 = RJPEG_FSH(p2-p3); \
802 p5 = (p3+p4) * RJPEG_F2F( 1.175875602f); \
803 t0 = t0 * RJPEG_F2F( 0.298631336f); \
804 t1 = t1 * RJPEG_F2F( 2.053119869f); \
805 t2 = t2 * RJPEG_F2F( 3.072711026f); \
806 t3 = t3 * RJPEG_F2F( 1.501321110f); \
807 p1 = p5 + p1 * RJPEG_F2F(-0.899976223f); \
808 p2 = p5 + p2 * RJPEG_F2F(-2.562915447f); \
809 p3 = p3 * RJPEG_F2F(-1.961570560f); \
810 p4 = p4 * RJPEG_F2F(-0.390180644f); \
816 static void rjpeg_idct_block(uint8_t *out, int out_stride, short data[64])
818 int i,val[64],*v=val;
823 for (i = 0; i < 8; ++i,++d, ++v)
825 /* if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing */
834 /* no shortcut 0 seconds
835 * (1|2|3|4|5|6|7)==0 0 seconds
836 * all separate -0.047 seconds
837 * 1 && 2|3 && 4|5 && 6|7: -0.047 seconds */
838 int dcterm = d[0] << 2;
839 v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
843 RJPEG_IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56]);
845 /* constants scaled things up by 1<<12; let's bring them back
846 * down, but keep 2 extra bits of precision */
852 v[ 0] = (x0+t3) >> 10;
853 v[56] = (x0-t3) >> 10;
854 v[ 8] = (x1+t2) >> 10;
855 v[48] = (x1-t2) >> 10;
856 v[16] = (x2+t1) >> 10;
857 v[40] = (x2-t1) >> 10;
858 v[24] = (x3+t0) >> 10;
859 v[32] = (x3-t0) >> 10;
863 for (i = 0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride)
865 /* no fast case since the first 1D IDCT spread components out */
866 RJPEG_IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7]);
868 /* constants scaled things up by 1<<12, plus we had 1<<2 from first
869 * loop, plus horizontal and vertical each scale by sqrt(8) so together
870 * we've got an extra 1<<3, so 1<<17 total we need to remove.
871 * so we want to round that, which means adding 0.5 * 1<<17,
872 * aka 65536. Also, we'll end up with -128 to 127 that we want
873 * to encode as 0..255 by adding 128, so we'll add that before the shift
875 x0 += 65536 + (128<<17);
876 x1 += 65536 + (128<<17);
877 x2 += 65536 + (128<<17);
878 x3 += 65536 + (128<<17);
880 /* Tried computing the shifts into temps, or'ing the temps to see
881 * if any were out of range, but that was slower */
882 o[0] = rjpeg_clamp((x0+t3) >> 17);
883 o[7] = rjpeg_clamp((x0-t3) >> 17);
884 o[1] = rjpeg_clamp((x1+t2) >> 17);
885 o[6] = rjpeg_clamp((x1-t2) >> 17);
886 o[2] = rjpeg_clamp((x2+t1) >> 17);
887 o[5] = rjpeg_clamp((x2-t1) >> 17);
888 o[3] = rjpeg_clamp((x3+t0) >> 17);
889 o[4] = rjpeg_clamp((x3-t0) >> 17);
893 #if defined(__SSE2__)
894 /* sse2 integer IDCT. not the fastest possible implementation but it
895 * produces bit-identical results to the generic C version so it's
896 * fully "transparent".
898 static void rjpeg_idct_simd(uint8_t *out, int out_stride, short data[64])
900 /* This is constructed to match our regular (generic) integer IDCT exactly. */
901 __m128i row0, row1, row2, row3, row4, row5, row6, row7;
904 /* dot product constant: even elems=x, odd elems=y */
905 #define dct_const(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
907 /* out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit)
908 * out(1) = c1[even]*x + c1[odd]*y
910 #define dct_rot(out0,out1, x,y,c0,c1) \
911 __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
912 __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
913 __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
914 __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
915 __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
916 __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
918 /* out = in << 12 (in 16-bit, out 32-bit) */
919 #define dct_widen(out, in) \
920 __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
921 __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
924 #define dct_wadd(out, a, b) \
925 __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
926 __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
929 #define dct_wsub(out, a, b) \
930 __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
931 __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
933 /* butterfly a/b, add bias, then shift by "s" and pack */
934 #define dct_bfly32o(out0, out1, a,b,bias,s) \
936 __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
937 __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
938 dct_wadd(sum, abiased, b); \
939 dct_wsub(dif, abiased, b); \
940 out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
941 out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
944 /* 8-bit interleave step (for transposes) */
945 #define dct_interleave8(a, b) \
947 a = _mm_unpacklo_epi8(a, b); \
948 b = _mm_unpackhi_epi8(tmp, b)
950 /* 16-bit interleave step (for transposes) */
951 #define dct_interleave16(a, b) \
953 a = _mm_unpacklo_epi16(a, b); \
954 b = _mm_unpackhi_epi16(tmp, b)
956 #define dct_pass(bias,shift) \
959 dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
960 __m128i sum04 = _mm_add_epi16(row0, row4); \
961 __m128i dif04 = _mm_sub_epi16(row0, row4); \
962 dct_widen(t0e, sum04); \
963 dct_widen(t1e, dif04); \
964 dct_wadd(x0, t0e, t3e); \
965 dct_wsub(x3, t0e, t3e); \
966 dct_wadd(x1, t1e, t2e); \
967 dct_wsub(x2, t1e, t2e); \
969 dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
970 dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
971 __m128i sum17 = _mm_add_epi16(row1, row7); \
972 __m128i sum35 = _mm_add_epi16(row3, row5); \
973 dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
974 dct_wadd(x4, y0o, y4o); \
975 dct_wadd(x5, y1o, y5o); \
976 dct_wadd(x6, y2o, y5o); \
977 dct_wadd(x7, y3o, y4o); \
978 dct_bfly32o(row0,row7, x0,x7,bias,shift); \
979 dct_bfly32o(row1,row6, x1,x6,bias,shift); \
980 dct_bfly32o(row2,row5, x2,x5,bias,shift); \
981 dct_bfly32o(row3,row4, x3,x4,bias,shift); \
984 __m128i rot0_0 = dct_const(RJPEG_F2F(0.5411961f), RJPEG_F2F(0.5411961f) + RJPEG_F2F(-1.847759065f));
985 __m128i rot0_1 = dct_const(RJPEG_F2F(0.5411961f) + RJPEG_F2F( 0.765366865f), RJPEG_F2F(0.5411961f));
986 __m128i rot1_0 = dct_const(RJPEG_F2F(1.175875602f) + RJPEG_F2F(-0.899976223f), RJPEG_F2F(1.175875602f));
987 __m128i rot1_1 = dct_const(RJPEG_F2F(1.175875602f), RJPEG_F2F(1.175875602f) + RJPEG_F2F(-2.562915447f));
988 __m128i rot2_0 = dct_const(RJPEG_F2F(-1.961570560f) + RJPEG_F2F( 0.298631336f), RJPEG_F2F(-1.961570560f));
989 __m128i rot2_1 = dct_const(RJPEG_F2F(-1.961570560f), RJPEG_F2F(-1.961570560f) + RJPEG_F2F( 3.072711026f));
990 __m128i rot3_0 = dct_const(RJPEG_F2F(-0.390180644f) + RJPEG_F2F( 2.053119869f), RJPEG_F2F(-0.390180644f));
991 __m128i rot3_1 = dct_const(RJPEG_F2F(-0.390180644f), RJPEG_F2F(-0.390180644f) + RJPEG_F2F( 1.501321110f));
993 /* rounding biases in column/row passes, see rjpeg_idct_block for explanation. */
994 __m128i bias_0 = _mm_set1_epi32(512);
995 __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
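/* These mirror the scalar path exactly: 512 is the 0.5 * 2^10 rounding term
 * for the >> 10 column pass, and 65536 + (128 << 17) is the rounding term plus
 * the +128 re-centering applied before the >> 17 in rjpeg_idct_block's row
 * pass. */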
998 row0 = _mm_load_si128((const __m128i *) (data + 0*8));
999 row1 = _mm_load_si128((const __m128i *) (data + 1*8));
1000 row2 = _mm_load_si128((const __m128i *) (data + 2*8));
1001 row3 = _mm_load_si128((const __m128i *) (data + 3*8));
1002 row4 = _mm_load_si128((const __m128i *) (data + 4*8));
1003 row5 = _mm_load_si128((const __m128i *) (data + 5*8));
1004 row6 = _mm_load_si128((const __m128i *) (data + 6*8));
1005 row7 = _mm_load_si128((const __m128i *) (data + 7*8));
1008 dct_pass(bias_0, 10);
1011 /* 16bit 8x8 transpose pass 1 */
1012 dct_interleave16(row0, row4);
1013 dct_interleave16(row1, row5);
1014 dct_interleave16(row2, row6);
1015 dct_interleave16(row3, row7);
1017 /* transpose pass 2 */
1018 dct_interleave16(row0, row2);
1019 dct_interleave16(row1, row3);
1020 dct_interleave16(row4, row6);
1021 dct_interleave16(row5, row7);
1023 /* transpose pass 3 */
1024 dct_interleave16(row0, row1);
1025 dct_interleave16(row2, row3);
1026 dct_interleave16(row4, row5);
1027 dct_interleave16(row6, row7);
1031 dct_pass(bias_1, 17);
1035 __m128i p0 = _mm_packus_epi16(row0, row1); /* a0a1a2a3...a7b0b1b2b3...b7 */
1036 __m128i p1 = _mm_packus_epi16(row2, row3);
1037 __m128i p2 = _mm_packus_epi16(row4, row5);
1038 __m128i p3 = _mm_packus_epi16(row6, row7);
1040 /* 8bit 8x8 transpose pass 1 */
1041 dct_interleave8(p0, p2); /* a0e0a1e1... */
1042 dct_interleave8(p1, p3); /* c0g0c1g1... */
1044 /* transpose pass 2 */
1045 dct_interleave8(p0, p1); /* a0c0e0g0... */
1046 dct_interleave8(p2, p3); /* b0d0f0h0... */
1048 /* transpose pass 3 */
1049 dct_interleave8(p0, p2); /* a0b0c0d0... */
1050 dct_interleave8(p1, p3); /* a4b4c4d4... */
1053 _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
1054 _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
1055 _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
1056 _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
1057 _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
1058 _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
1059 _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
1060 _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
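/* Each packed register holds two of the eight output scanlines in its low and
 * high 8 bytes; _mm_storel_epi64 writes the low half, and the 0x4e shuffle
 * swaps the two 64-bit halves so the row sitting in the upper half can be
 * written with the same 8-byte store. */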
1069 #undef dct_interleave8
1070 #undef dct_interleave16
1078 /* NEON integer IDCT. should produce bit-identical
1079 * results to the generic C version. */
1080 static void rjpeg_idct_simd(uint8_t *out, int out_stride, short data[64])
1082 int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
1084 int16x4_t rot0_0 = vdup_n_s16(RJPEG_F2F(0.5411961f));
1085 int16x4_t rot0_1 = vdup_n_s16(RJPEG_F2F(-1.847759065f));
1086 int16x4_t rot0_2 = vdup_n_s16(RJPEG_F2F( 0.765366865f));
1087 int16x4_t rot1_0 = vdup_n_s16(RJPEG_F2F( 1.175875602f));
1088 int16x4_t rot1_1 = vdup_n_s16(RJPEG_F2F(-0.899976223f));
1089 int16x4_t rot1_2 = vdup_n_s16(RJPEG_F2F(-2.562915447f));
1090 int16x4_t rot2_0 = vdup_n_s16(RJPEG_F2F(-1.961570560f));
1091 int16x4_t rot2_1 = vdup_n_s16(RJPEG_F2F(-0.390180644f));
1092 int16x4_t rot3_0 = vdup_n_s16(RJPEG_F2F( 0.298631336f));
1093 int16x4_t rot3_1 = vdup_n_s16(RJPEG_F2F( 2.053119869f));
1094 int16x4_t rot3_2 = vdup_n_s16(RJPEG_F2F( 3.072711026f));
1095 int16x4_t rot3_3 = vdup_n_s16(RJPEG_F2F( 1.501321110f));
1097 #define dct_long_mul(out, inq, coeff) \
1098 int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
1099 int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
1101 #define dct_long_mac(out, acc, inq, coeff) \
1102 int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
1103 int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
1105 #define dct_widen(out, inq) \
1106 int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
1107 int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
1110 #define dct_wadd(out, a, b) \
1111 int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
1112 int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
1115 #define dct_wsub(out, a, b) \
1116 int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
1117 int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
1119 /* butterfly a/b, then shift using "shiftop" by "s" and pack */
1120 #define dct_bfly32o(out0,out1, a,b,shiftop,s) \
1122 dct_wadd(sum, a, b); \
1123 dct_wsub(dif, a, b); \
1124 out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
1125 out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
1128 #define dct_pass(shiftop, shift) \
1131 int16x8_t sum26 = vaddq_s16(row2, row6); \
1132 dct_long_mul(p1e, sum26, rot0_0); \
1133 dct_long_mac(t2e, p1e, row6, rot0_1); \
1134 dct_long_mac(t3e, p1e, row2, rot0_2); \
1135 int16x8_t sum04 = vaddq_s16(row0, row4); \
1136 int16x8_t dif04 = vsubq_s16(row0, row4); \
1137 dct_widen(t0e, sum04); \
1138 dct_widen(t1e, dif04); \
1139 dct_wadd(x0, t0e, t3e); \
1140 dct_wsub(x3, t0e, t3e); \
1141 dct_wadd(x1, t1e, t2e); \
1142 dct_wsub(x2, t1e, t2e); \
1144 int16x8_t sum15 = vaddq_s16(row1, row5); \
1145 int16x8_t sum17 = vaddq_s16(row1, row7); \
1146 int16x8_t sum35 = vaddq_s16(row3, row5); \
1147 int16x8_t sum37 = vaddq_s16(row3, row7); \
1148 int16x8_t sumodd = vaddq_s16(sum17, sum35); \
1149 dct_long_mul(p5o, sumodd, rot1_0); \
1150 dct_long_mac(p1o, p5o, sum17, rot1_1); \
1151 dct_long_mac(p2o, p5o, sum35, rot1_2); \
1152 dct_long_mul(p3o, sum37, rot2_0); \
1153 dct_long_mul(p4o, sum15, rot2_1); \
1154 dct_wadd(sump13o, p1o, p3o); \
1155 dct_wadd(sump24o, p2o, p4o); \
1156 dct_wadd(sump23o, p2o, p3o); \
1157 dct_wadd(sump14o, p1o, p4o); \
1158 dct_long_mac(x4, sump13o, row7, rot3_0); \
1159 dct_long_mac(x5, sump24o, row5, rot3_1); \
1160 dct_long_mac(x6, sump23o, row3, rot3_2); \
1161 dct_long_mac(x7, sump14o, row1, rot3_3); \
1162 dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
1163 dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
1164 dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
1165 dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
1169 row0 = vld1q_s16(data + 0*8);
1170 row1 = vld1q_s16(data + 1*8);
1171 row2 = vld1q_s16(data + 2*8);
1172 row3 = vld1q_s16(data + 3*8);
1173 row4 = vld1q_s16(data + 4*8);
1174 row5 = vld1q_s16(data + 5*8);
1175 row6 = vld1q_s16(data + 6*8);
1176 row7 = vld1q_s16(data + 7*8);
1179 row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
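/* The 1024 added to the DC lane above is the +128 level shift folded into the
 * transform input: a pure DC value of d contributes d/8 to every output
 * sample, so 1024/8 = 128 ends up added to each pixel, taking the place of the
 * explicit "+ (128 << 17)" bias used by the scalar and SSE2 paths. */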
1182 dct_pass(vrshrn_n_s32, 10);
1184 /* 16bit 8x8 transpose */
1186 /* these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
1187 * whether compilers actually get this is another story, sadly. */
1188 #define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
1189 #define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
1190 #define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
1193 dct_trn16(row0, row1); /* a0b0a2b2a4b4a6b6 */
1194 dct_trn16(row2, row3);
1195 dct_trn16(row4, row5);
1196 dct_trn16(row6, row7);
1199 dct_trn32(row0, row2); /* a0b0c0d0a4b4c4d4 */
1200 dct_trn32(row1, row3);
1201 dct_trn32(row4, row6);
1202 dct_trn32(row5, row7);
1205 dct_trn64(row0, row4); /* a0b0c0d0e0f0g0h0 */
1206 dct_trn64(row1, row5);
1207 dct_trn64(row2, row6);
1208 dct_trn64(row3, row7);
1216 * vrshrn_n_s32 only supports shifts up to 16, but we need
1217 * 17, so do a non-rounding shift of 16 first and then follow
1218 * up with a rounding shift by 1. */
1219 dct_pass(vshrn_n_s32, 16);
1222 /* pack and round */
1223 uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
1224 uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
1225 uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
1226 uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
1227 uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
1228 uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
1229 uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
1230 uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
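/* Because dct_pass above used the non-rounding vshrn_n_s32 by 16, these
 * vqrshrun_n_s16(..., 1) calls finish the job: a rounding shift by one more
 * bit plus unsigned saturation, which together reproduce the scalar path's
 * rounded >> 17 followed by rjpeg_clamp. */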
1232 /* again, these can translate into one instruction, but often don't. */
1233 #define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
1234 #define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
1235 #define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
1237 /* sadly can't use interleaved stores here since we only write
1238 * 8 bytes to each scan line! */
1240 /* 8x8 8-bit transpose pass 1 */
1247 dct_trn8_16(p0, p2);
1248 dct_trn8_16(p1, p3);
1249 dct_trn8_16(p4, p6);
1250 dct_trn8_16(p5, p7);
1253 dct_trn8_32(p0, p4);
1254 dct_trn8_32(p1, p5);
1255 dct_trn8_32(p2, p6);
1256 dct_trn8_32(p3, p7);
1289 #endif /* RJPEG_NEON */
1291 static uint8_t rjpeg_get_marker(rjpeg_jpeg *j)
1295 if (j->marker != RJPEG_MARKER_NONE)
1298 j->marker = RJPEG_MARKER_NONE;
1302 x = rjpeg_get8(j->s);
1304 return RJPEG_MARKER_NONE;
1306 x = rjpeg_get8(j->s);
1310 /* after a restart interval, rjpeg_jpeg_reset the entropy decoder and the DC prediction */
1313 static void rjpeg_jpeg_reset(rjpeg_jpeg *j)
1318 j->img_comp[0].dc_pred = 0;
1319 j->img_comp[1].dc_pred = 0;
1320 j->img_comp[2].dc_pred = 0;
1321 j->marker = RJPEG_MARKER_NONE;
1322 j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
1325 /* no more than 1<<31 MCUs if no restart_interval? that's plenty safe,
1326 * since we don't even allow 1<<30 pixels */
1329 static int rjpeg_parse_entropy_coded_data(rjpeg_jpeg *z)
1331 rjpeg_jpeg_reset(z);
1336 int n = z->order[0];
1337 int w = (z->img_comp[n].x+7) >> 3;
1338 int h = (z->img_comp[n].y+7) >> 3;
1340 /* non-interleaved data, we just need to process one block at a time,
1341 * in trivial scanline order
1342 * number of blocks to do just depends on how many actual "pixels" this
1343 * component has, independent of interleaved MCU blocking and such */
1347 for (j = 0; j < h; ++j)
1349 for (i = 0; i < w; ++i)
1351 short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
1353 if (z->spec_start == 0)
1355 if (!rjpeg_jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
1360 int ha = z->img_comp[n].ha;
1361 if (!rjpeg_jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
1365 /* every data block is an MCU, so count down the restart interval */
1368 if (z->code_bits < 24)
1369 rjpeg_grow_buffer_unsafe(z);
1371 if (!RJPEG_RESTART(z->marker))
1373 rjpeg_jpeg_reset(z);
1380 RJPEG_SIMD_ALIGN(short, data[64]);
1382 for (j = 0; j < h; ++j)
1384 for (i = 0; i < w; ++i)
1386 int ha = z->img_comp[n].ha;
1387 if (!rjpeg_jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd,
1388 z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq]))
1391 z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8,
1392 z->img_comp[n].w2, data);
1394 /* every data block is an MCU, so count down the restart interval */
1397 if (z->code_bits < 24)
1398 rjpeg_grow_buffer_unsafe(z);
1400 /* if it's NOT a restart, then just bail,
1401 * so we get corrupt data rather than no data */
1402 if (!RJPEG_RESTART(z->marker))
1404 rjpeg_jpeg_reset(z);
1417 for (j = 0; j < z->img_mcu_y; ++j)
1419 for (i = 0; i < z->img_mcu_x; ++i)
1421 /* scan an interleaved MCU... process scan_n components in order */
1422 for (k = 0; k < z->scan_n; ++k)
1424 int n = z->order[k];
1425 /* scan out an MCU's worth of this component; that's just determined
1426 * by the basic H and V specified for the component */
1427 for (y = 0; y < z->img_comp[n].v; ++y)
1429 for (x = 0; x < z->img_comp[n].h; ++x)
1431 int x2 = (i*z->img_comp[n].h + x);
1432 int y2 = (j*z->img_comp[n].v + y);
1433 short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
1434 if (!rjpeg_jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
1440 /* after all interleaved components, that's an interleaved MCU,
1441 * so now count down the restart interval */
1444 if (z->code_bits < 24)
1445 rjpeg_grow_buffer_unsafe(z);
1446 if (!RJPEG_RESTART(z->marker))
1448 rjpeg_jpeg_reset(z);
1455 RJPEG_SIMD_ALIGN(short, data[64]);
1457 for (j = 0; j < z->img_mcu_y; ++j)
1459 for (i = 0; i < z->img_mcu_x; ++i)
1461 /* scan an interleaved MCU... process scan_n components in order */
1462 for (k = 0; k < z->scan_n; ++k)
1464 int n = z->order[k];
1465 /* scan out an MCU's worth of this component; that's just determined
1466 * by the basic H and V specified for the component */
1467 for (y = 0; y < z->img_comp[n].v; ++y)
1469 for (x = 0; x < z->img_comp[n].h; ++x)
1471 int x2 = (i*z->img_comp[n].h + x)*8;
1472 int y2 = (j*z->img_comp[n].v + y)*8;
1473 int ha = z->img_comp[n].ha;
1475 if (!rjpeg_jpeg_decode_block(z, data,
1476 z->huff_dc+z->img_comp[n].hd,
1477 z->huff_ac+ha, z->fast_ac[ha],
1478 n, z->dequant[z->img_comp[n].tq]))
1481 z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2,
1482 z->img_comp[n].w2, data);
1487 /* after all interleaved components, that's an interleaved MCU,
1488 * so now count down the restart interval */
1491 if (z->code_bits < 24)
1492 rjpeg_grow_buffer_unsafe(z);
1493 if (!RJPEG_RESTART(z->marker))
1495 rjpeg_jpeg_reset(z);
1505 static void rjpeg_jpeg_dequantize(short *data, uint8_t *dequant)
1508 for (i = 0; i < 64; ++i)
1509 data[i] *= dequant[i];
1512 static void rjpeg_jpeg_finish(rjpeg_jpeg *z)
1516 if (!z->progressive)
1519 /* dequantize and IDCT the data */
1520 for (n = 0; n < z->s->img_n; ++n)
1522 int w = (z->img_comp[n].x+7) >> 3;
1523 int h = (z->img_comp[n].y+7) >> 3;
1524 for (j = 0; j < h; ++j)
1526 for (i = 0; i < w; ++i)
1528 short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
1529 rjpeg_jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
1530 z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8,
1531 z->img_comp[n].w2, data);
1537 static int rjpeg_process_marker(rjpeg_jpeg *z, int m)
1542 case RJPEG_MARKER_NONE: /* no marker found */
1543 /* Expected marker. Corrupt JPEG? */
1546 case 0xDD: /* DRI - specify restart interval */
1548 /* Bad DRI length. Corrupt JPEG? */
1549 if (RJPEG_GET16BE(z->s) != 4)
1552 z->restart_interval = RJPEG_GET16BE(z->s);
1555 case 0xDB: /* DQT - define quantization table */
1556 L = RJPEG_GET16BE(z->s)-2;
1559 int q = rjpeg_get8(z->s);
1563 /* Bad DQT type. Corrupt JPEG? */
1567 /* Bad DQT table. Corrupt JPEG? */
1571 for (i = 0; i < 64; ++i)
1572 z->dequant[t][rjpeg_jpeg_dezigzag[i]] = rjpeg_get8(z->s);
1577 case 0xC4: /* DHT - define huffman table */
1578 L = RJPEG_GET16BE(z->s)-2;
1581 int sizes[16],i,n = 0;
1583 int q = rjpeg_get8(z->s);
1587 /* Bad DHT header. Corrupt JPEG? */
1588 if (tc > 1 || th > 3)
1591 for (i = 0; i < 16; ++i)
1593 sizes[i] = rjpeg_get8(z->s);
1600 if (!rjpeg_build_huffman(z->huff_dc+th, sizes))
1602 v = z->huff_dc[th].values;
1606 if (!rjpeg_build_huffman(z->huff_ac+th, sizes))
1608 v = z->huff_ac[th].values;
1610 for (i = 0; i < n; ++i)
1611 v[i] = rjpeg_get8(z->s);
1613 rjpeg_build_fast_ac(z->fast_ac[th], z->huff_ac + th);
1619 /* check for comment block or APP blocks */
1620 if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE)
1622 int n = RJPEG_GET16BE(z->s)-2;
1625 z->s->img_buffer = z->s->img_buffer_end;
1627 z->s->img_buffer += n;
1634 /* after we see SOS */
1635 static int rjpeg_process_scan_header(rjpeg_jpeg *z)
1639 int Ls = RJPEG_GET16BE(z->s);
1641 z->scan_n = rjpeg_get8(z->s);
1643 /* Bad SOS component count. Corrupt JPEG? */
1644 if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n)
1647 /* Bad SOS length. Corrupt JPEG? */
1648 if (Ls != 6+2*z->scan_n)
1651 for (i = 0; i < z->scan_n; ++i)
1654 int id = rjpeg_get8(z->s);
1655 int q = rjpeg_get8(z->s);
1657 for (which = 0; which < z->s->img_n; ++which)
1658 if (z->img_comp[which].id == id)
1660 if (which == z->s->img_n)
1661 return 0; /* no match */
1663 /* Bad DC huff. Corrupt JPEG? */
1664 z->img_comp[which].hd = q >> 4; if (z->img_comp[which].hd > 3)
1667 /* Bad AC huff. Corrupt JPEG? */
1668 z->img_comp[which].ha = q & 15; if (z->img_comp[which].ha > 3)
1671 z->order[i] = which;
1674 z->spec_start = rjpeg_get8(z->s);
1675 z->spec_end = rjpeg_get8(z->s); /* should be 63, but might be 0 */
1676 aa = rjpeg_get8(z->s);
1677 z->succ_high = (aa >> 4);
1678 z->succ_low = (aa & 15);
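/* For a baseline (sequential) scan the JPEG spec fixes these fields:
 * spec_start (Ss) = 0, spec_end (Se) = 63 and succ_high/succ_low (Ah/Al) = 0;
 * only progressive scans use other values, which is exactly what the two
 * validation branches below enforce. */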
1682 /* Bad SOS. Corrupt JPEG? */
1683 if ( z->spec_start > 63 ||
1685 z->spec_start > z->spec_end ||
1686 z->succ_high > 13 ||
1692 /* Bad SOS. Corrupt JPEG? */
1693 if (z->spec_start != 0)
1695 if (z->succ_high != 0 || z->succ_low != 0)
1704 static int rjpeg_process_frame_header(rjpeg_jpeg *z, int scan)
1706 rjpeg_context *s = z->s;
1707 int Lf,p,i,q, h_max=1,v_max=1,c;
1708 Lf = RJPEG_GET16BE(s);
1712 /* Bad SOF len. Corrupt JPEG? */
1720 /* Only 8-bit. JPEG format not supported? */
1724 s->img_y = RJPEG_GET16BE(s);
1726 /* Legal, but we don't handle it--but neither does IJG */
1728 /* No header height, JPEG format not supported? */
1732 s->img_x = RJPEG_GET16BE(s);
1734 /* No header width. Corrupt JPEG? */
1742 /* Bad component count. Corrupt JPEG? */
1743 if (c != 3 && c != 1)
1748 for (i = 0; i < c; ++i)
1750 z->img_comp[i].data = NULL;
1751 z->img_comp[i].linebuf = NULL;
1754 /* Bad SOF length. Corrupt JPEG? */
1755 if (Lf != 8+3*s->img_n)
1758 for (i = 0; i < s->img_n; ++i)
1760 z->img_comp[i].id = rjpeg_get8(s);
1761 if (z->img_comp[i].id != i+1) /* JFIF requires */
1762 if (z->img_comp[i].id != i) /* some version of jpegtran outputs non-JFIF-compliant files! */
1766 z->img_comp[i].h = (q >> 4);
1768 /* Bad H. Corrupt JPEG? */
1769 if (!z->img_comp[i].h || z->img_comp[i].h > 4)
1772 z->img_comp[i].v = q & 15;
1774 /* Bad V. Corrupt JPEG? */
1775 if (!z->img_comp[i].v || z->img_comp[i].v > 4)
1778 z->img_comp[i].tq = rjpeg_get8(s);
1780 /* Bad TQ. Corrupt JPEG? */
1781 if (z->img_comp[i].tq > 3)
1785 if (scan != RJPEG_SCAN_LOAD)
1788 /* Image too large to decode? */
1789 if ((1 << 30) / s->img_x / s->img_n < s->img_y)
1792 for (i = 0; i < s->img_n; ++i)
1794 if (z->img_comp[i].h > h_max)
1795 h_max = z->img_comp[i].h;
1796 if (z->img_comp[i].v > v_max)
1797 v_max = z->img_comp[i].v;
1800 /* compute interleaved MCU info */
1801 z->img_h_max = h_max;
1802 z->img_v_max = v_max;
1803 z->img_mcu_w = h_max * 8;
1804 z->img_mcu_h = v_max * 8;
1805 z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
1806 z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
1810 for (i = 0; i < s->img_n; ++i)
1812 /* number of effective pixels (e.g. for non-interleaved MCU) */
1813 z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
1814 z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
1816 /* to simplify generation, we'll allocate enough memory to decode
1817 * the bogus oversized data from using interleaved MCUs and their
1818 * big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
1819 * discard the extra data until colorspace conversion */
1820 z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
1821 z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
1822 z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
1824 /* Out of memory? */
1825 if (!z->img_comp[i].raw_data)
1827 for (--i; i >= 0; --i)
1829 free(z->img_comp[i].raw_data);
1830 z->img_comp[i].data = NULL;
1836 /* align blocks for IDCT using MMX/SSE */
1837 z->img_comp[i].data = (uint8_t*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
1838 z->img_comp[i].linebuf = NULL;
1839 z->img_comp[i].coeff_w = (z->img_comp[i].w2 + 7) >> 3;
1840 z->img_comp[i].coeff_h = (z->img_comp[i].h2 + 7) >> 3;
1841 z->img_comp[i].raw_coeff = malloc(z->img_comp[i].coeff_w *
1842 z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);
1843 z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
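/* Alignment sketch for the "+ 15 then & ~15" adjustments above: a raw pointer
 * value of, say, 0x1003 becomes 0x1012 after adding 15 and 0x1010 after
 * masking, i.e. the first 16-byte boundary at or past the allocation, which is
 * why 15 spare bytes are requested from malloc. */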
1848 for (i = 0; i < s->img_n; ++i)
1850 /* number of effective pixels (e.g. for non-interleaved MCU) */
1851 z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
1852 z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
1854 /* to simplify generation, we'll allocate enough memory to decode
1855 * the bogus oversized data from using interleaved MCUs and their
1856 * big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
1857 * discard the extra data until colorspace conversion */
1858 z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
1859 z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
1860 z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
1862 /* Out of memory? */
1863 if (!z->img_comp[i].raw_data)
1865 for (--i; i >= 0; --i)
1867 free(z->img_comp[i].raw_data);
1868 z->img_comp[i].data = NULL;
1872 /* align blocks for IDCT using MMX/SSE */
1873 z->img_comp[i].data = (uint8_t*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
1874 z->img_comp[i].linebuf = NULL;
1875 z->img_comp[i].coeff = 0;
1876 z->img_comp[i].raw_coeff = 0;
1883 static int rjpeg_decode_jpeg_header(rjpeg_jpeg *z, int scan)
1886 z->marker = RJPEG_MARKER_NONE; /* initialize cached marker to empty */
1887 m = rjpeg_get_marker(z);
1889 /* No SOI. Corrupt JPEG? */
1890 if (m != JPEG_MARKER_SOI)
1893 if (scan == RJPEG_SCAN_TYPE)
1896 m = rjpeg_get_marker(z);
1897 while (!RJPEG_SOF(m))
1899 if (!rjpeg_process_marker(z,m))
1901 m = rjpeg_get_marker(z);
1902 while (m == RJPEG_MARKER_NONE)
1904 /* some files have extra padding after their blocks, so ok, we'll scan */
1906 /* No SOF. Corrupt JPEG? */
1907 if (RJPEG_AT_EOF(z->s))
1910 m = rjpeg_get_marker(z);
1913 z->progressive = RJPEG_SOF_PROGRESSIVE(m);
1914 if (!rjpeg_process_frame_header(z, scan))
1919 /* decode image to YCbCr format */
1920 static int rjpeg_decode_jpeg_image(rjpeg_jpeg *j)
1923 for (m = 0; m < 4; m++)
1925 j->img_comp[m].raw_data = NULL;
1926 j->img_comp[m].raw_coeff = NULL;
1928 j->restart_interval = 0;
1929 if (!rjpeg_decode_jpeg_header(j, RJPEG_SCAN_LOAD))
1931 m = rjpeg_get_marker(j);
1933 while (m != JPEG_MARKER_EOI)
1935 if (m == JPEG_MARKER_SOS)
1937 if (!rjpeg_process_scan_header(j))
1939 if (!rjpeg_parse_entropy_coded_data(j))
1942 if (j->marker == RJPEG_MARKER_NONE )
1944 /* handle 0s at the end of image data from IP Kamera 9060 */
1946 while (!RJPEG_AT_EOF(j->s))
1948 int x = rjpeg_get8(j->s);
1951 j->marker = rjpeg_get8(j->s);
1954 else if (x != 0) /* Junk before marker. Corrupt JPEG? */
1958 /* if we reach eof without hitting a marker,
1959 * rjpeg_get_marker() below will fail and we'll eventually return 0 */
1964 if (!rjpeg_process_marker(j, m))
1967 m = rjpeg_get_marker(j);
1971 rjpeg_jpeg_finish(j);
1975 /* static jfif-centered resampling (across block boundaries) */
1977 static uint8_t *rjpeg_resample_row_1(uint8_t *out, uint8_t *in_near,
1978 uint8_t *in_far, int w, int hs)
1987 static uint8_t* rjpeg_resample_row_v_2(uint8_t *out, uint8_t *in_near,
1988 uint8_t *in_far, int w, int hs)
1990 /* need to generate two samples vertically for every one in input */
1993 for (i = 0; i < w; ++i)
1994 out[i] = RJPEG_DIV4(3*in_near[i] + in_far[i] + 2);
1998 static uint8_t* rjpeg_resample_row_h_2(uint8_t *out, uint8_t *in_near,
1999 uint8_t *in_far, int w, int hs)
2001 /* need to generate two samples horizontally for every one in input */
2003 uint8_t *input = in_near;
2007 /* if only one sample, can't do any interpolation */
2008 out[0] = out[1] = input[0];
2013 out[1] = RJPEG_DIV4(input[0]*3 + input[1] + 2);
2015 for (i=1; i < w-1; ++i)
2017 int n = 3 * input[i] + 2;
2018 out[i*2+0] = RJPEG_DIV4(n+input[i-1]);
2019 out[i*2+1] = RJPEG_DIV4(n+input[i+1]);
2021 out[i*2+0] = RJPEG_DIV4(input[w-2]*3 + input[w-1] + 2);
2022 out[i*2+1] = input[w-1];
2030 static uint8_t *rjpeg_resample_row_hv_2(uint8_t *out, uint8_t *in_near,
2031 uint8_t *in_far, int w, int hs)
2033 /* need to generate 2x2 samples for every one in input */
2037 out[0] = out[1] = RJPEG_DIV4(3*in_near[0] + in_far[0] + 2);
2041 t1 = 3*in_near[0] + in_far[0];
2042 out[0] = RJPEG_DIV4(t1+2);
2044 for (i = 1; i < w; ++i)
2047 t1 = 3*in_near[i]+in_far[i];
2048 out[i*2-1] = RJPEG_DIV16(3*t0 + t1 + 8);
2049 out[i*2 ] = RJPEG_DIV16(3*t1 + t0 + 8);
2051 out[w*2-1] = RJPEG_DIV4(t1+2);
2058 #if defined(__SSE2__) || defined(RJPEG_NEON)
2059 static uint8_t *rjpeg_resample_row_hv_2_simd(uint8_t *out, uint8_t *in_near,
2060 uint8_t *in_far, int w, int hs)
2062 /* need to generate 2x2 samples for every one in input */
2067 out[0] = out[1] = RJPEG_DIV4(3*in_near[0] + in_far[0] + 2);
2071 t1 = 3*in_near[0] + in_far[0];
2072 /* process groups of 8 pixels for as long as we can.
2073 * note we can't handle the last pixel in a row in this loop
2074 * because we need to handle the filter boundary conditions.
2076 for (; i < ((w-1) & ~7); i += 8)
2078 #if defined(__SSE2__)
2079 /* load and perform the vertical filtering pass
2080 * this uses 3*x + y = 4*x + (y - x) */
2081 __m128i zero = _mm_setzero_si128();
2082 __m128i farb = _mm_loadl_epi64((__m128i *) (in_far + i));
2083 __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
2084 __m128i farw = _mm_unpacklo_epi8(farb, zero);
2085 __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
2086 __m128i diff = _mm_sub_epi16(farw, nearw);
2087 __m128i nears = _mm_slli_epi16(nearw, 2);
2088 __m128i curr = _mm_add_epi16(nears, diff); /* current row */
2090 /* horizontal filter works the same based on shifted versions of the current
2091 * row. "prev" is current row shifted right by 1 pixel; we need to
2092 * insert the previous pixel value (from t1).
2093 * "next" is current row shifted left by 1 pixel, with first pixel
2094 * of next block of 8 pixels added in.
2096 __m128i prv0 = _mm_slli_si128(curr, 2);
2097 __m128i nxt0 = _mm_srli_si128(curr, 2);
2098 __m128i prev = _mm_insert_epi16(prv0, t1, 0);
2099 __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);
2101 /* horizontal filter, polyphase implementation since it's convenient:
2102 * even pixels = 3*cur + prev = cur*4 + (prev - cur)
2103 * odd pixels = 3*cur + next = cur*4 + (next - cur)
2104 * note the shared term. */
2105 __m128i bias = _mm_set1_epi16(8);
2106 __m128i curs = _mm_slli_epi16(curr, 2);
2107 __m128i prvd = _mm_sub_epi16(prev, curr);
2108 __m128i nxtd = _mm_sub_epi16(next, curr);
2109 __m128i curb = _mm_add_epi16(curs, bias);
2110 __m128i even = _mm_add_epi16(prvd, curb);
2111 __m128i odd = _mm_add_epi16(nxtd, curb);
2113 /* interleave even and odd pixels, then undo scaling. */
2114 __m128i int0 = _mm_unpacklo_epi16(even, odd);
2115 __m128i int1 = _mm_unpackhi_epi16(even, odd);
2116 __m128i de0 = _mm_srli_epi16(int0, 4);
2117 __m128i de1 = _mm_srli_epi16(int1, 4);
2119 /* pack and write output */
2120 __m128i outv = _mm_packus_epi16(de0, de1);
2121 _mm_storeu_si128((__m128i *) (out + i*2), outv);
2122 #elif defined(RJPEG_NEON)
2123 /* load and perform the vertical filtering pass
2124 * this uses 3*x + y = 4*x + (y - x) */
2125 uint8x8_t farb = vld1_u8(in_far + i);
2126 uint8x8_t nearb = vld1_u8(in_near + i);
2127 int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
2128 int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
2129 int16x8_t curr = vaddq_s16(nears, diff); /* current row */
2131 /* horizontal filter works the same based on shifted versions of the current
2132 * row. "prev" is current row shifted right by 1 pixel; we need to
2133 * insert the previous pixel value (from t1).
2134 * "next" is current row shifted left by 1 pixel, with first pixel
2135 * of next block of 8 pixels added in. */
2136 int16x8_t prv0 = vextq_s16(curr, curr, 7);
2137 int16x8_t nxt0 = vextq_s16(curr, curr, 1);
2138 int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
2139 int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);
2141 /* horizontal filter, polyphase implementation since it's convenient:
2142 * even pixels = 3*cur + prev = cur*4 + (prev - cur)
2143 * odd pixels = 3*cur + next = cur*4 + (next - cur)
2144 * note the shared term.
2146 int16x8_t curs = vshlq_n_s16(curr, 2);
2147 int16x8_t prvd = vsubq_s16(prev, curr);
2148 int16x8_t nxtd = vsubq_s16(next, curr);
2149 int16x8_t even = vaddq_s16(curs, prvd);
2150 int16x8_t odd = vaddq_s16(curs, nxtd);
2152 /* undo scaling and round, then store with even/odd phases interleaved */
2154 o.val[0] = vqrshrun_n_s16(even, 4);
2155 o.val[1] = vqrshrun_n_s16(odd, 4);
2156 vst2_u8(out + i*2, o);
2159 /* "previous" value for next iteration */
2160 t1 = 3*in_near[i+7] + in_far[i+7];
2164 t1 = 3*in_near[i] + in_far[i];
2165 out[i*2] = RJPEG_DIV16(3*t1 + t0 + 8);
2167 for (++i; i < w; ++i)
2170 t1 = 3*in_near[i]+in_far[i];
2171 out[i*2-1] = RJPEG_DIV16(3*t0 + t1 + 8);
2172 out[i*2 ] = RJPEG_DIV16(3*t1 + t0 + 8);
2174 out[w*2-1] = RJPEG_DIV4(t1+2);
2182 static uint8_t *rjpeg_resample_row_generic(uint8_t *out,
2183 uint8_t *in_near, uint8_t *in_far, int w, int hs)
2185 /* resample with nearest-neighbor */
2189 for (i = 0; i < w; ++i)
2190 for (j = 0; j < hs; ++j)
2191 out[i*hs+j] = in_near[i];
2195 /* this is a reduced-precision calculation of YCbCr-to-RGB introduced
2196 * to make sure the code produces the same results in both SIMD and scalar */
2198 #define FLOAT2FIXED(x) (((int) ((x) * 4096.0f + 0.5f)) << 8)
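/* Worked example of the fixed-point scale: FLOAT2FIXED(1.40200f) is
 * ((int)(1.402f * 4096.0f + 0.5f)) << 8 = 5743 << 8, i.e. a value with a
 * 20-bit fraction, matching the (y[i] << 20) + (1 << 19) term below so that
 * the chroma products and the shifted luma line up before the final shift
 * back down. */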
2201 static void rjpeg_YCbCr_to_RGB_row(uint8_t *out, const uint8_t *y,
2202 const uint8_t *pcb, const uint8_t *pcr, int count, int step)
2205 for (i = 0; i < count; ++i)
2207 int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
2208 int cr = pcr[i] - 128;
2209 int cb = pcb[i] - 128;
2210 int r = y_fixed + cr* FLOAT2FIXED(1.40200f);
2211 int g = y_fixed + (cr*-FLOAT2FIXED(0.71414f)) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
2212 int b = y_fixed + cb* FLOAT2FIXED(1.77200f);
2216 if ((unsigned) r > 255)
2218 if ((unsigned) g > 255)
2220 if ((unsigned) b > 255)
2222 out[0] = (uint8_t)r;
2223 out[1] = (uint8_t)g;
2224 out[2] = (uint8_t)b;
2230 #if defined(__SSE2__) || defined(RJPEG_NEON)
2231 static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
2232 const uint8_t *pcb, const uint8_t *pcr, int count, int step)
2236 #if defined(__SSE2__)
2237 /* step == 3 is pretty ugly on the final interleave, and i'm not convinced
2238 * it's useful in practice (you wouldn't use it for textures, for example).
2239 * so just accelerate step == 4 case.
2243 /* this is a fairly straightforward implementation and not super-optimized. */
2244 __m128i signflip = _mm_set1_epi8(-0x80);
2245 __m128i cr_const0 = _mm_set1_epi16( (short) ( 1.40200f*4096.0f+0.5f));
2246 __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
2247 __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
2248 __m128i cb_const1 = _mm_set1_epi16( (short) ( 1.77200f*4096.0f+0.5f));
2249 __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
2250 __m128i xw = _mm_set1_epi16(255); /* alpha channel */
2252 for (; i+7 < count; i += 8)
2255 __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
2256 __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
2257 __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
2258 __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); /* -128 */
2259 __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); /* -128 */
2261 /* unpack to short (and left-shift cr, cb by 8) */
2262 __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes);
2263 __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
2264 __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
2266 /* color transform */
2267 __m128i yws = _mm_srli_epi16(yw, 4);
2268 __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
2269 __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
2270 __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
2271 __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
2272 __m128i rws = _mm_add_epi16(cr0, yws);
2273 __m128i gwt = _mm_add_epi16(cb0, yws);
2274 __m128i bws = _mm_add_epi16(yws, cb1);
2275 __m128i gws = _mm_add_epi16(gwt, cr1);
2278 __m128i rw = _mm_srai_epi16(rws, 4);
2279 __m128i bw = _mm_srai_epi16(bws, 4);
2280 __m128i gw = _mm_srai_epi16(gws, 4);
2282 /* back to byte, set up for transpose */
2283 __m128i brb = _mm_packus_epi16(rw, bw);
2284 __m128i gxb = _mm_packus_epi16(gw, xw);
2286 /* transpose to interleave channels */
2287 __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
2288 __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
2289 __m128i o0 = _mm_unpacklo_epi16(t0, t1);
2290 __m128i o1 = _mm_unpackhi_epi16(t0, t1);
2293 _mm_storeu_si128((__m128i *) (out + 0), o0);
2294 _mm_storeu_si128((__m128i *) (out + 16), o1);
2301 /* in this version, step=3 support would be easy to add. but is there demand? */
2304 /* this is a fairly straightforward implementation and not super-optimized. */
2305 uint8x8_t signflip = vdup_n_u8(0x80);
2306 int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f));
2307 int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
2308 int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
2309 int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f));
2311 for (; i+7 < count; i += 8)
2316 uint8x8_t y_bytes = vld1_u8(y + i);
2317 uint8x8_t cr_bytes = vld1_u8(pcr + i);
2318 uint8x8_t cb_bytes = vld1_u8(pcb + i);
2319 int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
2320 int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
2323 int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
2324 int16x8_t crw = vshll_n_s8(cr_biased, 7);
2325 int16x8_t cbw = vshll_n_s8(cb_biased, 7);
2327 /* color transform */
2328 int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
2329 int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
2330 int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
2331 int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
2332 int16x8_t rws = vaddq_s16(yws, cr0);
2333 int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
2334 int16x8_t bws = vaddq_s16(yws, cb1);
2336 /* undo scaling, round, convert to byte */
2337 o.val[0] = vqrshrun_n_s16(rws, 4);
2338 o.val[1] = vqrshrun_n_s16(gws, 4);
2339 o.val[2] = vqrshrun_n_s16(bws, 4);
2340 o.val[3] = vdup_n_u8(255);
2342 /* store, interleaving r/g/b/a */
2349 for (; i < count; ++i)
2351 int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
2352 int cr = pcr[i] - 128;
2353 int cb = pcb[i] - 128;
2354 int r = y_fixed + cr* FLOAT2FIXED(1.40200f);
2355 int g = y_fixed + cr*-FLOAT2FIXED(0.71414f) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
2356 int b = y_fixed + cb* FLOAT2FIXED(1.77200f);
2360 if ((unsigned) r > 255)
2362 if ((unsigned) g > 255)
2364 if ((unsigned) b > 255)
2366 out[0] = (uint8_t)r;
2367 out[1] = (uint8_t)g;
2368 out[2] = (uint8_t)b;
2375 /* set up the kernels */
2376 static void rjpeg_setup_jpeg(rjpeg_jpeg *j)
2378 uint64_t mask = cpu_features_get();
2382 j->idct_block_kernel = rjpeg_idct_block;
2383 j->YCbCr_to_RGB_kernel = rjpeg_YCbCr_to_RGB_row;
2384 j->resample_row_hv_2_kernel = rjpeg_resample_row_hv_2;
2386 #if defined(__SSE2__)
2387 if (mask & RETRO_SIMD_SSE2)
2389 j->idct_block_kernel = rjpeg_idct_simd;
2390 j->YCbCr_to_RGB_kernel = rjpeg_YCbCr_to_RGB_simd;
2391 j->resample_row_hv_2_kernel = rjpeg_resample_row_hv_2_simd;
2396 j->idct_block_kernel = rjpeg_idct_simd;
2397 j->YCbCr_to_RGB_kernel = rjpeg_YCbCr_to_RGB_simd;
2398 j->resample_row_hv_2_kernel = rjpeg_resample_row_hv_2_simd;
2402 /* clean up the temporary component buffers */
2403 static void rjpeg_cleanup_jpeg(rjpeg_jpeg *j)
2406 for (i = 0; i < j->s->img_n; ++i)
2408 if (j->img_comp[i].raw_data)
2410 free(j->img_comp[i].raw_data);
2411 j->img_comp[i].raw_data = NULL;
2412 j->img_comp[i].data = NULL;
2415 if (j->img_comp[i].raw_coeff)
2417 free(j->img_comp[i].raw_coeff);
2418 j->img_comp[i].raw_coeff = 0;
2419 j->img_comp[i].coeff = 0;
2422 if (j->img_comp[i].linebuf)
2424 free(j->img_comp[i].linebuf);
2425 j->img_comp[i].linebuf = NULL;
2430 static uint8_t *rjpeg_load_jpeg_image(rjpeg_jpeg *z,
2431 unsigned *out_x, unsigned *out_y, int *comp, int req_comp)
2436 rjpeg_resample res_comp[4];
2437 uint8_t *coutput[4] = {0};
2438 uint8_t *output = NULL;
2441 /* load a jpeg image from whichever source, but leave in YCbCr format */
2442 if (!rjpeg_decode_jpeg_image(z))
2445 /* determine actual number of components to generate */
2446 n = req_comp ? req_comp : z->s->img_n;
2448 if (z->s->img_n == 3 && n < 3)
2451 decode_n = z->s->img_n;
2453 /* resample and color-convert */
2454 for (k = 0; k < decode_n; ++k)
2456 rjpeg_resample *r = &res_comp[k];
2458 /* allocate line buffer big enough for upsampling off the edges
2459 * with upsample factor of 4 */
2460 z->img_comp[k].linebuf = (uint8_t *) malloc(z->s->img_x + 3);
2461 if (!z->img_comp[k].linebuf)
2464 r->hs = z->img_h_max / z->img_comp[k].h;
2465 r->vs = z->img_v_max / z->img_comp[k].v;
2466 r->ystep = r->vs >> 1;
2467 r->w_lores = (z->s->img_x + r->hs-1) / r->hs;
2469 r->line0 = r->line1 = z->img_comp[k].data;
2470 r->resample = rjpeg_resample_row_generic;
2472 if (r->hs == 1 && r->vs == 1)
2473 r->resample = rjpeg_resample_row_1;
2474 else if (r->hs == 1 && r->vs == 2)
2475 r->resample = rjpeg_resample_row_v_2;
2476 else if (r->hs == 2 && r->vs == 1)
2477 r->resample = rjpeg_resample_row_h_2;
2478 else if (r->hs == 2 && r->vs == 2)
2479 r->resample = z->resample_row_hv_2_kernel;
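/* The hs/vs pairs follow directly from the sampling factors: in a typical
 * 4:2:0 file luma has h = v = 2 and chroma h = v = 1, so the chroma planes get
 * hs = vs = 2 and use the hv_2 kernel; 4:2:2 chroma gets hs = 2, vs = 1 (h_2);
 * 4:4:4 components all get hs = vs = 1 (row_1); anything else stays on the
 * generic nearest-neighbor resampler set above. */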
2482 /* can't error after this, so this is safe */
2483 output = (uint8_t *) malloc(n * z->s->img_x * z->s->img_y + 1);
2488 /* now go ahead and resample */
2489 for (j = 0; j < z->s->img_y; ++j)
2491 uint8_t *out = output + n * z->s->img_x * j;
2492 for (k = 0; k < decode_n; ++k)
2494 rjpeg_resample *r = &res_comp[k];
2495 int y_bot = r->ystep >= (r->vs >> 1);
2497 coutput[k] = r->resample(z->img_comp[k].linebuf,
2498 y_bot ? r->line1 : r->line0,
2499 y_bot ? r->line0 : r->line1,
2502 if (++r->ystep >= r->vs)
2505 r->line0 = r->line1;
2506 if (++r->ypos < z->img_comp[k].y)
2507 r->line1 += z->img_comp[k].w2;
2513 uint8_t *y = coutput[0];
2516 if (z->s->img_n == 3)
2517 z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
2519 for (i = 0; i < z->s->img_x; ++i)
2521 out[0] = out[1] = out[2] = y[i];
2522 out[3] = 255; /* not used if n==3 */
2529 uint8_t *y = coutput[0];
2531 for (i = 0; i < z->s->img_x; ++i)
2534 for (i = 0; i < z->s->img_x; ++i)
2542 rjpeg_cleanup_jpeg(z);
2543 *out_x = z->s->img_x;
2544 *out_y = z->s->img_y;
2547 *comp = z->s->img_n; /* report original components, not output */
2551 rjpeg_cleanup_jpeg(z);
2555 int rjpeg_process_image(rjpeg_t *rjpeg, void **buf_data,
2556 size_t size, unsigned *width, unsigned *height)
2561 uint32_t *img = NULL;
2562 uint32_t *pixels = NULL;
2563 unsigned size_tex = 0;
2566 return IMAGE_PROCESS_ERROR;
2568 s.img_buffer = (uint8_t*)rjpeg->buff_data;
2569 s.img_buffer_original = (uint8_t*)rjpeg->buff_data;
2570 s.img_buffer_end = (uint8_t*)rjpeg->buff_data + (int)size;
2574 rjpeg_setup_jpeg(&j);
2576 img = (uint32_t*)rjpeg_load_jpeg_image(&j, width, height, &comp, 4);
2579 return IMAGE_PROCESS_ERROR;
2581 size_tex = (*width) * (*height);
2582 pixels = (uint32_t*)malloc(size_tex * sizeof(uint32_t));
2587 return IMAGE_PROCESS_ERROR;
2592 /* Convert RGBA to ARGB */
2595 unsigned int texel = img[size_tex];
2596 unsigned int A = texel & 0xFF000000;
2597 unsigned int B = texel & 0x00FF0000;
2598 unsigned int G = texel & 0x0000FF00;
2599 unsigned int R = texel & 0x000000FF;
2600 ((unsigned int*)pixels)[size_tex] = A | (R << 16) | G | (B >> 16);
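/* Example of the repack above: a texel of 0xFF80C010 (A = FF, B = 80, G = C0,
 * R = 10 under the masks used here) becomes
 * 0xFF000000 | (0x10 << 16) | 0x0000C000 | (0x00800000 >> 16) = 0xFF10C080,
 * i.e. the R and B bytes swap places while A and G stay where they are. */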
2605 return IMAGE_PROCESS_END;
2608 bool rjpeg_set_buf_ptr(rjpeg_t *rjpeg, void *data)
2613 rjpeg->buff_data = (uint8_t*)data;
2618 void rjpeg_free(rjpeg_t *rjpeg)
2626 rjpeg_t *rjpeg_alloc(void)
2628 rjpeg_t *rjpeg = (rjpeg_t*)calloc(1, sizeof(*rjpeg));
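/* Minimal usage sketch of the public entry points, for illustration only.
 * decode_jpeg_to_argb() is a hypothetical caller; it assumes the JPEG file is
 * already in memory and that rjpeg_process_image() hands back the converted
 * ARGB32 pixels through its buf_data argument, as the conversion loop above
 * suggests. */
#if 0
static uint32_t *decode_jpeg_to_argb(const void *jpeg_bytes, size_t jpeg_len,
      unsigned *width, unsigned *height)
{
   void    *pixels = NULL;
   rjpeg_t *rjpeg  = rjpeg_alloc();

   if (!rjpeg)
      return NULL;

   /* Point the decoder at the in-memory file, then decode and convert. */
   rjpeg_set_buf_ptr(rjpeg, (void*)jpeg_bytes);

   if (rjpeg_process_image(rjpeg, &pixels, jpeg_len,
            width, height) != IMAGE_PROCESS_END)
   {
      rjpeg_free(rjpeg);
      return NULL;
   }

   rjpeg_free(rjpeg);
   return (uint32_t*)pixels; /* caller frees */
}
#endif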