git subrepo clone https://github.com/libretro/libretro-common.git deps/libretro-common
[pcsx_rearmed.git] / deps / libretro-common / formats / jpeg / rjpeg.c
CommitLineData
3719602c
PC
1/* Copyright (C) 2010-2020 The RetroArch team
2 *
3 * ---------------------------------------------------------------------------------------
4 * The following license statement only applies to this file (rjpeg.c).
5 * ---------------------------------------------------------------------------------------
6 *
7 * Permission is hereby granted, free of charge,
8 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation the rights to
10 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23/* Modified version of stb_image's JPEG sources. */
24
25#include <stdint.h>
26#include <stdarg.h>
27#include <stddef.h> /* ptrdiff_t on osx */
28#include <stdlib.h>
29#include <string.h>
30
31#include <retro_inline.h>
32#include <boolean.h>
33#include <formats/image.h>
34#include <formats/rjpeg.h>
35#include <features/features_cpu.h>
36
/* Component-count selectors. RJPEG_DEFAULT is only used
 * as a requested-component value (req_comp). */
enum
{
   RJPEG_DEFAULT    = 0,
   RJPEG_GREY       = 1,
   RJPEG_GREY_ALPHA = 2,
   RJPEG_RGB        = 3,
   RJPEG_RGB_ALPHA  = 4
};
45
/* Scan mode selectors. */
enum
{
   RJPEG_SCAN_LOAD   = 0,
   RJPEG_SCAN_TYPE   = 1,
   RJPEG_SCAN_HEADER = 2
};
52
/* Function pointer type stored in rjpeg_resample.resample.
 * NOTE(review): parameter meanings are inferred from their names only -
 * 'out' destination row, 'in0'/'in1' two source rows, 'w' width in
 * samples, 'hs' horizontal expansion factor; the returned pointer is
 * presumably the row to use. Confirm against the implementations. */
typedef uint8_t *(*rjpeg_resample_row_func)(uint8_t *out, uint8_t *in0, uint8_t *in1,
      int w, int hs);
55
/* Per-component resampling state. */
typedef struct
{
   rjpeg_resample_row_func resample; /* row resampling routine */
   uint8_t *line0;                   /* NOTE(review): presumably the two source rows */
   uint8_t *line1;                   /* handed to 'resample' - confirm */
   int hs,vs;   /* expansion factor in each axis */
   int w_lores; /* horizontal pixels pre-expansion */
   int ystep;   /* how far through vertical expansion we are */
   int ypos;    /* which pre-expansion row we're on */
} rjpeg_resample;
66
/* Decoder handle.
 * NOTE(review): buff_data is presumably the encoded input buffer supplied
 * by the caller; its ownership is not visible here - confirm. */
struct rjpeg
{
   uint8_t *buff_data;
};
71
/* RJPEG_LROT(x,y): rotate the 32-bit unsigned value x left by y bits.
 *
 * MSVC provides an intrinsic for this. The portable form masks the
 * right-shift count so that y == 0 does not shift by the full width of
 * the type (undefined behavior in C); for y in 1..31 the result is
 * unchanged. Both arguments are evaluated more than once. */
#ifdef _MSC_VER
#define RJPEG_HAS_LROTL
#endif

#ifdef RJPEG_HAS_LROTL
   #define RJPEG_LROT(x,y)  _lrotl(x,y)
#else
   #define RJPEG_LROT(x,y)  (((x) << (y)) | ((x) >> ((32 - (y)) & 31)))
#endif
81
82/* x86/x64 detection */
83#if defined(__x86_64__) || defined(_M_X64)
84#define RJPEG_X64_TARGET
85#elif defined(__i386) || defined(_M_IX86)
86#define RJPEG_X86_TARGET
87#endif
88
89#if defined(__GNUC__) && (defined(RJPEG_X86_TARGET) || defined(RJPEG_X64_TARGET)) && !defined(__SSE2__) && !defined(RJPEG_NO_SIMD)
/* NOTE: it is not clear whether we actually need this for the 64-bit path.
91 * gcc doesn't support sse2 intrinsics unless you compile with -msse2,
92 * (but compiling with -msse2 allows the compiler to use SSE2 everywhere;
93 * this is just broken and gcc are jerks for not fixing it properly
94 * http://www.virtualdub.org/blog/pivot/entry.php?id=363 )
95 */
96#define RJPEG_NO_SIMD
97#endif
98
99#if defined(__MINGW32__) && defined(RJPEG_X86_TARGET) && !defined(RJPEG_MINGW_ENABLE_SSE2) && !defined(RJPEG_NO_SIMD)
100/* Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid RJPEG_X64_TARGET
101 *
102 * 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
103 * Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
104 * As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
105 * simultaneously enabling "-mstackrealign".
106 *
107 * See https://github.com/nothings/stb/issues/81 for more information.
108 *
109 * So default to no SSE2 on 32-bit MinGW. If you've read this far and added
110 * -mstackrealign to your build settings, feel free to #define RJPEG_MINGW_ENABLE_SSE2.
111 */
112#define RJPEG_NO_SIMD
113#endif
114
115#if defined(__SSE2__)
116#include <emmintrin.h>
117
118#ifdef _MSC_VER
119#define RJPEG_SIMD_ALIGN(type, name) __declspec(align(16)) type name
120#else
121#define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
122#endif
123
124#endif
125
126/* ARM NEON */
127#if defined(RJPEG_NO_SIMD) && defined(RJPEG_NEON)
128#undef RJPEG_NEON
129#endif
130
131#ifdef RJPEG_NEON
132#include <arm_neon.h>
133/* assume GCC or Clang on ARM targets */
134#define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
135#endif
136
137#ifndef RJPEG_SIMD_ALIGN
138#define RJPEG_SIMD_ALIGN(type, name) type name
139#endif
140
/* Input stream state. rjpeg_get8() consumes bytes through img_buffer
 * until it reaches img_buffer_end. */
typedef struct
{
   uint8_t *img_buffer;          /* read cursor: next byte returned by rjpeg_get8() */
   uint8_t *img_buffer_end;      /* one past the last readable byte */
   uint8_t *img_buffer_original; /* NOTE(review): presumably the start of the input - confirm */
   int img_n;                    /* NOTE(review): presumably components in the file - confirm */
   int img_out_n;                /* NOTE(review): presumably components in the output - confirm */
   int buflen;
   uint32_t img_x;               /* NOTE(review): presumably image width in pixels - confirm */
   uint32_t img_y;               /* NOTE(review): presumably image height in pixels - confirm */
   uint8_t buffer_start[128];
} rjpeg_context;
153
154static INLINE uint8_t rjpeg_get8(rjpeg_context *s)
155{
156 if (s->img_buffer < s->img_buffer_end)
157 return *s->img_buffer++;
158
159 return 0;
160}
161
162#define RJPEG_AT_EOF(s) ((s)->img_buffer >= (s)->img_buffer_end)
163
164#define RJPEG_GET16BE(s) ((rjpeg_get8((s)) << 8) + rjpeg_get8((s)))
165
166/* huffman decoding acceleration */
167#define FAST_BITS 9 /* larger handles more cases; smaller stomps less cache */
168
/* Decoding tables for one Huffman table.
 * Symbols are numbered 0..k-1 in order of increasing code length;
 * rjpeg_build_huffman() fills everything except 'values'. */
typedef struct
{
   /* [1..16]: largest code + 1 of that length, preshifted to 16 bits;
    * [17]: 0xffffffff sentinel that terminates the length search */
   unsigned int maxcode[18];
   int delta[17]; /* old 'firstsymbol' - old 'firstcode' */
   /* weirdly, repacking this into AoS is a 10% speed loss, instead of a win */
   uint16_t code[256];           /* code assigned to each symbol index */
   uint8_t fast[1 << FAST_BITS]; /* symbol index for each FAST_BITS-bit prefix, 255 = not accelerated */
   uint8_t values[256];          /* symbol value for each symbol index */
   uint8_t size[257];            /* code length for each symbol index, 0-terminated */
} rjpeg_huffman;
179
/* Complete decoder state: input stream, processing kernels, per-component
 * data, entropy-decoder bit buffer and the Huffman/dequantization tables. */
typedef struct
{
   rjpeg_context *s; /* input stream the entropy decoder pulls bytes from */
   /* kernels */
   void (*idct_block_kernel)(uint8_t *out, int out_stride, short data[64]);
   void (*YCbCr_to_RGB_kernel)(uint8_t *out, const uint8_t *y, const uint8_t *pcb,
         const uint8_t *pcr, int count, int step);
   uint8_t *(*resample_row_hv_2_kernel)(uint8_t *out, uint8_t *in_near,
         uint8_t *in_far, int w, int hs);

   /* definition of jpeg image component */
   struct
   {
      uint8_t *data;
      void *raw_data, *raw_coeff;
      uint8_t *linebuf;
      short *coeff;   /* progressive only */
      int id;
      int h,v;
      int tq;
      int hd,ha;
      int dc_pred;    /* running DC predictor, updated by the block decoders */

      int x,y,w2,h2;
      int coeff_w;    /* number of 8x8 coefficient blocks */
      int coeff_h;    /* number of 8x8 coefficient blocks */
   } img_comp[4];

   /* sizes for components, interleaved MCUs */
   int img_h_max, img_v_max;
   int img_mcu_x, img_mcu_y;
   int img_mcu_w, img_mcu_h;

   int code_bits;            /* number of valid bits */
   int nomore;               /* flag if we saw a marker so must stop */
   int progressive;
   int spec_start;           /* first zigzag index of the current progressive scan */
   int spec_end;             /* last zigzag index of the current progressive scan */
   int succ_high;            /* 0 selects the first pass, non-zero the refinement pass */
   int succ_low;             /* bit position applied to coefficients in progressive scans */
   int eob_run;              /* remaining blocks covered by an end-of-band run */
   int scan_n, order[4];
   int restart_interval, todo;
   uint32_t code_buffer;     /* jpeg entropy-coded buffer */
   rjpeg_huffman huff_dc[4]; /* unsigned int alignment */
   rjpeg_huffman huff_ac[4]; /* unsigned int alignment */
   int16_t fast_ac[4][1 << FAST_BITS];
   unsigned char marker;     /* marker seen while filling entropy buffer */
   uint8_t dequant[4][64];
} rjpeg_jpeg;
230
/* Convert a floating-point constant to 12-bit fixed point (rounded). */
#define RJPEG_F2F(x) ((int) (((x) * 4096 + 0.5)))
/* Scale an integer to 12-bit fixed point. A multiplication is used rather
 * than "<< 12" because left-shifting a negative value is undefined
 * behavior in C; the result is the same for every value that fits. */
#define RJPEG_FSH(x) ((x) * 4096)
233
/* "No marker" value: 0xff is never a valid marker value. */
#define RJPEG_MARKER_NONE 0xff
/* NOTE(review): the next two comments describe a marker-reading routine and
 * the scan component order; the code they refer to does not directly
 * follow them here.
 *
 * If there's a pending marker from the entropy stream, return that;
 * otherwise, fetch from the stream and get a marker. If there's no
 * marker, return 0xff, which is never a valid marker value.
 */

/* In each scan, we'll have scan_n components, and the order
 * of the components is specified by order[].
 */
/* True for the restart markers 0xd0..0xd7. */
#define RJPEG_RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7)

/* Marker prefix byte and the marker codes referenced by name. */
#define JPEG_MARKER 0xFF
#define JPEG_MARKER_SOI 0xD8
#define JPEG_MARKER_SOS 0xDA
#define JPEG_MARKER_EOI 0xD9
#define JPEG_MARKER_APP1 0xE1
#define JPEG_MARKER_APP2 0xE2

/* use comparisons since in some cases we handle more than one case (e.g. SOF) */
#define RJPEG_SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)

/* True for marker code 0xc2 only. */
#define RJPEG_SOF_PROGRESSIVE(x) ((x) == 0xc2)
/* Divide by 4 / by 16 with a right shift and truncate to 8 bits. */
#define RJPEG_DIV4(x) ((uint8_t) ((x) >> 2))
#define RJPEG_DIV16(x) ((uint8_t) ((x) >> 4))
258
259static int rjpeg_build_huffman(rjpeg_huffman *h, int *count)
260{
261 int i,j,k = 0,code;
262
263 /* build size list for each symbol (from JPEG spec) */
264 for (i = 0; i < 16; ++i)
265 for (j = 0; j < count[i]; ++j)
266 h->size[k++] = (uint8_t) (i+1);
267
268 h->size[k] = 0;
269 /* compute actual symbols (from jpeg spec) */
270 code = 0;
271 k = 0;
272
273 for (j = 1; j <= 16; ++j)
274 {
275 /* compute delta to add to code to compute symbol id */
276 h->delta[j] = k - code;
277 if (h->size[k] == j)
278 {
279 while (h->size[k] == j)
280 h->code[k++] = (uint16_t) (code++);
281
282 /* Bad code lengths, corrupt JPEG? */
283 if (code-1 >= (1 << j))
284 return 0;
285 }
286 /* compute largest code + 1 for this size, preshifted as needed later */
287 h->maxcode[j] = code << (16-j);
288 code <<= 1;
289 }
290 h->maxcode[j] = 0xffffffff;
291
292 /* build non-spec acceleration table; 255 is flag for not-accelerated */
293 memset(h->fast, 255, 1 << FAST_BITS);
294 for (i = 0; i < k; ++i)
295 {
296 int s = h->size[i];
297 if (s <= FAST_BITS)
298 {
299 int c = h->code[i] << (FAST_BITS-s);
300 int m = 1 << (FAST_BITS-s);
301 for (j = 0; j < m; ++j)
302 h->fast[c+j] = (uint8_t) i;
303 }
304 }
305 return 1;
306}
307
308/* build a table that decodes both magnitude and value of small ACs in
309 * one go. */
310static void rjpeg_build_fast_ac(int16_t *fast_ac, rjpeg_huffman *h)
311{
312 int i;
313
314 for (i = 0; i < (1 << FAST_BITS); ++i)
315 {
316 uint8_t fast = h->fast[i];
317
318 fast_ac[i] = 0;
319
320 if (fast < 255)
321 {
322 int rs = h->values[fast];
323 int run = (rs >> 4) & 15;
324 int magbits = rs & 15;
325 int len = h->size[fast];
326
327 if (magbits && len + magbits <= FAST_BITS)
328 {
329 /* magnitude code followed by receive_extend code */
330 int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
331 int m = 1 << (magbits - 1);
332 if (k < m)
333 k += (-1 << magbits) + 1;
334
335 /* if the result is small enough, we can fit it in fast_ac table */
336 if (k >= -128 && k <= 127)
337 fast_ac[i] = (int16_t) ((k << 8) + (run << 4) + (len + magbits));
338 }
339 }
340 }
341}
342
343static void rjpeg_grow_buffer_unsafe(rjpeg_jpeg *j)
344{
345 do
346 {
347 int b = j->nomore ? 0 : rjpeg_get8(j->s);
348 if (b == 0xff)
349 {
350 int c = rjpeg_get8(j->s);
351
352 if (c != 0)
353 {
354 j->marker = (unsigned char) c;
355 j->nomore = 1;
356 return;
357 }
358 }
359 j->code_buffer |= b << (24 - j->code_bits);
360 j->code_bits += 8;
361 } while (j->code_bits <= 24);
362}
363
/* rjpeg_bmask[n] = (1 << n) - 1, for n = 0..16. Read-only lookup table. */
static const uint32_t rjpeg_bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
366
367/* decode a JPEG huffman value from the bitstream */
368static INLINE int rjpeg_jpeg_huff_decode(rjpeg_jpeg *j, rjpeg_huffman *h)
369{
370 unsigned int temp;
371 int c,k;
372
373 if (j->code_bits < 16)
374 rjpeg_grow_buffer_unsafe(j);
375
376 /* look at the top FAST_BITS and determine what symbol ID it is,
377 * if the code is <= FAST_BITS */
378 c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
379 k = h->fast[c];
380
381 if (k < 255)
382 {
383 int s = h->size[k];
384 if (s > j->code_bits)
385 return -1;
386 j->code_buffer <<= s;
387 j->code_bits -= s;
388 return h->values[k];
389 }
390
391 /* naive test is to shift the code_buffer down so k bits are
392 * valid, then test against maxcode. To speed this up, we've
393 * preshifted maxcode left so that it has (16-k) 0s at the
394 * end; in other words, regardless of the number of bits, it
395 * wants to be compared against something shifted to have 16;
396 * that way we don't need to shift inside the loop. */
397 temp = j->code_buffer >> 16;
398 for (k=FAST_BITS+1 ; ; ++k)
399 if (temp < h->maxcode[k])
400 break;
401
402 if (k == 17)
403 {
404 /* error! code not found */
405 j->code_bits -= 16;
406 return -1;
407 }
408
409 if (k > j->code_bits)
410 return -1;
411
412 /* convert the huffman code to the symbol id */
413 c = ((j->code_buffer >> (32 - k)) & rjpeg_bmask[k]) + h->delta[k];
414
415 /* convert the id to a symbol */
416 j->code_bits -= k;
417 j->code_buffer <<= k;
418 return h->values[c];
419}
420
/* rjpeg_jbias[n] = (-1 << n) + 1, i.e. 1 - 2^n, for n = 0..15 */
static const int rjpeg_jbias[16] =
{
        0,     -1,     -3,     -7,
      -15,    -31,    -63,   -127,
     -255,   -511,  -1023,  -2047,
    -4095,  -8191, -16383, -32767
};
423
424/* combined JPEG 'receive' and JPEG 'extend', since baseline
425 * always extends everything it receives. */
426static INLINE int rjpeg_extend_receive(rjpeg_jpeg *j, int n)
427{
428 unsigned int k;
429 int sgn;
430 if (j->code_bits < n)
431 rjpeg_grow_buffer_unsafe(j);
432
433 sgn = (int32_t)j->code_buffer >> 31; /* sign bit is always in MSB */
434 k = RJPEG_LROT(j->code_buffer, n);
435 j->code_buffer = k & ~rjpeg_bmask[n];
436 k &= rjpeg_bmask[n];
437 j->code_bits -= n;
438 return k + (rjpeg_jbias[n] & ~sgn);
439}
440
441/* get some unsigned bits */
442static INLINE int rjpeg_jpeg_get_bits(rjpeg_jpeg *j, int n)
443{
444 unsigned int k;
445 if (j->code_bits < n)
446 rjpeg_grow_buffer_unsafe(j);
447 k = RJPEG_LROT(j->code_buffer, n);
448 j->code_buffer = k & ~rjpeg_bmask[n];
449 k &= rjpeg_bmask[n];
450 j->code_bits -= n;
451 return k;
452}
453
454static INLINE int rjpeg_jpeg_get_bit(rjpeg_jpeg *j)
455{
456 unsigned int k;
457 if (j->code_bits < 1)
458 rjpeg_grow_buffer_unsafe(j);
459
460 k = j->code_buffer;
461 j->code_buffer <<= 1;
462 --j->code_bits;
463 return k & 0x80000000;
464}
465
466/* given a value that's at position X in the zigzag stream,
467 * where does it appear in the 8x8 matrix coded as row-major? */
468static uint8_t rjpeg_jpeg_dezigzag[64+15] =
469{
470 0, 1, 8, 16, 9, 2, 3, 10,
471 17, 24, 32, 25, 18, 11, 4, 5,
472 12, 19, 26, 33, 40, 48, 41, 34,
473 27, 20, 13, 6, 7, 14, 21, 28,
474 35, 42, 49, 56, 57, 50, 43, 36,
475 29, 22, 15, 23, 30, 37, 44, 51,
476 58, 59, 52, 45, 38, 31, 39, 46,
477 53, 60, 61, 54, 47, 55, 62, 63,
478 /* let corrupt input sample past end */
479 63, 63, 63, 63, 63, 63, 63, 63,
480 63, 63, 63, 63, 63, 63, 63
481};
482
483/* decode one 64-entry block-- */
484static int rjpeg_jpeg_decode_block(
485 rjpeg_jpeg *j, short data[64],
486 rjpeg_huffman *hdc,
487 rjpeg_huffman *hac,
488 int16_t *fac,
489 int b,
490 uint8_t *dequant)
491{
492 int dc,k;
493 int t;
494 int diff = 0;
495
496 if (j->code_bits < 16)
497 rjpeg_grow_buffer_unsafe(j);
498 t = rjpeg_jpeg_huff_decode(j, hdc);
499
500 /* Bad huffman code. Corrupt JPEG? */
501 if (t < 0)
502 return 0;
503
504 /* 0 all the ac values now so we can do it 32-bits at a time */
505 memset(data,0,64*sizeof(data[0]));
506
507 if (t)
508 diff = rjpeg_extend_receive(j, t);
509 dc = j->img_comp[b].dc_pred + diff;
510 j->img_comp[b].dc_pred = dc;
511 data[0] = (short) (dc * dequant[0]);
512
513 /* decode AC components, see JPEG spec */
514 k = 1;
515 do
516 {
517 unsigned int zig;
518 int c,r,s;
519 if (j->code_bits < 16)
520 rjpeg_grow_buffer_unsafe(j);
521 c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
522 r = fac[c];
523 if (r)
524 {
525 /* fast-AC path */
526 k += (r >> 4) & 15; /* run */
527 s = r & 15; /* combined length */
528 j->code_buffer <<= s;
529 j->code_bits -= s;
530 /* decode into unzigzag'd location */
531 zig = rjpeg_jpeg_dezigzag[k++];
532 data[zig] = (short) ((r >> 8) * dequant[zig]);
533 }
534 else
535 {
536 int rs = rjpeg_jpeg_huff_decode(j, hac);
537
538 /* Bad huffman code. Corrupt JPEG? */
539 if (rs < 0)
540 return 0;
541
542 s = rs & 15;
543 r = rs >> 4;
544 if (s == 0)
545 {
546 if (rs != 0xf0)
547 break; /* end block */
548 k += 16;
549 }
550 else
551 {
552 k += r;
553 /* decode into unzigzag'd location */
554 zig = rjpeg_jpeg_dezigzag[k++];
555 data[zig] = (short) (rjpeg_extend_receive(j,s) * dequant[zig]);
556 }
557 }
558 } while (k < 64);
559 return 1;
560}
561
562static int rjpeg_jpeg_decode_block_prog_dc(
563 rjpeg_jpeg *j,
564 short data[64],
565 rjpeg_huffman *hdc,
566 int b)
567{
568 /* Can't merge DC and AC. Corrupt JPEG? */
569 if (j->spec_end != 0)
570 return 0;
571
572 if (j->code_bits < 16)
573 rjpeg_grow_buffer_unsafe(j);
574
575 if (j->succ_high == 0)
576 {
577 int t;
578 int dc;
579 int diff = 0;
580
581 /* first scan for DC coefficient, must be first */
582 memset(data,0,64*sizeof(data[0])); /* 0 all the ac values now */
583 t = rjpeg_jpeg_huff_decode(j, hdc);
584 if (t)
585 diff = rjpeg_extend_receive(j, t);
586
587 dc = j->img_comp[b].dc_pred + diff;
588 j->img_comp[b].dc_pred = dc;
589 data[0] = (short) (dc << j->succ_low);
590 }
591 else
592 {
593 /* refinement scan for DC coefficient */
594 if (rjpeg_jpeg_get_bit(j))
595 data[0] += (short) (1 << j->succ_low);
596 }
597 return 1;
598}
599
/* Decode the AC coefficients of one block in a progressive scan.
 *
 * j     decoder state; uses spec_start/spec_end (zigzag range of this
 *       scan), succ_high/succ_low and eob_run
 * data  the block's 64 coefficients, updated in place
 * hac   Huffman table for the AC coefficients
 * fac   fast-AC table built from 'hac'
 *
 * succ_high == 0 selects the first pass over the range, anything else the
 * refinement pass that adds one more bit to already-decoded coefficients.
 *
 * Returns 1 on success, 0 when the scan parameters or a Huffman code are
 * invalid (corrupt JPEG). */
static int rjpeg_jpeg_decode_block_prog_ac(
      rjpeg_jpeg *j,
      short data[64],
      rjpeg_huffman *hac,
      int16_t *fac)
{
   int k;

   /* Can't merge DC and AC. Corrupt JPEG? */
   if (j->spec_start == 0)
      return 0;

   if (j->succ_high == 0)
   {
      int shift = j->succ_low;

      /* this block is covered by a pending end-of-band run: nothing to decode */
      if (j->eob_run)
      {
         --j->eob_run;
         return 1;
      }

      k = j->spec_start;
      do
      {
         unsigned int zig;
         int c,r,s;
         if (j->code_bits < 16)
            rjpeg_grow_buffer_unsafe(j);
         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
         r = fac[c];
         if (r)
         {
            /* fast-AC path */
            k += (r >> 4) & 15; /* run */
            s = r & 15; /* combined length */
            j->code_buffer <<= s;
            j->code_bits -= s;
            zig = rjpeg_jpeg_dezigzag[k++];
            data[zig] = (short) ((r >> 8) << shift);
         }
         else
         {
            int rs = rjpeg_jpeg_huff_decode(j, hac);

            /* Bad huffman code. Corrupt JPEG? */
            if (rs < 0)
               return 0;

            s = rs & 15;
            r = rs >> 4;
            if (s == 0)
            {
               if (r < 15)
               {
                  /* end-of-band run of (1 << r) + extra bits blocks;
                   * this block counts as the first of them */
                  j->eob_run = (1 << r);
                  if (r)
                     j->eob_run += rjpeg_jpeg_get_bits(j, r);
                  --j->eob_run;
                  break;
               }
               /* r == 15, s == 0: skip 16 zero coefficients */
               k += 16;
            }
            else
            {
               k += r;
               zig = rjpeg_jpeg_dezigzag[k++];
               data[zig] = (short) (rjpeg_extend_receive(j,s) << shift);
            }
         }
      } while (k <= j->spec_end);
   }
   else
   {
      /* refinement scan for these AC coefficients */

      short bit = (short) (1 << j->succ_low);

      if (j->eob_run)
      {
         /* inside an end-of-band run: only already non-zero
          * coefficients receive a correction bit */
         --j->eob_run;
         for (k = j->spec_start; k <= j->spec_end; ++k)
         {
            short *p = &data[rjpeg_jpeg_dezigzag[k]];
            if (*p != 0)
               if (rjpeg_jpeg_get_bit(j))
                  if ((*p & bit) == 0)
                  {
                     if (*p > 0)
                        *p += bit;
                     else
                        *p -= bit;
                  }
         }
      }
      else
      {
         k = j->spec_start;
         do
         {
            int r,s;
            int rs = rjpeg_jpeg_huff_decode(j, hac);

            /* Bad huffman code. Corrupt JPEG? */
            if (rs < 0)
               return 0;

            s = rs & 15;
            r = rs >> 4;
            if (s == 0)
            {
               if (r < 15)
               {
                  j->eob_run = (1 << r) - 1;
                  if (r)
                     j->eob_run += rjpeg_jpeg_get_bits(j, r);
                  r = 64; /* force end of block */
               }
               else
               {
                  /* r=15 s=0 should write 16 0s, so we just do
                   * a run of 15 0s and then write s (which is 0),
                   * so we don't have to do anything special here */
               }
            }
            else
            {
               /* Bad huffman code. Corrupt JPEG? */
               if (s != 1)
                  return 0;

               /* sign bit */
               if (rjpeg_jpeg_get_bit(j))
                  s = bit;
               else
                  s = -bit;
            }

            /* advance by r */
            while (k <= j->spec_end)
            {
               short *p = &data[rjpeg_jpeg_dezigzag[k++]];
               if (*p != 0)
               {
                  /* already non-zero: takes a correction bit and
                   * does not count towards the zero run */
                  if (rjpeg_jpeg_get_bit(j))
                     if ((*p & bit) == 0)
                     {
                        if (*p > 0)
                           *p += bit;
                        else
                           *p -= bit;
                     }
               }
               else
               {
                  /* zero coefficient: either the end of the run, where
                   * the new value is stored, or one more skipped zero */
                  if (r == 0)
                  {
                     *p = (short) s;
                     break;
                  }
                  --r;
               }
            }
         } while (k <= j->spec_end);
      }
   }
   return 1;
}
768
769/* take a -128..127 value and rjpeg_clamp it and convert to 0..255 */
770static INLINE uint8_t rjpeg_clamp(int x)
771{
772 /* trick to use a single test to catch both cases */
773 if ((unsigned int) x > 255)
774 return 255;
775 return (uint8_t) x;
776}
777
/* derived from jidctint -- DCT_ISLOW
 *
 * One 8-point 1D inverse DCT in 12-bit fixed point. s0..s7 are the eight
 * inputs. The macro DECLARES locals in the enclosing scope (t0..t3, p1..p5,
 * x0..x3), so it can be expanded only once per block scope. On exit x0..x3
 * hold the even-part terms and t0..t3 the odd-part terms; the caller
 * combines them as x0+t3, x0-t3, x1+t2, x1-t2, x2+t1, x2-t1, x3+t0, x3-t0
 * after adding its rounding bias. */
#define RJPEG_IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
   int t0,t1,p4,p5,x0,x1,x2,x3; \
   int p2 = s2; \
   int p3 = s6; \
   int p1 = (p2+p3) * RJPEG_F2F(0.5411961f); \
   int t2 = p1 + p3 * RJPEG_F2F(-1.847759065f);\
   int t3 = p1 + p2 * RJPEG_F2F( 0.765366865f);\
   p2 = s0; \
   p3 = s4; \
   t0 = RJPEG_FSH(p2+p3); \
   t1 = RJPEG_FSH(p2-p3); \
   x0 = t0+t3; \
   x3 = t0-t3; \
   x1 = t1+t2; \
   x2 = t1-t2; \
   t0 = s7; \
   t1 = s5; \
   t2 = s3; \
   t3 = s1; \
   p3 = t0+t2; \
   p4 = t1+t3; \
   p1 = t0+t3; \
   p2 = t1+t2; \
   p5 = (p3+p4) * RJPEG_F2F( 1.175875602f); \
   t0 = t0 * RJPEG_F2F( 0.298631336f); \
   t1 = t1 * RJPEG_F2F( 2.053119869f); \
   t2 = t2 * RJPEG_F2F( 3.072711026f); \
   t3 = t3 * RJPEG_F2F( 1.501321110f); \
   p1 = p5 + p1 * RJPEG_F2F(-0.899976223f); \
   p2 = p5 + p2 * RJPEG_F2F(-2.562915447f); \
   p3 = p3 * RJPEG_F2F(-1.961570560f); \
   p4 = p4 * RJPEG_F2F(-0.390180644f); \
   t3 += p1+p4; \
   t2 += p2+p3; \
   t1 += p2+p4; \
   t0 += p1+p3
815
816static void rjpeg_idct_block(uint8_t *out, int out_stride, short data[64])
817{
818 int i,val[64],*v=val;
819 uint8_t *o = NULL;
820 int16_t *d = data;
821
822 /* columns */
823 for (i = 0; i < 8; ++i,++d, ++v)
824 {
825 /* if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing */
826 if ( d[ 8] == 0
827 && d[16] == 0
828 && d[24] == 0
829 && d[32] == 0
830 && d[40] == 0
831 && d[48] == 0
832 && d[56] == 0)
833 {
834 /* no shortcut 0 seconds
835 * (1|2|3|4|5|6|7)==0 0 seconds
836 * all separate -0.047 seconds
837 * 1 && 2|3 && 4|5 && 6|7: -0.047 seconds */
838 int dcterm = d[0] << 2;
839 v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
840 }
841 else
842 {
843 RJPEG_IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56]);
844
845 /* constants scaled things up by 1<<12; let's bring them back
846 * down, but keep 2 extra bits of precision */
847 x0 += 512;
848 x1 += 512;
849 x2 += 512;
850 x3 += 512;
851
852 v[ 0] = (x0+t3) >> 10;
853 v[56] = (x0-t3) >> 10;
854 v[ 8] = (x1+t2) >> 10;
855 v[48] = (x1-t2) >> 10;
856 v[16] = (x2+t1) >> 10;
857 v[40] = (x2-t1) >> 10;
858 v[24] = (x3+t0) >> 10;
859 v[32] = (x3-t0) >> 10;
860 }
861 }
862
863 for (i = 0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride)
864 {
865 /* no fast case since the first 1D IDCT spread components out */
866 RJPEG_IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7]);
867
868 /* constants scaled things up by 1<<12, plus we had 1<<2 from first
869 * loop, plus horizontal and vertical each scale by sqrt(8) so together
870 * we've got an extra 1<<3, so 1<<17 total we need to remove.
871 * so we want to round that, which means adding 0.5 * 1<<17,
872 * aka 65536. Also, we'll end up with -128 to 127 that we want
873 * to encode as 0..255 by adding 128, so we'll add that before the shift
874 */
875 x0 += 65536 + (128<<17);
876 x1 += 65536 + (128<<17);
877 x2 += 65536 + (128<<17);
878 x3 += 65536 + (128<<17);
879
880 /* Tried computing the shifts into temps, or'ing the temps to see
881 * if any were out of range, but that was slower */
882 o[0] = rjpeg_clamp((x0+t3) >> 17);
883 o[7] = rjpeg_clamp((x0-t3) >> 17);
884 o[1] = rjpeg_clamp((x1+t2) >> 17);
885 o[6] = rjpeg_clamp((x1-t2) >> 17);
886 o[2] = rjpeg_clamp((x2+t1) >> 17);
887 o[5] = rjpeg_clamp((x2-t1) >> 17);
888 o[3] = rjpeg_clamp((x3+t0) >> 17);
889 o[4] = rjpeg_clamp((x3-t0) >> 17);
890 }
891}
892
#if defined(__SSE2__)
/* sse2 integer IDCT. not the fastest possible implementation but it
 * produces bit-identical results to the generic C version so it's
 * fully "transparent".
 *
 * out         destination for 8 rows of 8 bytes
 * out_stride  distance in bytes between consecutive output rows
 * data        64 coefficients, row-major; read with aligned 128-bit loads,
 *             so it must be 16-byte aligned
 */
static void rjpeg_idct_simd(uint8_t *out, int out_stride, short data[64])
{
   /* This is constructed to match our regular (generic) integer IDCT exactly. */
   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
   __m128i tmp;

   /* dot product constant: even elems=x, odd elems=y */
   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))

   /* out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
    * out(1) = c1[even]*x + c1[odd]*y
    */
   #define dct_rot(out0,out1, x,y,c0,c1) \
      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)

   /* out = in << 12  (in 16-bit, out 32-bit)
    * interleaving with zero puts 'in' in the upper 16 bits (<< 16);
    * the arithmetic shift right by 4 leaves a sign-correct << 12 */
   #define dct_widen(out, in) \
      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)

   /* wide add */
   #define dct_wadd(out, a, b) \
      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)

   /* wide sub */
   #define dct_wsub(out, a, b) \
      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)

   /* butterfly a/b, add bias, then shift by "s" and pack */
   #define dct_bfly32o(out0, out1, a,b,bias,s) \
      { \
         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
         dct_wadd(sum, abiased, b); \
         dct_wsub(dif, abiased, b); \
         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
      }

   /* 8-bit interleave step (for transposes) */
   #define dct_interleave8(a, b) \
      tmp = a; \
      a = _mm_unpacklo_epi8(a, b); \
      b = _mm_unpackhi_epi8(tmp, b)

   /* 16-bit interleave step (for transposes) */
   #define dct_interleave16(a, b) \
      tmp = a; \
      a = _mm_unpacklo_epi16(a, b); \
      b = _mm_unpackhi_epi16(tmp, b)

   /* one full 1D pass over row0..row7, results written back to row0..row7 */
   #define dct_pass(bias,shift) \
      { \
         /* even part */ \
         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
         __m128i sum04 = _mm_add_epi16(row0, row4); \
         __m128i dif04 = _mm_sub_epi16(row0, row4); \
         dct_widen(t0e, sum04); \
         dct_widen(t1e, dif04); \
         dct_wadd(x0, t0e, t3e); \
         dct_wsub(x3, t0e, t3e); \
         dct_wadd(x1, t1e, t2e); \
         dct_wsub(x2, t1e, t2e); \
         /* odd part */ \
         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
         __m128i sum17 = _mm_add_epi16(row1, row7); \
         __m128i sum35 = _mm_add_epi16(row3, row5); \
         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
         dct_wadd(x4, y0o, y4o); \
         dct_wadd(x5, y1o, y5o); \
         dct_wadd(x6, y2o, y5o); \
         dct_wadd(x7, y3o, y4o); \
         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
      }

   /* rotation constants, pre-combined for use with dct_rot */
   __m128i rot0_0 = dct_const(RJPEG_F2F(0.5411961f), RJPEG_F2F(0.5411961f) + RJPEG_F2F(-1.847759065f));
   __m128i rot0_1 = dct_const(RJPEG_F2F(0.5411961f) + RJPEG_F2F( 0.765366865f), RJPEG_F2F(0.5411961f));
   __m128i rot1_0 = dct_const(RJPEG_F2F(1.175875602f) + RJPEG_F2F(-0.899976223f), RJPEG_F2F(1.175875602f));
   __m128i rot1_1 = dct_const(RJPEG_F2F(1.175875602f), RJPEG_F2F(1.175875602f) + RJPEG_F2F(-2.562915447f));
   __m128i rot2_0 = dct_const(RJPEG_F2F(-1.961570560f) + RJPEG_F2F( 0.298631336f), RJPEG_F2F(-1.961570560f));
   __m128i rot2_1 = dct_const(RJPEG_F2F(-1.961570560f), RJPEG_F2F(-1.961570560f) + RJPEG_F2F( 3.072711026f));
   __m128i rot3_0 = dct_const(RJPEG_F2F(-0.390180644f) + RJPEG_F2F( 2.053119869f), RJPEG_F2F(-0.390180644f));
   __m128i rot3_1 = dct_const(RJPEG_F2F(-0.390180644f), RJPEG_F2F(-0.390180644f) + RJPEG_F2F( 1.501321110f));

   /* rounding biases in column/row passes, see rjpeg_idct_block for explanation. */
   __m128i bias_0 = _mm_set1_epi32(512);
   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));

   /* load */
   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
   row7 = _mm_load_si128((const __m128i *) (data + 7*8));

   /* column pass */
   dct_pass(bias_0, 10);

   {
      /* 16bit 8x8 transpose pass 1 */
      dct_interleave16(row0, row4);
      dct_interleave16(row1, row5);
      dct_interleave16(row2, row6);
      dct_interleave16(row3, row7);

      /* transpose pass 2 */
      dct_interleave16(row0, row2);
      dct_interleave16(row1, row3);
      dct_interleave16(row4, row6);
      dct_interleave16(row5, row7);

      /* transpose pass 3 */
      dct_interleave16(row0, row1);
      dct_interleave16(row2, row3);
      dct_interleave16(row4, row5);
      dct_interleave16(row6, row7);
   }

   /* row pass */
   dct_pass(bias_1, 17);

   {
      /* pack (unsigned saturation clamps each sample to 0..255) */
      __m128i p0 = _mm_packus_epi16(row0, row1); /* a0a1a2a3...a7b0b1b2b3...b7 */
      __m128i p1 = _mm_packus_epi16(row2, row3);
      __m128i p2 = _mm_packus_epi16(row4, row5);
      __m128i p3 = _mm_packus_epi16(row6, row7);

      /* 8bit 8x8 transpose pass 1 */
      dct_interleave8(p0, p2); /* a0e0a1e1... */
      dct_interleave8(p1, p3); /* c0g0c1g1... */

      /* transpose pass 2 */
      dct_interleave8(p0, p1); /* a0c0e0g0... */
      dct_interleave8(p2, p3); /* b0d0f0h0... */

      /* transpose pass 3 */
      dct_interleave8(p0, p2); /* a0b0c0d0... */
      dct_interleave8(p1, p3); /* a4b4c4d4... */

      /* store: each register holds two output rows, low half first;
       * the 0x4e shuffle swaps the halves to reach the second row */
      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
   }

#undef dct_const
#undef dct_rot
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_interleave8
#undef dct_interleave16
#undef dct_pass
}

#endif
1075
1076#ifdef RJPEG_NEON
1077
/* rjpeg_idct_simd (RJPEG_NEON build): 8x8 inverse DCT of one block.
 *
 * out        - destination of the first output row; 8 bytes are stored per
 *              row and the pointer is advanced by out_stride between rows.
 * out_stride - distance in bytes between consecutive output rows.
 * data       - 64 coefficients, loaded as eight vectors of eight 16-bit
 *              values (data + 0*8 ... data + 7*8).
 *
 * Sequence: load -> add 1024 to element 0 of the first vector -> column pass
 * (rounding narrowing shift by 10) -> 16-bit 8x8 transpose -> row pass
 * (narrowing shift by 16) -> saturating rounding narrow by 1 to unsigned
 * 8-bit -> 8-bit 8x8 transpose -> store.
 *
 * All dct_* helper macros are defined inside the function body and are
 * #undef'd again before the function ends. */
1078/* NEON integer IDCT. should produce bit-identical
1079 * results to the generic C version. */
1080static void rjpeg_idct_simd(uint8_t *out, int out_stride, short data[64])
1081{
1082 int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
1083
1084 int16x4_t rot0_0 = vdup_n_s16(RJPEG_F2F(0.5411961f));
1085 int16x4_t rot0_1 = vdup_n_s16(RJPEG_F2F(-1.847759065f));
1086 int16x4_t rot0_2 = vdup_n_s16(RJPEG_F2F( 0.765366865f));
1087 int16x4_t rot1_0 = vdup_n_s16(RJPEG_F2F( 1.175875602f));
1088 int16x4_t rot1_1 = vdup_n_s16(RJPEG_F2F(-0.899976223f));
1089 int16x4_t rot1_2 = vdup_n_s16(RJPEG_F2F(-2.562915447f));
1090 int16x4_t rot2_0 = vdup_n_s16(RJPEG_F2F(-1.961570560f));
1091 int16x4_t rot2_1 = vdup_n_s16(RJPEG_F2F(-0.390180644f));
1092 int16x4_t rot3_0 = vdup_n_s16(RJPEG_F2F( 0.298631336f));
1093 int16x4_t rot3_1 = vdup_n_s16(RJPEG_F2F( 2.053119869f));
1094 int16x4_t rot3_2 = vdup_n_s16(RJPEG_F2F( 3.072711026f));
1095 int16x4_t rot3_3 = vdup_n_s16(RJPEG_F2F( 1.501321110f));
1096
1097#define dct_long_mul(out, inq, coeff) \
1098 int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
1099 int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
1100
1101#define dct_long_mac(out, acc, inq, coeff) \
1102 int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
1103 int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
1104
1105#define dct_widen(out, inq) \
1106 int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
1107 int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
1108
1109/* wide add */
1110#define dct_wadd(out, a, b) \
1111 int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
1112 int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
1113
1114/* wide sub */
1115#define dct_wsub(out, a, b) \
1116 int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
1117 int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
1118
1119/* butterfly a/b, then shift using "shiftop" by "s" and pack */
1120#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
1121 { \
1122 dct_wadd(sum, a, b); \
1123 dct_wsub(dif, a, b); \
1124 out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
1125 out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
1126 }
1127
1128#define dct_pass(shiftop, shift) \
1129 { \
1130 /* even part */ \
1131 int16x8_t sum26 = vaddq_s16(row2, row6); \
1132 dct_long_mul(p1e, sum26, rot0_0); \
1133 dct_long_mac(t2e, p1e, row6, rot0_1); \
1134 dct_long_mac(t3e, p1e, row2, rot0_2); \
1135 int16x8_t sum04 = vaddq_s16(row0, row4); \
1136 int16x8_t dif04 = vsubq_s16(row0, row4); \
1137 dct_widen(t0e, sum04); \
1138 dct_widen(t1e, dif04); \
1139 dct_wadd(x0, t0e, t3e); \
1140 dct_wsub(x3, t0e, t3e); \
1141 dct_wadd(x1, t1e, t2e); \
1142 dct_wsub(x2, t1e, t2e); \
1143 /* odd part */ \
1144 int16x8_t sum15 = vaddq_s16(row1, row5); \
1145 int16x8_t sum17 = vaddq_s16(row1, row7); \
1146 int16x8_t sum35 = vaddq_s16(row3, row5); \
1147 int16x8_t sum37 = vaddq_s16(row3, row7); \
1148 int16x8_t sumodd = vaddq_s16(sum17, sum35); \
1149 dct_long_mul(p5o, sumodd, rot1_0); \
1150 dct_long_mac(p1o, p5o, sum17, rot1_1); \
1151 dct_long_mac(p2o, p5o, sum35, rot1_2); \
1152 dct_long_mul(p3o, sum37, rot2_0); \
1153 dct_long_mul(p4o, sum15, rot2_1); \
1154 dct_wadd(sump13o, p1o, p3o); \
1155 dct_wadd(sump24o, p2o, p4o); \
1156 dct_wadd(sump23o, p2o, p3o); \
1157 dct_wadd(sump14o, p1o, p4o); \
1158 dct_long_mac(x4, sump13o, row7, rot3_0); \
1159 dct_long_mac(x5, sump24o, row5, rot3_1); \
1160 dct_long_mac(x6, sump23o, row3, rot3_2); \
1161 dct_long_mac(x7, sump14o, row1, rot3_3); \
1162 dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
1163 dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
1164 dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
1165 dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
1166 }
1167
/* eight consecutive groups of eight coefficients, one vector each */
1168 /* load */
1169 row0 = vld1q_s16(data + 0*8);
1170 row1 = vld1q_s16(data + 1*8);
1171 row2 = vld1q_s16(data + 2*8);
1172 row3 = vld1q_s16(data + 3*8);
1173 row4 = vld1q_s16(data + 4*8);
1174 row5 = vld1q_s16(data + 5*8);
1175 row6 = vld1q_s16(data + 6*8);
1176 row7 = vld1q_s16(data + 7*8);
1177
/* only lane 0 of the added vector is non-zero, so just data[0] gets +1024 */
1178 /* add DC bias */
1179 row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
1180
1181 /* column pass */
1182 dct_pass(vrshrn_n_s32, 10);
1183
1184 /* 16bit 8x8 transpose */
1185 {
1186/* these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
1187 * whether compilers actually get this is another story, sadly. */
1188#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
1189#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
1190#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
1191
1192 /* pass 1 */
1193 dct_trn16(row0, row1); /* a0b0a2b2a4b4a6b6 */
1194 dct_trn16(row2, row3);
1195 dct_trn16(row4, row5);
1196 dct_trn16(row6, row7);
1197
1198 /* pass 2 */
1199 dct_trn32(row0, row2); /* a0b0c0d0a4b4c4d4 */
1200 dct_trn32(row1, row3);
1201 dct_trn32(row4, row6);
1202 dct_trn32(row5, row7);
1203
1204 /* pass 3 */
1205 dct_trn64(row0, row4); /* a0b0c0d0e0f0g0h0 */
1206 dct_trn64(row1, row5);
1207 dct_trn64(row2, row6);
1208 dct_trn64(row3, row7);
1209
1210#undef dct_trn16
1211#undef dct_trn32
1212#undef dct_trn64
1213 }
1214
1215 /* row pass
1216 * vrshrn_n_s32 only supports shifts up to 16, we need
1217 * 17. so do a non-rounding shift of 16 first then follow
1218 * up with a rounding shift by 1. */
1219 dct_pass(vshrn_n_s32, 16);
1220
1221 {
/* vqrshrun_n_s16(.., 1) supplies the final rounding shift by 1 and
 * saturates each 16-bit value into an unsigned 8-bit lane */
1222 /* pack and round */
1223 uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
1224 uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
1225 uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
1226 uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
1227 uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
1228 uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
1229 uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
1230 uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
1231
1232 /* again, these can translate into one instruction, but often don't. */
1233#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
1234#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
1235#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
1236
1237 /* sadly can't use interleaved stores here since we only write
1238 * 8 bytes to each scan line! */
1239
1240 /* 8x8 8-bit transpose pass 1 */
1241 dct_trn8_8(p0, p1);
1242 dct_trn8_8(p2, p3);
1243 dct_trn8_8(p4, p5);
1244 dct_trn8_8(p6, p7);
1245
1246 /* pass 2 */
1247 dct_trn8_16(p0, p2);
1248 dct_trn8_16(p1, p3);
1249 dct_trn8_16(p4, p6);
1250 dct_trn8_16(p5, p7);
1251
1252 /* pass 3 */
1253 dct_trn8_32(p0, p4);
1254 dct_trn8_32(p1, p5);
1255 dct_trn8_32(p2, p6);
1256 dct_trn8_32(p3, p7);
1257
/* each vst1_u8 writes 8 bytes; out then moves on by out_stride
 * (no advance after the eighth store) */
1258 /* store */
1259 vst1_u8(out, p0);
1260 out += out_stride;
1261 vst1_u8(out, p1);
1262 out += out_stride;
1263 vst1_u8(out, p2);
1264 out += out_stride;
1265 vst1_u8(out, p3);
1266 out += out_stride;
1267 vst1_u8(out, p4);
1268 out += out_stride;
1269 vst1_u8(out, p5);
1270 out += out_stride;
1271 vst1_u8(out, p6);
1272 out += out_stride;
1273 vst1_u8(out, p7);
1274
1275#undef dct_trn8_8
1276#undef dct_trn8_16
1277#undef dct_trn8_32
1278 }
1279
1280#undef dct_long_mul
1281#undef dct_long_mac
1282#undef dct_widen
1283#undef dct_wadd
1284#undef dct_wsub
1285#undef dct_bfly32o
1286#undef dct_pass
1287}
1288
1289#endif /* RJPEG_NEON */
1290
1291static uint8_t rjpeg_get_marker(rjpeg_jpeg *j)
1292{
1293 uint8_t x;
1294
1295 if (j->marker != RJPEG_MARKER_NONE)
1296 {
1297 x = j->marker;
1298 j->marker = RJPEG_MARKER_NONE;
1299 return x;
1300 }
1301
1302 x = rjpeg_get8(j->s);
1303 if (x != 0xff)
1304 return RJPEG_MARKER_NONE;
1305 while (x == 0xff)
1306 x = rjpeg_get8(j->s);
1307 return x;
1308}
1309
1310/* after a restart interval, rjpeg_jpeg_reset the entropy decoder and
1311 * the dc prediction
1312 */
1313static void rjpeg_jpeg_reset(rjpeg_jpeg *j)
1314{
1315 j->code_bits = 0;
1316 j->code_buffer = 0;
1317 j->nomore = 0;
1318 j->img_comp[0].dc_pred = 0;
1319 j->img_comp[1].dc_pred = 0;
1320 j->img_comp[2].dc_pred = 0;
1321 j->marker = RJPEG_MARKER_NONE;
1322 j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
1323 j->eob_run = 0;
1324
1325 /* no more than 1<<31 MCUs if no restart_interal? that's plenty safe,
1326 * since we don't even allow 1<<30 pixels */
1327}
1328
1329static int rjpeg_parse_entropy_coded_data(rjpeg_jpeg *z)
1330{
1331 rjpeg_jpeg_reset(z);
1332
1333 if (z->scan_n == 1)
1334 {
1335 int i, j;
1336 int n = z->order[0];
1337 int w = (z->img_comp[n].x+7) >> 3;
1338 int h = (z->img_comp[n].y+7) >> 3;
1339
1340 /* non-interleaved data, we just need to process one block at a time,
1341 * in trivial scanline order
1342 * number of blocks to do just depends on how many actual "pixels" this
1343 * component has, independent of interleaved MCU blocking and such */
1344
1345 if (z->progressive)
1346 {
1347 for (j = 0; j < h; ++j)
1348 {
1349 for (i = 0; i < w; ++i)
1350 {
1351 short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
1352
1353 if (z->spec_start == 0)
1354 {
1355 if (!rjpeg_jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
1356 return 0;
1357 }
1358 else
1359 {
1360 int ha = z->img_comp[n].ha;
1361 if (!rjpeg_jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
1362 return 0;
1363 }
1364
1365 /* every data block is an MCU, so countdown the restart interval */
1366 if (--z->todo <= 0)
1367 {
1368 if (z->code_bits < 24)
1369 rjpeg_grow_buffer_unsafe(z);
1370
1371 if (!RJPEG_RESTART(z->marker))
1372 return 1;
1373 rjpeg_jpeg_reset(z);
1374 }
1375 }
1376 }
1377 }
1378 else
1379 {
1380 RJPEG_SIMD_ALIGN(short, data[64]);
1381
1382 for (j = 0; j < h; ++j)
1383 {
1384 for (i = 0; i < w; ++i)
1385 {
1386 int ha = z->img_comp[n].ha;
1387 if (!rjpeg_jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd,
1388 z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq]))
1389 return 0;
1390
1391 z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8,
1392 z->img_comp[n].w2, data);
1393
1394 /* every data block is an MCU, so countdown the restart interval */
1395 if (--z->todo <= 0)
1396 {
1397 if (z->code_bits < 24)
1398 rjpeg_grow_buffer_unsafe(z);
1399
1400 /* if it's NOT a restart, then just bail,
1401 * so we get corrupt data rather than no data */
1402 if (!RJPEG_RESTART(z->marker))
1403 return 1;
1404 rjpeg_jpeg_reset(z);
1405 }
1406 }
1407 }
1408 }
1409 }
1410 else
1411 {
1412 /* interleaved */
1413 int i,j,k,x,y;
1414
1415 if (z->progressive)
1416 {
1417 for (j = 0; j < z->img_mcu_y; ++j)
1418 {
1419 for (i = 0; i < z->img_mcu_x; ++i)
1420 {
1421 /* scan an interleaved MCU... process scan_n components in order */
1422 for (k = 0; k < z->scan_n; ++k)
1423 {
1424 int n = z->order[k];
1425 /* scan out an MCU's worth of this component; that's just determined
1426 * by the basic H and V specified for the component */
1427 for (y = 0; y < z->img_comp[n].v; ++y)
1428 {
1429 for (x = 0; x < z->img_comp[n].h; ++x)
1430 {
1431 int x2 = (i*z->img_comp[n].h + x);
1432 int y2 = (j*z->img_comp[n].v + y);
1433 short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
1434 if (!rjpeg_jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
1435 return 0;
1436 }
1437 }
1438 }
1439
1440 /* after all interleaved components, that's an interleaved MCU,
1441 * so now count down the restart interval */
1442 if (--z->todo <= 0)
1443 {
1444 if (z->code_bits < 24)
1445 rjpeg_grow_buffer_unsafe(z);
1446 if (!RJPEG_RESTART(z->marker))
1447 return 1;
1448 rjpeg_jpeg_reset(z);
1449 }
1450 }
1451 }
1452 }
1453 else
1454 {
1455 RJPEG_SIMD_ALIGN(short, data[64]);
1456
1457 for (j = 0; j < z->img_mcu_y; ++j)
1458 {
1459 for (i = 0; i < z->img_mcu_x; ++i)
1460 {
1461 /* scan an interleaved MCU... process scan_n components in order */
1462 for (k = 0; k < z->scan_n; ++k)
1463 {
1464 int n = z->order[k];
1465 /* scan out an MCU's worth of this component; that's just determined
1466 * by the basic H and V specified for the component */
1467 for (y = 0; y < z->img_comp[n].v; ++y)
1468 {
1469 for (x = 0; x < z->img_comp[n].h; ++x)
1470 {
1471 int x2 = (i*z->img_comp[n].h + x)*8;
1472 int y2 = (j*z->img_comp[n].v + y)*8;
1473 int ha = z->img_comp[n].ha;
1474
1475 if (!rjpeg_jpeg_decode_block(z, data,
1476 z->huff_dc+z->img_comp[n].hd,
1477 z->huff_ac+ha, z->fast_ac[ha],
1478 n, z->dequant[z->img_comp[n].tq]))
1479 return 0;
1480
1481 z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2,
1482 z->img_comp[n].w2, data);
1483 }
1484 }
1485 }
1486
1487 /* after all interleaved components, that's an interleaved MCU,
1488 * so now count down the restart interval */
1489 if (--z->todo <= 0)
1490 {
1491 if (z->code_bits < 24)
1492 rjpeg_grow_buffer_unsafe(z);
1493 if (!RJPEG_RESTART(z->marker))
1494 return 1;
1495 rjpeg_jpeg_reset(z);
1496 }
1497 }
1498 }
1499 }
1500 }
1501
1502 return 1;
1503}
1504
/* Multiplies each of the 64 coefficients in `data` in place by the
 * matching entry of the quantization table `dequant`. */
static void rjpeg_jpeg_dequantize(short *data, uint8_t *dequant)
{
   short *stop = data + 64;

   for (; data < stop; ++data, ++dequant)
      *data = (short)(*data * *dequant);
}
1511
1512static void rjpeg_jpeg_finish(rjpeg_jpeg *z)
1513{
1514 int i,j,n;
1515
1516 if (!z->progressive)
1517 return;
1518
1519 /* dequantize and IDCT the data */
1520 for (n = 0; n < z->s->img_n; ++n)
1521 {
1522 int w = (z->img_comp[n].x+7) >> 3;
1523 int h = (z->img_comp[n].y+7) >> 3;
1524 for (j = 0; j < h; ++j)
1525 {
1526 for (i = 0; i < w; ++i)
1527 {
1528 short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
1529 rjpeg_jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
1530 z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8,
1531 z->img_comp[n].w2, data);
1532 }
1533 }
1534 }
1535}
1536
1537static int rjpeg_process_marker(rjpeg_jpeg *z, int m)
1538{
1539 int L;
1540 switch (m)
1541 {
1542 case RJPEG_MARKER_NONE: /* no marker found */
1543 /* Expected marker. Corrupt JPEG? */
1544 return 0;
1545
1546 case 0xDD: /* DRI - specify restart interval */
1547
1548 /* Bad DRI length. Corrupt JPEG? */
1549 if (RJPEG_GET16BE(z->s) != 4)
1550 return 0;
1551
1552 z->restart_interval = RJPEG_GET16BE(z->s);
1553 return 1;
1554
1555 case 0xDB: /* DQT - define quantization table */
1556 L = RJPEG_GET16BE(z->s)-2;
1557 while (L > 0)
1558 {
1559 int q = rjpeg_get8(z->s);
1560 int p = q >> 4;
1561 int t = q & 15,i;
1562
1563 /* Bad DQT type. Corrupt JPEG? */
1564 if (p != 0)
1565 return 0;
1566
1567 /* Bad DQT table. Corrupt JPEG? */
1568 if (t > 3)
1569 return 0;
1570
1571 for (i = 0; i < 64; ++i)
1572 z->dequant[t][rjpeg_jpeg_dezigzag[i]] = rjpeg_get8(z->s);
1573 L -= 65;
1574 }
1575 return L == 0;
1576
1577 case 0xC4: /* DHT - define huffman table */
1578 L = RJPEG_GET16BE(z->s)-2;
1579 while (L > 0)
1580 {
1581 int sizes[16],i,n = 0;
1582 uint8_t *v = NULL;
1583 int q = rjpeg_get8(z->s);
1584 int tc = q >> 4;
1585 int th = q & 15;
1586
1587 /* Bad DHT header. Corrupt JPEG? */
1588 if (tc > 1 || th > 3)
1589 return 0;
1590
1591 for (i = 0; i < 16; ++i)
1592 {
1593 sizes[i] = rjpeg_get8(z->s);
1594 n += sizes[i];
1595 }
1596 L -= 17;
1597
1598 if (tc == 0)
1599 {
1600 if (!rjpeg_build_huffman(z->huff_dc+th, sizes))
1601 return 0;
1602 v = z->huff_dc[th].values;
1603 }
1604 else
1605 {
1606 if (!rjpeg_build_huffman(z->huff_ac+th, sizes))
1607 return 0;
1608 v = z->huff_ac[th].values;
1609 }
1610 for (i = 0; i < n; ++i)
1611 v[i] = rjpeg_get8(z->s);
1612 if (tc != 0)
1613 rjpeg_build_fast_ac(z->fast_ac[th], z->huff_ac + th);
1614 L -= n;
1615 }
1616 return L == 0;
1617 }
1618
1619 /* check for comment block or APP blocks */
1620 if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE)
1621 {
1622 int n = RJPEG_GET16BE(z->s)-2;
1623
1624 if (n < 0)
1625 z->s->img_buffer = z->s->img_buffer_end;
1626 else
1627 z->s->img_buffer += n;
1628
1629 return 1;
1630 }
1631 return 0;
1632}
1633
1634/* after we see SOS */
1635static int rjpeg_process_scan_header(rjpeg_jpeg *z)
1636{
1637 int i;
1638 int aa;
1639 int Ls = RJPEG_GET16BE(z->s);
1640
1641 z->scan_n = rjpeg_get8(z->s);
1642
1643 /* Bad SOS component count. Corrupt JPEG? */
1644 if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n)
1645 return 0;
1646
1647 /* Bad SOS length. Corrupt JPEG? */
1648 if (Ls != 6+2*z->scan_n)
1649 return 0;
1650
1651 for (i = 0; i < z->scan_n; ++i)
1652 {
1653 int which;
1654 int id = rjpeg_get8(z->s);
1655 int q = rjpeg_get8(z->s);
1656
1657 for (which = 0; which < z->s->img_n; ++which)
1658 if (z->img_comp[which].id == id)
1659 break;
1660 if (which == z->s->img_n)
1661 return 0; /* no match */
1662
1663 /* Bad DC huff. Corrupt JPEG? */
1664 z->img_comp[which].hd = q >> 4; if (z->img_comp[which].hd > 3)
1665 return 0;
1666
1667 /* Bad AC huff. Corrupt JPEG? */
1668 z->img_comp[which].ha = q & 15; if (z->img_comp[which].ha > 3)
1669 return 0;
1670
1671 z->order[i] = which;
1672 }
1673
1674 z->spec_start = rjpeg_get8(z->s);
1675 z->spec_end = rjpeg_get8(z->s); /* should be 63, but might be 0 */
1676 aa = rjpeg_get8(z->s);
1677 z->succ_high = (aa >> 4);
1678 z->succ_low = (aa & 15);
1679
1680 if (z->progressive)
1681 {
1682 /* Bad SOS. Corrupt JPEG? */
1683 if ( z->spec_start > 63 ||
1684 z->spec_end > 63 ||
1685 z->spec_start > z->spec_end ||
1686 z->succ_high > 13 ||
1687 z->succ_low > 13)
1688 return 0;
1689 }
1690 else
1691 {
1692 /* Bad SOS. Corrupt JPEG? */
1693 if (z->spec_start != 0)
1694 return 0;
1695 if (z->succ_high != 0 || z->succ_low != 0)
1696 return 0;
1697
1698 z->spec_end = 63;
1699 }
1700
1701 return 1;
1702}
1703
1704static int rjpeg_process_frame_header(rjpeg_jpeg *z, int scan)
1705{
1706 rjpeg_context *s = z->s;
1707 int Lf,p,i,q, h_max=1,v_max=1,c;
1708 Lf = RJPEG_GET16BE(s);
1709
1710 /* JPEG */
1711
1712 /* Bad SOF len. Corrupt JPEG? */
1713 if (Lf < 11)
1714 return 0;
1715
1716 p = rjpeg_get8(s);
1717
1718 /* JPEG baseline */
1719
1720 /* Only 8-bit. JPEG format not supported? */
1721 if (p != 8)
1722 return 0;
1723
1724 s->img_y = RJPEG_GET16BE(s);
1725
1726 /* Legal, but we don't handle it--but neither does IJG */
1727
1728 /* No header height, JPEG format not supported? */
1729 if (s->img_y == 0)
1730 return 0;
1731
1732 s->img_x = RJPEG_GET16BE(s);
1733
1734 /* No header width. Corrupt JPEG? */
1735 if (s->img_x == 0)
1736 return 0;
1737
1738 c = rjpeg_get8(s);
1739
1740 /* JFIF requires */
1741
1742 /* Bad component count. Corrupt JPEG? */
1743 if (c != 3 && c != 1)
1744 return 0;
1745
1746 s->img_n = c;
1747
1748 for (i = 0; i < c; ++i)
1749 {
1750 z->img_comp[i].data = NULL;
1751 z->img_comp[i].linebuf = NULL;
1752 }
1753
1754 /* Bad SOF length. Corrupt JPEG? */
1755 if (Lf != 8+3*s->img_n)
1756 return 0;
1757
1758 for (i = 0; i < s->img_n; ++i)
1759 {
1760 z->img_comp[i].id = rjpeg_get8(s);
1761 if (z->img_comp[i].id != i+1) /* JFIF requires */
1762 if (z->img_comp[i].id != i) /* some version of jpegtran outputs non-JFIF-compliant files! */
1763 return 0;
1764
1765 q = rjpeg_get8(s);
1766 z->img_comp[i].h = (q >> 4);
1767
1768 /* Bad H. Corrupt JPEG? */
1769 if (!z->img_comp[i].h || z->img_comp[i].h > 4)
1770 return 0;
1771
1772 z->img_comp[i].v = q & 15;
1773
1774 /* Bad V. Corrupt JPEG? */
1775 if (!z->img_comp[i].v || z->img_comp[i].v > 4)
1776 return 0;
1777
1778 z->img_comp[i].tq = rjpeg_get8(s);
1779
1780 /* Bad TQ. Corrupt JPEG? */
1781 if (z->img_comp[i].tq > 3)
1782 return 0;
1783 }
1784
1785 if (scan != RJPEG_SCAN_LOAD)
1786 return 1;
1787
1788 /* Image too large to decode? */
1789 if ((1 << 30) / s->img_x / s->img_n < s->img_y)
1790 return 0;
1791
1792 for (i = 0; i < s->img_n; ++i)
1793 {
1794 if (z->img_comp[i].h > h_max)
1795 h_max = z->img_comp[i].h;
1796 if (z->img_comp[i].v > v_max)
1797 v_max = z->img_comp[i].v;
1798 }
1799
1800 /* compute interleaved MCU info */
1801 z->img_h_max = h_max;
1802 z->img_v_max = v_max;
1803 z->img_mcu_w = h_max * 8;
1804 z->img_mcu_h = v_max * 8;
1805 z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
1806 z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
1807
1808 if (z->progressive)
1809 {
1810 for (i = 0; i < s->img_n; ++i)
1811 {
1812 /* number of effective pixels (e.g. for non-interleaved MCU) */
1813 z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
1814 z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
1815
1816 /* to simplify generation, we'll allocate enough memory to decode
1817 * the bogus oversized data from using interleaved MCUs and their
1818 * big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
1819 * discard the extra data until colorspace conversion */
1820 z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
1821 z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
1822 z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
1823
1824 /* Out of memory? */
1825 if (!z->img_comp[i].raw_data)
1826 {
1827 for (--i; i >= 0; --i)
1828 {
1829 free(z->img_comp[i].raw_data);
1830 z->img_comp[i].data = NULL;
1831 }
1832
1833 return 0;
1834 }
1835
1836 /* align blocks for IDCT using MMX/SSE */
1837 z->img_comp[i].data = (uint8_t*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
1838 z->img_comp[i].linebuf = NULL;
1839 z->img_comp[i].coeff_w = (z->img_comp[i].w2 + 7) >> 3;
1840 z->img_comp[i].coeff_h = (z->img_comp[i].h2 + 7) >> 3;
1841 z->img_comp[i].raw_coeff = malloc(z->img_comp[i].coeff_w *
1842 z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);
1843 z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
1844 }
1845 }
1846 else
1847 {
1848 for (i = 0; i < s->img_n; ++i)
1849 {
1850 /* number of effective pixels (e.g. for non-interleaved MCU) */
1851 z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
1852 z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
1853
1854 /* to simplify generation, we'll allocate enough memory to decode
1855 * the bogus oversized data from using interleaved MCUs and their
1856 * big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
1857 * discard the extra data until colorspace conversion */
1858 z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
1859 z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
1860 z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
1861
1862 /* Out of memory? */
1863 if (!z->img_comp[i].raw_data)
1864 {
1865 for (--i; i >= 0; --i)
1866 {
1867 free(z->img_comp[i].raw_data);
1868 z->img_comp[i].data = NULL;
1869 }
1870 }
1871
1872 /* align blocks for IDCT using MMX/SSE */
1873 z->img_comp[i].data = (uint8_t*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
1874 z->img_comp[i].linebuf = NULL;
1875 z->img_comp[i].coeff = 0;
1876 z->img_comp[i].raw_coeff = 0;
1877 }
1878 }
1879
1880 return 1;
1881}
1882
1883static int rjpeg_decode_jpeg_header(rjpeg_jpeg *z, int scan)
1884{
1885 int m;
1886 z->marker = RJPEG_MARKER_NONE; /* initialize cached marker to empty */
1887 m = rjpeg_get_marker(z);
1888
1889 /* No SOI. Corrupt JPEG? */
1890 if (m != JPEG_MARKER_SOI)
1891 return 0;
1892
1893 if (scan == RJPEG_SCAN_TYPE)
1894 return 1;
1895
1896 m = rjpeg_get_marker(z);
1897 while (!RJPEG_SOF(m))
1898 {
1899 if (!rjpeg_process_marker(z,m))
1900 return 0;
1901 m = rjpeg_get_marker(z);
1902 while (m == RJPEG_MARKER_NONE)
1903 {
1904 /* some files have extra padding after their blocks, so ok, we'll scan */
1905
1906 /* No SOF. Corrupt JPEG? */
1907 if (RJPEG_AT_EOF(z->s))
1908 return 0;
1909
1910 m = rjpeg_get_marker(z);
1911 }
1912 }
1913 z->progressive = RJPEG_SOF_PROGRESSIVE(m);
1914 if (!rjpeg_process_frame_header(z, scan))
1915 return 0;
1916 return 1;
1917}
1918
1919/* decode image to YCbCr format */
1920static int rjpeg_decode_jpeg_image(rjpeg_jpeg *j)
1921{
1922 int m;
1923 for (m = 0; m < 4; m++)
1924 {
1925 j->img_comp[m].raw_data = NULL;
1926 j->img_comp[m].raw_coeff = NULL;
1927 }
1928 j->restart_interval = 0;
1929 if (!rjpeg_decode_jpeg_header(j, RJPEG_SCAN_LOAD))
1930 return 0;
1931 m = rjpeg_get_marker(j);
1932
1933 while (m != JPEG_MARKER_EOI)
1934 {
1935 if (m == JPEG_MARKER_SOS)
1936 {
1937 if (!rjpeg_process_scan_header(j))
1938 return 0;
1939 if (!rjpeg_parse_entropy_coded_data(j))
1940 return 0;
1941
1942 if (j->marker == RJPEG_MARKER_NONE )
1943 {
1944 /* handle 0s at the end of image data from IP Kamera 9060 */
1945
1946 while (!RJPEG_AT_EOF(j->s))
1947 {
1948 int x = rjpeg_get8(j->s);
1949 if (x == 255)
1950 {
1951 j->marker = rjpeg_get8(j->s);
1952 break;
1953 }
1954 else if (x != 0) /* Junk before marker. Corrupt JPEG? */
1955 return 0;
1956 }
1957
1958 /* if we reach eof without hitting a marker,
1959 * rjpeg_get_marker() below will fail and we'll eventually return 0 */
1960 }
1961 }
1962 else
1963 {
1964 if (!rjpeg_process_marker(j, m))
1965 return 0;
1966 }
1967 m = rjpeg_get_marker(j);
1968 }
1969
1970 if (j->progressive)
1971 rjpeg_jpeg_finish(j);
1972 return 1;
1973}
1974
1975/* static jfif-centered resampling (across block boundaries) */
1976
/* 1:1 "resampler": nothing is written, the near input row itself is
 * returned. All other arguments are ignored. */
static uint8_t *rjpeg_resample_row_1(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   uint8_t *row = in_near;

   /* unused */
   (void)hs;
   (void)w;
   (void)in_far;
   (void)out;

   return row;
}
1986
/* Vertical 2x upsampling: produces one output row of w samples, each
 * weighted 3:1 between the near and the far input row (with +2 rounding
 * before the divide by 4). hs is unused. Returns out. */
static uint8_t* rjpeg_resample_row_v_2(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   int x = 0;

   (void)hs;

   while (x < w)
   {
      int blend = 3*in_near[x] + in_far[x] + 2;
      out[x]    = RJPEG_DIV4(blend);
      ++x;
   }

   return out;
}
1997
/* Horizontal 2x upsampling: every input sample of in_near yields two
 * output samples, each weighted 3:1 towards the nearer neighbour. The
 * first and last output samples are plain copies of the row ends.
 * in_far and hs are unused. Returns out. */
static uint8_t* rjpeg_resample_row_h_2(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   int i;
   uint8_t *src = in_near;

   (void)in_far;
   (void)hs;

   if (w == 1)
   {
      /* a single sample leaves nothing to interpolate with */
      uint8_t only = src[0];
      out[1]       = only;
      out[0]       = only;
      return out;
   }

   /* left edge */
   out[0] = src[0];
   out[1] = RJPEG_DIV4(src[0]*3 + src[1] + 2);

   /* interior samples */
   i = 1;
   while (i < w-1)
   {
      int center = 3 * src[i] + 2;
      out[i*2+0] = RJPEG_DIV4(center+src[i-1]);
      out[i*2+1] = RJPEG_DIV4(center+src[i+1]);
      ++i;
   }

   /* right edge */
   out[i*2+0] = RJPEG_DIV4(src[w-2]*3 + src[w-1] + 2);
   out[i*2+1] = src[w-1];

   return out;
}
2029
/* 2x2 upsampling: writes 2*w output samples. Each input column is first
 * combined vertically as 3*near + far; neighbouring columns are then
 * blended 3:1 horizontally. The two row ends use the vertical blend only.
 * hs is unused. Returns out. */
static uint8_t *rjpeg_resample_row_hv_2(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   int col;
   int left, right;

   (void)hs;

   if (w == 1)
   {
      uint8_t only = RJPEG_DIV4(3*in_near[0] + in_far[0] + 2);
      out[1]       = only;
      out[0]       = only;
      return out;
   }

   right  = 3*in_near[0] + in_far[0];
   out[0] = RJPEG_DIV4(right+2);

   col = 1;
   while (col < w)
   {
      left         = right;
      right        = 3*in_near[col]+in_far[col];
      out[col*2-1] = RJPEG_DIV16(3*left + right + 8);
      out[col*2  ] = RJPEG_DIV16(3*right + left + 8);
      ++col;
   }
   out[w*2-1] = RJPEG_DIV4(right+2);

   return out;
}
2057
/* SIMD variant of the 2x2 upsampler; only present when __SSE2__ or
 * RJPEG_NEON is defined.
 *
 * out     - receives 2*w output samples
 * in_near - input row weighted 3x in the vertical blend
 * in_far  - input row weighted 1x in the vertical blend
 * w       - number of input samples per row
 * hs      - unused
 * Returns out.
 *
 * The vector loop consumes 8 input samples per iteration for as long as
 * at least one more sample follows the group (it reads in_near[i+8] and
 * in_far[i+8]); whatever is left, including the final boundary sample,
 * is produced by the scalar code after the loop. */
2058#if defined(__SSE2__) || defined(RJPEG_NEON)
2059static uint8_t *rjpeg_resample_row_hv_2_simd(uint8_t *out, uint8_t *in_near,
2060 uint8_t *in_far, int w, int hs)
2061{
2062 /* need to generate 2x2 samples for every one in input */
2063 int i = 0,t0,t1;
2064
2065 if (w == 1)
2066 {
2067 out[0] = out[1] = RJPEG_DIV4(3*in_near[0] + in_far[0] + 2);
2068 return out;
2069 }
2070
2071 t1 = 3*in_near[0] + in_far[0];
2072 /* process groups of 8 pixels for as long as we can.
2073 * note we can't handle the last pixel in a row in this loop
2074 * because we need to handle the filter boundary conditions.
2075 */
2076 for (; i < ((w-1) & ~7); i += 8)
2077 {
2078#if defined(__SSE2__)
2079 /* load and perform the vertical filtering pass
2080 * this uses 3*x + y = 4*x + (y - x) */
2081 __m128i zero = _mm_setzero_si128();
2082 __m128i farb = _mm_loadl_epi64((__m128i *) (in_far + i));
2083 __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
2084 __m128i farw = _mm_unpacklo_epi8(farb, zero);
2085 __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
2086 __m128i diff = _mm_sub_epi16(farw, nearw);
2087 __m128i nears = _mm_slli_epi16(nearw, 2);
2088 __m128i curr = _mm_add_epi16(nears, diff); /* current row */
2089
2090 /* horizontal filter works the same based on shifted vers of current
2091 * row. "prev" is current row shifted right by 1 pixel; we need to
2092 * insert the previous pixel value (from t1).
2093 * "next" is current row shifted left by 1 pixel, with first pixel
2094 * of next block of 8 pixels added in.
2095 */
2096 __m128i prv0 = _mm_slli_si128(curr, 2);
2097 __m128i nxt0 = _mm_srli_si128(curr, 2);
2098 __m128i prev = _mm_insert_epi16(prv0, t1, 0);
2099 __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);
2100
2101 /* horizontal filter, polyphase implementation since it's convenient:
2102 * even pixels = 3*cur + prev = cur*4 + (prev - cur)
2103 * odd pixels = 3*cur + next = cur*4 + (next - cur)
2104 * note the shared term. */
2105 __m128i bias = _mm_set1_epi16(8);
2106 __m128i curs = _mm_slli_epi16(curr, 2);
2107 __m128i prvd = _mm_sub_epi16(prev, curr);
2108 __m128i nxtd = _mm_sub_epi16(next, curr);
2109 __m128i curb = _mm_add_epi16(curs, bias);
2110 __m128i even = _mm_add_epi16(prvd, curb);
2111 __m128i odd = _mm_add_epi16(nxtd, curb);
2112
2113 /* interleave even and odd pixels, then undo scaling. */
2114 __m128i int0 = _mm_unpacklo_epi16(even, odd);
2115 __m128i int1 = _mm_unpackhi_epi16(even, odd);
2116 __m128i de0 = _mm_srli_epi16(int0, 4);
2117 __m128i de1 = _mm_srli_epi16(int1, 4);
2118
2119 /* pack and write output */
2120 __m128i outv = _mm_packus_epi16(de0, de1);
2121 _mm_storeu_si128((__m128i *) (out + i*2), outv);
2122#elif defined(RJPEG_NEON)
2123 /* load and perform the vertical filtering pass
2124 * this uses 3*x + y = 4*x + (y - x) */
2125 uint8x8_t farb = vld1_u8(in_far + i);
2126 uint8x8_t nearb = vld1_u8(in_near + i);
2127 int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
2128 int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
2129 int16x8_t curr = vaddq_s16(nears, diff); /* current row */
2130
2131 /* horizontal filter works the same based on shifted vers of current
2132 * row. "prev" is current row shifted right by 1 pixel; we need to
2133 * insert the previous pixel value (from t1).
2134 * "next" is current row shifted left by 1 pixel, with first pixel
2135 * of next block of 8 pixels added in. */
2136 int16x8_t prv0 = vextq_s16(curr, curr, 7);
2137 int16x8_t nxt0 = vextq_s16(curr, curr, 1);
2138 int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
2139 int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);
2140
2141 /* horizontal filter, polyphase implementation since it's convenient:
2142 * even pixels = 3*cur + prev = cur*4 + (prev - cur)
2143 * odd pixels = 3*cur + next = cur*4 + (next - cur)
2144 * note the shared term.
2145 */
2146 int16x8_t curs = vshlq_n_s16(curr, 2);
2147 int16x8_t prvd = vsubq_s16(prev, curr);
2148 int16x8_t nxtd = vsubq_s16(next, curr);
2149 int16x8_t even = vaddq_s16(curs, prvd);
2150 int16x8_t odd = vaddq_s16(curs, nxtd);
2151
2152 /* undo scaling and round, then store with even/odd phases interleaved */
2153 uint8x8x2_t o;
2154 o.val[0] = vqrshrun_n_s16(even, 4);
2155 o.val[1] = vqrshrun_n_s16(odd, 4);
2156 vst2_u8(out + i*2, o);
2157#endif
2158
2159 /* "previous" value for next iteration */
2160 t1 = 3*in_near[i+7] + in_far[i+7];
2161 }
2162
/* scalar tail: the first leftover column only needs its even-phase output
 * (out[i*2]); its left neighbour is the t1 carried out of the loop above */
2163 t0 = t1;
2164 t1 = 3*in_near[i] + in_far[i];
2165 out[i*2] = RJPEG_DIV16(3*t1 + t0 + 8);
2166
2167 for (++i; i < w; ++i)
2168 {
2169 t0 = t1;
2170 t1 = 3*in_near[i]+in_far[i];
2171 out[i*2-1] = RJPEG_DIV16(3*t0 + t1 + 8);
2172 out[i*2 ] = RJPEG_DIV16(3*t1 + t0 + 8);
2173 }
2174 out[w*2-1] = RJPEG_DIV4(t1+2);
2175
2176 (void)hs;
2177
2178 return out;
2179}
2180#endif
2181
/* Nearest-neighbour horizontal upsampling: each of the w samples of
 * in_near is written hs times in a row to out. in_far is unused.
 * Returns out. */
static uint8_t *rjpeg_resample_row_generic(uint8_t *out,
      uint8_t *in_near, uint8_t *in_far, int w, int hs)
{
   int src;
   uint8_t *dst = out;

   (void)in_far;

   for (src = 0; src < w; ++src)
   {
      int rep;
      for (rep = 0; rep < hs; ++rep)
         *dst++ = in_near[src];
   }

   return out;
}
2194
/* this is a reduced-precision calculation of YCbCr-to-RGB introduced
 * to make sure the code produces the same results in both SIMD and scalar */
#ifndef FLOAT2FIXED
#define FLOAT2FIXED(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
#endif

/* Converts `count` YCbCr samples to RGB plus an opaque alpha byte.
 *
 * out   - destination; each pixel writes out[0..3] = R, G, B, 255 and the
 *         pointer then advances by `step` bytes
 * y     - luma samples
 * pcb   - Cb samples (centred on 128)
 * pcr   - Cr samples (centred on 128)
 * count - number of pixels to convert
 * step  - distance in bytes between output pixels
 *
 * The arithmetic is done in 20-bit fixed point. Results outside 0..255 are
 * saturated: values below zero become 0, values above 255 become 255. */
static void rjpeg_YCbCr_to_RGB_row(uint8_t *out, const uint8_t *y,
      const uint8_t *pcb, const uint8_t *pcr, int count, int step)
{
   int i;
   for (i = 0; i < count; ++i)
   {
      int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
      int cr      = pcr[i] - 128;
      int cb      = pcb[i] - 128;
      int r       = y_fixed + cr* FLOAT2FIXED(1.40200f);
      int g       = y_fixed + (cr*-FLOAT2FIXED(0.71414f)) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
      int b       = y_fixed + cb* FLOAT2FIXED(1.77200f);
      r >>= 20;
      g >>= 20;
      b >>= 20;

      /* The unsigned compare catches both out-of-range directions at once
       * (a negative value looks huge when viewed as unsigned), so the sign
       * has to be re-checked to pick the right end of the range. */
      if ((unsigned) r > 255)
         r = (r < 0) ? 0 : 255;
      if ((unsigned) g > 255)
         g = (g < 0) ? 0 : 255;
      if ((unsigned) b > 255)
         b = (b < 0) ? 0 : 255;

      out[0] = (uint8_t)r;
      out[1] = (uint8_t)g;
      out[2] = (uint8_t)b;
      out[3] = 255;
      out += step;
   }
}
2229
2230#if defined(__SSE2__) || defined(RJPEG_NEON)
/* SIMD-accelerated YCbCr -> RGB conversion of one row.
 *
 * Same parameters as the scalar row converter: `out` receives R, G, B, 255
 * per pixel and advances by `step` bytes; `y`, `pcb`, `pcr` each hold
 * `count` samples.
 *
 * Only step == 4 is vectorised, eight pixels per iteration. Any leftover
 * pixels, and every pixel when step != 4, go through the scalar loop at the
 * end, which clamps each channel to 0..255 (negative -> 0, too large -> 255)
 * to agree with the saturating packs used by the vector paths.
 */
static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
      const uint8_t *pcb, const uint8_t *pcr, int count, int step)
{
   int i = 0;

#if defined(__SSE2__)
   /* step == 3 is pretty ugly on the final interleave, and i'm not convinced
    * it's useful in practice (you wouldn't use it for textures, for example).
    * so just accelerate step == 4 case.
    */
   if (step == 4)
   {
      /* this is a fairly straightforward implementation and not super-optimized. */
      __m128i signflip  = _mm_set1_epi8(-0x80);
      __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
      __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
      __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
      __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
      __m128i y_bias    = _mm_set1_epi8((char) (unsigned char) 128);
      __m128i xw        = _mm_set1_epi16(255); /* alpha channel */

      for (; i+7 < count; i += 8)
      {
         /* load */
         __m128i y_bytes   = _mm_loadl_epi64((__m128i *) (y+i));
         __m128i cr_bytes  = _mm_loadl_epi64((__m128i *) (pcr+i));
         __m128i cb_bytes  = _mm_loadl_epi64((__m128i *) (pcb+i));
         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); /* -128 */
         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); /* -128 */

         /* unpack to short (and left-shift cr, cb by 8) */
         __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);

         /* color transform */
         __m128i yws = _mm_srli_epi16(yw, 4);
         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
         __m128i rws = _mm_add_epi16(cr0, yws);
         __m128i gwt = _mm_add_epi16(cb0, yws);
         __m128i bws = _mm_add_epi16(yws, cb1);
         __m128i gws = _mm_add_epi16(gwt, cr1);

         /* descale */
         __m128i rw = _mm_srai_epi16(rws, 4);
         __m128i bw = _mm_srai_epi16(bws, 4);
         __m128i gw = _mm_srai_epi16(gws, 4);

         /* back to byte, set up for transpose */
         __m128i brb = _mm_packus_epi16(rw, bw);
         __m128i gxb = _mm_packus_epi16(gw, xw);

         /* transpose to interleave channels */
         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
         __m128i o1 = _mm_unpackhi_epi16(t0, t1);

         /* store */
         _mm_storeu_si128((__m128i *) (out + 0), o0);
         _mm_storeu_si128((__m128i *) (out + 16), o1);
         out += 32;
      }
   }
#endif

#ifdef RJPEG_NEON
   /* in this version, step=3 support would be easy to add. but is there demand? */
   if (step == 4)
   {
      /* this is a fairly straightforward implementation and not super-optimized. */
      uint8x8_t signflip  = vdup_n_u8(0x80);
      int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
      int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
      int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
      int16x8_t cb_const1 = vdupq_n_s16(   (short) ( 1.77200f*4096.0f+0.5f));

      for (; i+7 < count; i += 8)
      {
         uint8x8x4_t o;

         /* load */
         uint8x8_t y_bytes  = vld1_u8(y + i);
         uint8x8_t cr_bytes = vld1_u8(pcr + i);
         uint8x8_t cb_bytes = vld1_u8(pcb + i);
         int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
         int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));

         /* expand to s16 */
         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
         int16x8_t crw = vshll_n_s8(cr_biased, 7);
         int16x8_t cbw = vshll_n_s8(cb_biased, 7);

         /* color transform */
         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
         int16x8_t rws = vaddq_s16(yws, cr0);
         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
         int16x8_t bws = vaddq_s16(yws, cb1);

         /* undo scaling, round, convert to byte */
         o.val[0] = vqrshrun_n_s16(rws, 4);
         o.val[1] = vqrshrun_n_s16(gws, 4);
         o.val[2] = vqrshrun_n_s16(bws, 4);
         o.val[3] = vdup_n_u8(255);

         /* store, interleaving r/g/b/a */
         vst4_u8(out, o);
         out += 8*4;
      }
   }
#endif

   /* scalar tail: whatever the vector loops above did not consume */
   for (; i < count; ++i)
   {
      int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
      int cr      = pcr[i] - 128;
      int cb      = pcb[i] - 128;
      int r       = y_fixed + cr* FLOAT2FIXED(1.40200f);
      int g       = y_fixed + cr*-FLOAT2FIXED(0.71414f) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
      int b       = y_fixed + cb* FLOAT2FIXED(1.77200f);
      r >>= 20;
      g >>= 20;
      b >>= 20;

      /* One unsigned compare detects both "negative" and "above 255";
       * the inner test picks the matching bound so underflow saturates
       * to 0, like the packus/vqrshrun instructions above. */
      if ((unsigned) r > 255)
         r = (r < 0) ? 0 : 255;
      if ((unsigned) g > 255)
         g = (g < 0) ? 0 : 255;
      if ((unsigned) b > 255)
         b = (b < 0) ? 0 : 255;

      out[0] = (uint8_t)r;
      out[1] = (uint8_t)g;
      out[2] = (uint8_t)b;
      out[3] = 255;
      out += step;
   }
}
2373#endif
2374
2375/* set up the kernels */
2376static void rjpeg_setup_jpeg(rjpeg_jpeg *j)
2377{
2378 uint64_t mask = cpu_features_get();
2379
2380 (void)mask;
2381
2382 j->idct_block_kernel = rjpeg_idct_block;
2383 j->YCbCr_to_RGB_kernel = rjpeg_YCbCr_to_RGB_row;
2384 j->resample_row_hv_2_kernel = rjpeg_resample_row_hv_2;
2385
2386#if defined(__SSE2__)
2387 if (mask & RETRO_SIMD_SSE2)
2388 {
2389 j->idct_block_kernel = rjpeg_idct_simd;
2390 j->YCbCr_to_RGB_kernel = rjpeg_YCbCr_to_RGB_simd;
2391 j->resample_row_hv_2_kernel = rjpeg_resample_row_hv_2_simd;
2392 }
2393#endif
2394
2395#ifdef RJPEG_NEON
2396 j->idct_block_kernel = rjpeg_idct_simd;
2397 j->YCbCr_to_RGB_kernel = rjpeg_YCbCr_to_RGB_simd;
2398 j->resample_row_hv_2_kernel = rjpeg_resample_row_hv_2_simd;
2399#endif
2400}
2401
2402/* clean up the temporary component buffers */
2403static void rjpeg_cleanup_jpeg(rjpeg_jpeg *j)
2404{
2405 int i;
2406 for (i = 0; i < j->s->img_n; ++i)
2407 {
2408 if (j->img_comp[i].raw_data)
2409 {
2410 free(j->img_comp[i].raw_data);
2411 j->img_comp[i].raw_data = NULL;
2412 j->img_comp[i].data = NULL;
2413 }
2414
2415 if (j->img_comp[i].raw_coeff)
2416 {
2417 free(j->img_comp[i].raw_coeff);
2418 j->img_comp[i].raw_coeff = 0;
2419 j->img_comp[i].coeff = 0;
2420 }
2421
2422 if (j->img_comp[i].linebuf)
2423 {
2424 free(j->img_comp[i].linebuf);
2425 j->img_comp[i].linebuf = NULL;
2426 }
2427 }
2428}
2429
2430static uint8_t *rjpeg_load_jpeg_image(rjpeg_jpeg *z,
2431 unsigned *out_x, unsigned *out_y, int *comp, int req_comp)
2432{
2433 int n, decode_n;
2434 int k;
2435 unsigned int i,j;
2436 rjpeg_resample res_comp[4];
2437 uint8_t *coutput[4] = {0};
2438 uint8_t *output = NULL;
2439 z->s->img_n = 0;
2440
2441 /* load a jpeg image from whichever source, but leave in YCbCr format */
2442 if (!rjpeg_decode_jpeg_image(z))
2443 goto error;
2444
2445 /* determine actual number of components to generate */
2446 n = req_comp ? req_comp : z->s->img_n;
2447
2448 if (z->s->img_n == 3 && n < 3)
2449 decode_n = 1;
2450 else
2451 decode_n = z->s->img_n;
2452
2453 /* resample and color-convert */
2454 for (k = 0; k < decode_n; ++k)
2455 {
2456 rjpeg_resample *r = &res_comp[k];
2457
2458 /* allocate line buffer big enough for upsampling off the edges
2459 * with upsample factor of 4 */
2460 z->img_comp[k].linebuf = (uint8_t *) malloc(z->s->img_x + 3);
2461 if (!z->img_comp[k].linebuf)
2462 goto error;
2463
2464 r->hs = z->img_h_max / z->img_comp[k].h;
2465 r->vs = z->img_v_max / z->img_comp[k].v;
2466 r->ystep = r->vs >> 1;
2467 r->w_lores = (z->s->img_x + r->hs-1) / r->hs;
2468 r->ypos = 0;
2469 r->line0 = r->line1 = z->img_comp[k].data;
2470 r->resample = rjpeg_resample_row_generic;
2471
2472 if (r->hs == 1 && r->vs == 1)
2473 r->resample = rjpeg_resample_row_1;
2474 else if (r->hs == 1 && r->vs == 2)
2475 r->resample = rjpeg_resample_row_v_2;
2476 else if (r->hs == 2 && r->vs == 1)
2477 r->resample = rjpeg_resample_row_h_2;
2478 else if (r->hs == 2 && r->vs == 2)
2479 r->resample = z->resample_row_hv_2_kernel;
2480 }
2481
2482 /* can't error after this so, this is safe */
2483 output = (uint8_t *) malloc(n * z->s->img_x * z->s->img_y + 1);
2484
2485 if (!output)
2486 goto error;
2487
2488 /* now go ahead and resample */
2489 for (j = 0; j < z->s->img_y; ++j)
2490 {
2491 uint8_t *out = output + n * z->s->img_x * j;
2492 for (k = 0; k < decode_n; ++k)
2493 {
2494 rjpeg_resample *r = &res_comp[k];
2495 int y_bot = r->ystep >= (r->vs >> 1);
2496
2497 coutput[k] = r->resample(z->img_comp[k].linebuf,
2498 y_bot ? r->line1 : r->line0,
2499 y_bot ? r->line0 : r->line1,
2500 r->w_lores, r->hs);
2501
2502 if (++r->ystep >= r->vs)
2503 {
2504 r->ystep = 0;
2505 r->line0 = r->line1;
2506 if (++r->ypos < z->img_comp[k].y)
2507 r->line1 += z->img_comp[k].w2;
2508 }
2509 }
2510
2511 if (n >= 3)
2512 {
2513 uint8_t *y = coutput[0];
2514 if (y)
2515 {
2516 if (z->s->img_n == 3)
2517 z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
2518 else
2519 for (i = 0; i < z->s->img_x; ++i)
2520 {
2521 out[0] = out[1] = out[2] = y[i];
2522 out[3] = 255; /* not used if n==3 */
2523 out += n;
2524 }
2525 }
2526 }
2527 else
2528 {
2529 uint8_t *y = coutput[0];
2530 if (n == 1)
2531 for (i = 0; i < z->s->img_x; ++i)
2532 out[i] = y[i];
2533 else
2534 for (i = 0; i < z->s->img_x; ++i)
2535 {
2536 *out++ = y[i];
2537 *out++ = 255;
2538 }
2539 }
2540 }
2541
2542 rjpeg_cleanup_jpeg(z);
2543 *out_x = z->s->img_x;
2544 *out_y = z->s->img_y;
2545
2546 if (comp)
2547 *comp = z->s->img_n; /* report original components, not output */
2548 return output;
2549
2550error:
2551 rjpeg_cleanup_jpeg(z);
2552 return NULL;
2553}
2554
2555int rjpeg_process_image(rjpeg_t *rjpeg, void **buf_data,
2556 size_t size, unsigned *width, unsigned *height)
2557{
2558 rjpeg_jpeg j;
2559 rjpeg_context s;
2560 int comp;
2561 uint32_t *img = NULL;
2562 uint32_t *pixels = NULL;
2563 unsigned size_tex = 0;
2564
2565 if (!rjpeg)
2566 return IMAGE_PROCESS_ERROR;
2567
2568 s.img_buffer = (uint8_t*)rjpeg->buff_data;
2569 s.img_buffer_original = (uint8_t*)rjpeg->buff_data;
2570 s.img_buffer_end = (uint8_t*)rjpeg->buff_data + (int)size;
2571
2572 j.s = &s;
2573
2574 rjpeg_setup_jpeg(&j);
2575
2576 img = (uint32_t*)rjpeg_load_jpeg_image(&j, width, height, &comp, 4);
2577
2578 if (!img)
2579 return IMAGE_PROCESS_ERROR;
2580
2581 size_tex = (*width) * (*height);
2582 pixels = (uint32_t*)malloc(size_tex * sizeof(uint32_t));
2583
2584 if (!pixels)
2585 {
2586 free(img);
2587 return IMAGE_PROCESS_ERROR;
2588 }
2589
2590 *buf_data = pixels;
2591
2592 /* Convert RGBA to ARGB */
2593 while (size_tex--)
2594 {
2595 unsigned int texel = img[size_tex];
2596 unsigned int A = texel & 0xFF000000;
2597 unsigned int B = texel & 0x00FF0000;
2598 unsigned int G = texel & 0x0000FF00;
2599 unsigned int R = texel & 0x000000FF;
2600 ((unsigned int*)pixels)[size_tex] = A | (R << 16) | G | (B >> 16);
2601 }
2602
2603 free(img);
2604
2605 return IMAGE_PROCESS_END;
2606}
2607
2608bool rjpeg_set_buf_ptr(rjpeg_t *rjpeg, void *data)
2609{
2610 if (!rjpeg)
2611 return false;
2612
2613 rjpeg->buff_data = (uint8_t*)data;
2614
2615 return true;
2616}
2617
2618void rjpeg_free(rjpeg_t *rjpeg)
2619{
2620 if (!rjpeg)
2621 return;
2622
2623 free(rjpeg);
2624}
2625
2626rjpeg_t *rjpeg_alloc(void)
2627{
2628 rjpeg_t *rjpeg = (rjpeg_t*)calloc(1, sizeof(*rjpeg));
2629 if (!rjpeg)
2630 return NULL;
2631 return rjpeg;
2632}