1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 * Mupen64plus-rsp-hle - jpeg.c *
3 * Mupen64Plus homepage: http://code.google.com/p/mupen64plus/ *
4 * Copyright (C) 2012 Bobby Smiles *
5 * Copyright (C) 2009 Richard Goedeken *
6 * Copyright (C) 2002 Hacktarux *
8 * This program is free software; you can redistribute it and/or modify *
9 * it under the terms of the GNU General Public License as published by *
10 * the Free Software Foundation; either version 2 of the License, or *
11 * (at your option) any later version. *
13 * This program is distributed in the hope that it will be useful, *
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
16 * GNU General Public License for more details. *
18 * You should have received a copy of the GNU General Public License *
19 * along with this program; if not, write to the *
20 * Free Software Foundation, Inc., *
21 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
22 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
28 #define M64P_PLUGIN_PROTOTYPES 1
29 #include "m64p_types.h"
30 #include "m64p_plugin.h"
/* Number of int16_t coefficients in one subblock. The lookup tables and the
 * per-subblock scratch buffers in this file are all sized with it. */
34 #define SUBBLOCK_SIZE 64
/* Callback types used to parameterize the decoder:
 *  - tile_line_emitter_t receives pointers to luma and chroma samples plus a
 *    destination address;
 *  - subblock_transform_t maps a source subblock to a destination subblock. */
36 typedef void (*tile_line_emitter_t)(const int16_t *y, const int16_t *u, uint32_t address);
37 typedef void (*subblock_transform_t)(int16_t *dst, const int16_t *src);
/* NOTE(review): the next line reads like the body of a block comment whose
 * opening and closing delimiter lines are not present here. */
40 * FIXME: these functions deserve their own module
/* RDRAM accessors (defined at the end of the file). */
42 static void rdram_read_many_u16(uint16_t *dst, uint32_t address, unsigned int count);
43 static void rdram_write_many_u16(const uint16_t *src, uint32_t address, unsigned int count);
44 static uint32_t rdram_read_u32(uint32_t address);
45 static void rdram_write_many_u32(const uint32_t *src, uint32_t address, unsigned int count);
47 /* standard jpeg ucode decoder */
48 static void jpeg_decode_std(const char *const version,
49 const subblock_transform_t transform_luma,
50 const subblock_transform_t transform_chroma,
51 const tile_line_emitter_t emit_line);
53 /* helper functions */
54 static uint8_t clamp_u8(int16_t x);
55 static int16_t clamp_s12(int16_t x);
56 static uint16_t clamp_RGBA_component(int16_t x);
58 /* pixel conversion & formatting */
59 static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v);
60 static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v);
62 /* tile line emitters */
63 static void EmitYUVTileLine(const int16_t *y, const int16_t *u, uint32_t address);
64 static void EmitRGBATileLine(const int16_t *y, const int16_t *u, uint32_t address);
66 /* macroblocks operations */
67 static void decode_macroblock_ob(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable);
/* NOTE(review): the definition of decode_macroblock_std uses a `macroblock`
 * variable that is not among the parameters visible in this prototype; a
 * parameter line appears to be missing between transform_chroma and
 * subblock_count - confirm. */
68 static void decode_macroblock_std(const subblock_transform_t transform_luma,
69 const subblock_transform_t transform_chroma,
71 unsigned int subblock_count,
72 const int16_t qtables[3][SUBBLOCK_SIZE]);
73 static void EmitTilesMode0(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
74 static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
76 /* subblocks operations */
77 static void TransposeSubBlock(int16_t *dst, const int16_t *src);
78 static void ZigZagSubBlock(int16_t *dst, const int16_t *src);
79 static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table);
80 static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift);
81 static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale);
82 static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift);
83 static void InverseDCT1D(const float *const x, float *dst, unsigned int stride);
84 static void InverseDCTSubBlock(int16_t *dst, const int16_t *src);
85 static void RescaleYSubBlock(int16_t *dst, const int16_t *src);
86 static void RescaleUVSubBlock(int16_t *dst, const int16_t *src);
88 /* transposed dequantization table */
89 static const int16_t DEFAULT_QTABLE[SUBBLOCK_SIZE] = {
90 16, 12, 14, 14, 18, 24, 49, 72,
91 11, 12, 13, 17, 22, 35, 64, 92,
92 10, 14, 16, 22, 37, 55, 78, 95,
93 16, 19, 24, 29, 56, 64, 87, 98,
94 24, 26, 40, 51, 68, 81, 103, 112,
95 40, 58, 57, 87, 109, 104, 121, 100,
96 51, 60, 69, 80, 103, 113, 120, 103,
97 61, 55, 56, 62, 77, 92, 101, 99
100 /* zig-zag indices */
101 static const unsigned int ZIGZAG_TABLE[SUBBLOCK_SIZE] = {
102 0, 1, 5, 6, 14, 15, 27, 28,
103 2, 4, 7, 13, 16, 26, 29, 42,
104 3, 8, 12, 17, 25, 30, 41, 43,
105 9, 11, 18, 24, 31, 40, 44, 53,
106 10, 19, 23, 32, 39, 45, 52, 54,
107 20, 22, 33, 38, 46, 51, 55, 60,
108 21, 34, 37, 47, 50, 56, 59, 61,
109 35, 36, 48, 49, 57, 58, 62, 63
112 /* transposition indices */
113 static const unsigned int TRANSPOSE_TABLE[SUBBLOCK_SIZE] = {
114 0, 8, 16, 24, 32, 40, 48, 56,
115 1, 9, 17, 25, 33, 41, 49, 57,
116 2, 10, 18, 26, 34, 42, 50, 58,
117 3, 11, 19, 27, 35, 43, 51, 59,
118 4, 12, 20, 28, 36, 44, 52, 60,
119 5, 13, 21, 29, 37, 45, 53, 61,
120 6, 14, 22, 30, 38, 46, 54, 62,
121 7, 15, 23, 31, 39, 47, 55, 63
/* IDCT related constants
 * Cn = alpha * cos(n * PI / 16) (alpha is chosen such as C4 = 1)
 * IDCT_K holds the pre-combined sums of Cn terms consumed by InverseDCT1D;
 * the combination each entry stands for is given next to it. */
static const float IDCT_C3 = 1.175875602f;
static const float IDCT_C6 = 0.541196100f;
static const float IDCT_K[10] = {
     0.765366865f,   /*  C2-C6         */
    -1.847759065f,   /* -C2-C6         */
    -0.390180644f,   /*  C5-C3         */
    -1.961570561f,   /* -C5-C3         */
     1.501321110f,   /*  C1+C3-C5-C7   */
     2.053119869f,   /*  C1+C3-C5+C7   */
     3.072711027f,   /*  C1+C3+C5-C7   */
     0.298631336f,   /* -C1+C3+C5-C7   */
    -0.899976223f,   /*  C7-C3         */
    -2.562915448f    /* -C1-C3         */
};
144 /* global functions */
146 /***************************************************************************
147 * JPEG decoding ucode found in Japanese exclusive version of Pokemon Stadium.
148 **************************************************************************/
149 void jpeg_decode_PS0(void)
151 jpeg_decode_std("PS0", RescaleYSubBlock, RescaleUVSubBlock, EmitYUVTileLine);
154 /***************************************************************************
155 * JPEG decoding ucode found in Ocarina of Time, Pokemon Stadium 1 and
157 **************************************************************************/
158 void jpeg_decode_PS(void)
160 jpeg_decode_std("PS", NULL, NULL, EmitRGBATileLine);
163 /***************************************************************************
164 * JPEG decoding ucode found in Ogre Battle and Bottom of the 9th.
165 **************************************************************************/
/* Task fields are reused as decoder parameters: data_ptr is the macroblock
 * buffer address, data_size the macroblock count and yield_data_size the
 * quantization scale (qscale).
 * Each macroblock is read from RDRAM as 6 subblocks, decoded with
 * decode_macroblock_ob, then emitted through EmitTilesMode2 /
 * EmitYUVTileLine at the address it was read from.
 * NOTE(review): several short lines are missing from this block as shown:
 * braces, the declarations of mb / y_dc / u_dc / v_dc, the remaining
 * DebugMessage arguments and the conditions guarding the two qtable calls. */
166 void jpeg_decode_OB(void)
168 int16_t qtable[SUBBLOCK_SIZE];
175 const OSTask_t *const task = get_task();
177 uint32_t address = task->data_ptr;
178 const unsigned int macroblock_count = task->data_size;
179 const int qscale = task->yield_data_size;
181 DebugMessage(M64MSG_VERBOSE, "jpeg_decode_OB: *buffer=%x, #MB=%d, qscale=%d",
/* The working table is the default table either multiplied by qscale or
 * right-shifted by -qscale. The selecting conditions are not visible here
 * (presumably the sign of qscale - confirm). */
188 ScaleSubBlock(qtable, DEFAULT_QTABLE, qscale);
190 RShiftSubBlock(qtable, DEFAULT_QTABLE, -qscale);
193 for (mb = 0; mb < macroblock_count; ++mb) {
194 int16_t macroblock[6 * SUBBLOCK_SIZE];
196 rdram_read_many_u16((uint16_t *)macroblock, address, 6 * SUBBLOCK_SIZE);
/* A NULL qtable is passed when qscale is 0. */
197 decode_macroblock_ob(macroblock, &y_dc, &u_dc, &v_dc, (qscale != 0) ? qtable : NULL);
198 EmitTilesMode2(EmitYUVTileLine, macroblock, address);
/* Advance by one macroblock: 6 subblocks of 2-byte coefficients. */
200 address += (2 * 6 * SUBBLOCK_SIZE);
205 /* local functions */
/* Standard JPEG ucode decoder shared by jpeg_decode_PS0 and jpeg_decode_PS.
 *   version          - tag inserted into log messages
 *   transform_luma   - optional transform forwarded to decode_macroblock_std
 *   transform_chroma - optional transform forwarded to decode_macroblock_std
 *   emit_line        - tile line emitter forwarded to EmitTilesMode0/2
 * The task's data_ptr points at six 32-bit words in RDRAM: buffer address,
 * macroblock count, mode, and the addresses of the Y, U and V qtables.
 * NOTE(review): short lines are missing from this block as shown: braces,
 * the declarations of mb / address / mode, the remaining DebugMessage
 * arguments, whatever follows the two warnings (presumably early returns)
 * and the condition choosing between EmitTilesMode0 and EmitTilesMode2. */
206 static void jpeg_decode_std(const char *const version,
207 const subblock_transform_t transform_luma,
208 const subblock_transform_t transform_chroma,
209 const tile_line_emitter_t emit_line)
211 int16_t qtables[3][SUBBLOCK_SIZE];
214 uint32_t macroblock_count;
216 uint32_t qtableY_ptr;
217 uint32_t qtableU_ptr;
218 uint32_t qtableV_ptr;
219 unsigned int subblock_count;
220 unsigned int macroblock_size;
221 /* macroblock contains at most 6 subblocks */
222 int16_t macroblock[6 * SUBBLOCK_SIZE];
223 const OSTask_t *const task = get_task();
/* Bit 0 of the task flags triggers the "yielding not implemented" warning. */
225 if (task->flags & 0x1) {
226 DebugMessage(M64MSG_WARNING, "jpeg_decode_%s: task yielding not implemented", version);
/* Read the six-word parameter block. */
230 address = rdram_read_u32(task->data_ptr);
231 macroblock_count = rdram_read_u32(task->data_ptr + 4);
232 mode = rdram_read_u32(task->data_ptr + 8);
233 qtableY_ptr = rdram_read_u32(task->data_ptr + 12);
234 qtableU_ptr = rdram_read_u32(task->data_ptr + 16);
235 qtableV_ptr = rdram_read_u32(task->data_ptr + 20);
237 DebugMessage(M64MSG_VERBOSE, "jpeg_decode_%s: *buffer=%x, #MB=%d, mode=%d, *Qy=%x, *Qu=%x, *Qv=%x",
/* Only modes 0 and 2 are accepted. */
246 if (mode != 0 && mode != 2) {
247 DebugMessage(M64MSG_WARNING, "jpeg_decode_%s: invalid mode %d", version, mode);
/* mode 0 -> 4 subblocks per macroblock, mode 2 -> 6 subblocks. */
251 subblock_count = mode + 4;
252 macroblock_size = subblock_count * SUBBLOCK_SIZE;
254 rdram_read_many_u16((uint16_t *)qtables[0], qtableY_ptr, SUBBLOCK_SIZE);
255 rdram_read_many_u16((uint16_t *)qtables[1], qtableU_ptr, SUBBLOCK_SIZE);
256 rdram_read_many_u16((uint16_t *)qtables[2], qtableV_ptr, SUBBLOCK_SIZE);
258 for (mb = 0; mb < macroblock_count; ++mb) {
259 rdram_read_many_u16((uint16_t *)macroblock, address, macroblock_size);
260 decode_macroblock_std(transform_luma, transform_chroma,
261 macroblock, subblock_count, (const int16_t (*)[SUBBLOCK_SIZE])qtables);
/* Presumably only one of the two emitters below runs per macroblock,
 * selected by mode - the condition is not visible here; confirm. */
264 EmitTilesMode0(emit_line, macroblock, address);
266 EmitTilesMode2(emit_line, macroblock, address);
/* Advance by one macroblock of 2-byte coefficients. */
268 address += 2 * macroblock_size;
/* Saturate a signed 16-bit value to the unsigned 8-bit range.
 * Returns 0 for negative inputs, 255 for inputs above 255, and x otherwise.
 *
 * Written with plain comparisons: the previous bit-trick form depended on
 * implementation-defined right shifts of negative values and returned 1
 * instead of 0 for x == -32768. */
static uint8_t clamp_u8(int16_t x)
{
    if (x < 0)
        return 0;
    if (x > 0xff)
        return 0xff;
    return (uint8_t)x;
}
/* NOTE(review): only the signature of this helper is visible; its body lines
 * are missing. Judging by the name it presumably saturates x to a signed
 * 12-bit range - confirm against the full source. Used by RescaleYSubBlock
 * and RescaleUVSubBlock. */
277 static int16_t clamp_s12(int16_t x)
/* NOTE(review): only the signature of this helper is visible; its body lines
 * are missing. GetRGBA feeds it each colour component and packs the results
 * with shifts, so the exact output range matters - confirm against the full
 * source. */
286 static uint16_t clamp_RGBA_component(int16_t x)
/* Pack two luma samples and one chroma pair into a 32-bit word.
 * Every component is saturated with clamp_u8; from most to least significant
 * byte the result holds U, Y1, V, Y2. */
static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v)
{
    return (uint32_t)clamp_u8(u)  << 24 |
           (uint32_t)clamp_u8(y1) << 16 |
           (uint32_t)clamp_u8(v)  << 8  |
           (uint32_t)clamp_u8(y2);
}
/* Convert one Y/U/V sample triple into a packed 16-bit pixel.
 * Luma is offset by 2048 before the colour matrix is applied; each resulting
 * component goes through clamp_RGBA_component and is then shifted into
 * place (red highest, then green, then blue). The lowest bit is always set
 * (presumably the alpha bit of a 5/5/5/1 pixel - confirm against
 * clamp_RGBA_component's output range). */
static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v)
{
    const float fY = (float)y + 2048.0f;
    const float fU = (float)u;
    const float fV = (float)v;

    const uint16_t r = clamp_RGBA_component((int16_t)(fY + 1.4025 * fV));
    const uint16_t g = clamp_RGBA_component((int16_t)(fY - 0.3443 * fU - 0.7144 * fV));
    const uint16_t b = clamp_RGBA_component((int16_t)(fY + 1.7729 * fU));

    return (r << 4) | (g >> 1) | (b >> 6) | 1;
}
316 static void EmitYUVTileLine(const int16_t *y, const int16_t *u, uint32_t address)
320 const int16_t *const v = u + SUBBLOCK_SIZE;
321 const int16_t *const y2 = y + SUBBLOCK_SIZE;
323 uyvy[0] = GetUYVY(y[0], y[1], u[0], v[0]);
324 uyvy[1] = GetUYVY(y[2], y[3], u[1], v[1]);
325 uyvy[2] = GetUYVY(y[4], y[5], u[2], v[2]);
326 uyvy[3] = GetUYVY(y[6], y[7], u[3], v[3]);
327 uyvy[4] = GetUYVY(y2[0], y2[1], u[4], v[4]);
328 uyvy[5] = GetUYVY(y2[2], y2[3], u[5], v[5]);
329 uyvy[6] = GetUYVY(y2[4], y2[5], u[6], v[6]);
330 uyvy[7] = GetUYVY(y2[6], y2[7], u[7], v[7]);
332 rdram_write_many_u32(uyvy, address, 8);
335 static void EmitRGBATileLine(const int16_t *y, const int16_t *u, uint32_t address)
339 const int16_t *const v = u + SUBBLOCK_SIZE;
340 const int16_t *const y2 = y + SUBBLOCK_SIZE;
342 rgba[0] = GetRGBA(y[0], u[0], v[0]);
343 rgba[1] = GetRGBA(y[1], u[0], v[0]);
344 rgba[2] = GetRGBA(y[2], u[1], v[1]);
345 rgba[3] = GetRGBA(y[3], u[1], v[1]);
346 rgba[4] = GetRGBA(y[4], u[2], v[2]);
347 rgba[5] = GetRGBA(y[5], u[2], v[2]);
348 rgba[6] = GetRGBA(y[6], u[3], v[3]);
349 rgba[7] = GetRGBA(y[7], u[3], v[3]);
350 rgba[8] = GetRGBA(y2[0], u[4], v[4]);
351 rgba[9] = GetRGBA(y2[1], u[4], v[4]);
352 rgba[10] = GetRGBA(y2[2], u[5], v[5]);
353 rgba[11] = GetRGBA(y2[3], u[5], v[5]);
354 rgba[12] = GetRGBA(y2[4], u[6], v[6]);
355 rgba[13] = GetRGBA(y2[5], u[6], v[6]);
356 rgba[14] = GetRGBA(y2[6], u[7], v[7]);
357 rgba[15] = GetRGBA(y2[7], u[7], v[7]);
359 rdram_write_many_u16(rgba, address, 16);
/* Emit the 8 tile lines of a mode-0 macroblock through emit_line.
 * Luma starts at offset 0 of the macroblock, chroma at 2 * SUBBLOCK_SIZE.
 * NOTE(review): as shown, the loop body never advances y_offset, u_offset or
 * address - those short statements (and the braces / declaration of i) appear
 * to be missing. The "¯oblock" text is a mis-decoded "&macroblock"
 * (address-of operator followed by "macr..."); restore it when repairing the
 * file. */
362 static void EmitTilesMode0(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
366 unsigned int y_offset = 0;
367 unsigned int u_offset = 2 * SUBBLOCK_SIZE;
369 for (i = 0; i < 8; ++i) {
370 emit_line(¯oblock[y_offset], ¯oblock[u_offset], address);
/* Emit the tile lines of a mode-2 macroblock through emit_line.
 * Luma starts at offset 0 of the macroblock, chroma at 4 * SUBBLOCK_SIZE.
 * Each iteration emits two luma rows (y_offset and y_offset + 8, written 32
 * bytes apart) that share the same chroma row.
 * NOTE(review): the statements advancing u_offset and address, the braces
 * and the declaration of i appear to be missing. The "¯oblock" text is a
 * mis-decoded "&macroblock"; restore it when repairing the file. */
378 static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
382 unsigned int y_offset = 0;
383 unsigned int u_offset = 4 * SUBBLOCK_SIZE;
385 for (i = 0; i < 8; ++i) {
386 emit_line(¯oblock[y_offset], ¯oblock[u_offset], address);
387 emit_line(¯oblock[y_offset + 8], ¯oblock[u_offset], address + 32);
/* Step two rows (16 elements); after the 4th iteration additionally skip
 * one whole subblock. */
389 y_offset += (i == 3) ? SUBBLOCK_SIZE + 16 : 16;
/* Decode the 6 subblocks of one macroblock in place (Ogre Battle variant).
 * For each subblock: the first coefficient is read as a DC value, element 0
 * is replaced by the low 16 bits of one of the running *y_dc / *u_dc / *v_dc
 * accumulators, the block is zig-zag reordered into a scratch buffer,
 * multiplied by qtable, transposed back into the macroblock and run through
 * the inverse DCT.
 * NOTE(review): short lines are missing from this block as shown: braces,
 * the declaration of sb, the code that updates the DC accumulators and
 * selects which one applies to a given subblock, and presumably a guard
 * around MultSubBlocks (jpeg_decode_OB passes qtable == NULL when qscale is
 * 0) - confirm against the full source. */
395 static void decode_macroblock_ob(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable)
399 for (sb = 0; sb < 6; ++sb) {
400 int16_t tmp_sb[SUBBLOCK_SIZE];
403 int32_t dc = (int32_t)macroblock[0];
410 macroblock[0] = *y_dc & 0xffff;
414 macroblock[0] = *u_dc & 0xffff;
418 macroblock[0] = *v_dc & 0xffff;
422 ZigZagSubBlock(tmp_sb, macroblock);
424 MultSubBlocks(tmp_sb, tmp_sb, qtable, 0);
425 TransposeSubBlock(macroblock, tmp_sb);
/* In-place IDCT: source and destination are the same subblock. */
426 InverseDCTSubBlock(macroblock, macroblock);
428 macroblock += SUBBLOCK_SIZE;
/* Decode subblock_count subblocks of one macroblock in place (standard
 * variant). The last two subblocks are treated as chroma. Each subblock is
 * multiplied by qtables[q] (result shifted left by 4), zig-zag reordered
 * into a scratch buffer, run through the inverse DCT, and finally passed to
 * the matching optional transform when that callback is non-NULL.
 * NOTE(review): short lines are missing from this block as shown: the
 * `macroblock` parameter, braces, the declarations of sb and q, the
 * statement executed when isChromaSubBlock is true (presumably advancing q
 * to the next table) and the `else` separating the chroma and luma transform
 * branches - confirm against the full source. */
432 static void decode_macroblock_std(const subblock_transform_t transform_luma,
433 const subblock_transform_t transform_chroma,
435 unsigned int subblock_count,
436 const int16_t qtables[3][SUBBLOCK_SIZE])
441 for (sb = 0; sb < subblock_count; ++sb) {
442 int16_t tmp_sb[SUBBLOCK_SIZE];
/* True for the final two subblocks of the macroblock. */
443 const int isChromaSubBlock = (subblock_count - sb <= 2);
445 if (isChromaSubBlock)
448 MultSubBlocks(macroblock, macroblock, qtables[q], 4);
449 ZigZagSubBlock(tmp_sb, macroblock);
450 InverseDCTSubBlock(macroblock, tmp_sb);
452 if (isChromaSubBlock) {
453 if (transform_chroma != NULL)
454 transform_chroma(macroblock, macroblock);
456 if (transform_luma != NULL)
457 transform_luma(macroblock, macroblock);
460 macroblock += SUBBLOCK_SIZE;
464 static void TransposeSubBlock(int16_t *dst, const int16_t *src)
466 ReorderSubBlock(dst, src, TRANSPOSE_TABLE);
469 static void ZigZagSubBlock(int16_t *dst, const int16_t *src)
471 ReorderSubBlock(dst, src, ZIGZAG_TABLE);
474 static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table)
478 /* source and destination sublocks cannot overlap */
479 assert(abs(dst - src) > SUBBLOCK_SIZE);
481 for (i = 0; i < SUBBLOCK_SIZE; ++i)
482 dst[i] = src[table[i]];
485 static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift)
489 for (i = 0; i < SUBBLOCK_SIZE; ++i) {
490 int32_t v = src1[i] * src2[i];
491 dst[i] = clamp_s16(v) << shift;
495 static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale)
499 for (i = 0; i < SUBBLOCK_SIZE; ++i) {
500 int32_t v = src[i] * scale;
501 dst[i] = clamp_s16(v);
505 static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift)
509 for (i = 0; i < SUBBLOCK_SIZE; ++i)
510 dst[i] = src[i] >> shift;
513 /***************************************************************************
514 * Fast 2D IDCT using separable formulation and normalization
515 * Computations use single precision floats
516 * Implementation based on Wikipedia :
517 * http://fr.wikipedia.org/wiki/Transform%C3%A9e_en_cosinus_discr%C3%A8te
518 **************************************************************************/
/* One 8-point 1D inverse DCT.
 *   x      - 8 input coefficients
 *   dst    - output; 8 values are produced (InverseDCTSubBlock passes a
 *            stride of 8 to obtain a transposed result, 1 otherwise)
 *   stride - spacing between consecutive outputs
 * Even-indexed inputs feed the f[] terms, odd-indexed inputs the e[] terms;
 * the outputs are sums and differences of the two.
 * NOTE(review): short lines are missing from this block as shown: braces,
 * the declarations of e[] and f[], the assignments of f[0] and f[1], and
 * the statements stepping dst between the eight stores (presumably
 * `dst += stride`) - confirm against the full source. */
519 static void InverseDCT1D(const float *const x, float *dst, unsigned int stride)
523 float x26, x1357, x15, x37, x17, x35;
/* Shared partial products of the odd and even inputs. */
525 x15 = IDCT_K[2] * (x[1] + x[5]);
526 x37 = IDCT_K[3] * (x[3] + x[7]);
527 x17 = IDCT_K[8] * (x[1] + x[7]);
528 x35 = IDCT_K[9] * (x[3] + x[5]);
529 x1357 = IDCT_C3 * (x[1] + x[3] + x[5] + x[7]);
530 x26 = IDCT_C6 * (x[2] + x[6]);
/* Even part (x[2], x[6]). */
534 f[2] = x26 + IDCT_K[0] * x[2];
535 f[3] = x26 + IDCT_K[1] * x[6];
/* Odd part (x[1], x[3], x[5], x[7]). */
537 e[0] = x1357 + x15 + IDCT_K[4] * x[1] + x17;
538 e[1] = x1357 + x37 + IDCT_K[6] * x[3] + x35;
539 e[2] = x1357 + x15 + IDCT_K[5] * x[5] + x35;
540 e[3] = x1357 + x37 + IDCT_K[7] * x[7] + x17;
/* Eight outputs: the last four mirror the first four with the odd part
 * subtracted instead of added. */
542 *dst = f[0] + f[2] + e[0];
544 *dst = f[1] + f[3] + e[1];
546 *dst = f[1] - f[3] + e[2];
548 *dst = f[0] - f[2] + e[3];
550 *dst = f[0] - f[2] - e[3];
552 *dst = f[1] - f[3] - e[2];
554 *dst = f[1] + f[3] - e[1];
556 *dst = f[0] + f[2] - e[0];
559 static void InverseDCTSubBlock(int16_t *dst, const int16_t *src)
562 float block[SUBBLOCK_SIZE];
565 /* idct 1d on rows (+transposition) */
566 for (i = 0; i < 8; ++i) {
567 for (j = 0; j < 8; ++j)
568 x[j] = (float)src[i * 8 + j];
570 InverseDCT1D(x, &block[i], 8);
573 /* idct 1d on columns (thanks to previous transposition) */
574 for (i = 0; i < 8; ++i) {
575 InverseDCT1D(&block[i * 8], x, 1);
577 /* C4 = 1 normalization implies a division by 8 */
578 for (j = 0; j < 8; ++j)
579 dst[i + j * 8] = (int16_t)x[j] >> 3;
583 static void RescaleYSubBlock(int16_t *dst, const int16_t *src)
587 for (i = 0; i < SUBBLOCK_SIZE; ++i)
588 dst[i] = (((uint32_t)(clamp_s12(src[i]) + 0x800) * 0xdb0) >> 16) + 0x10;
591 static void RescaleUVSubBlock(int16_t *dst, const int16_t *src)
595 for (i = 0; i < SUBBLOCK_SIZE; ++i)
596 dst[i] = (((int)clamp_s12(src[i]) * 0xe00) >> 16) + 0x80;
/* Mask applied to every byte address before indexing rsp.RDRAM in the
 * accessors below; 0x7fffff keeps the low 23 bits. */
601 /* FIXME: assume presence of expansion pack */
602 #define MEMMASK 0x7fffff
/* Read `count` 16-bit values from RDRAM starting at `address` into dst.
 * Each value is assembled from two consecutive bytes; byte addresses are
 * XORed with S8 and masked with MEMMASK before indexing rsp.RDRAM.
 * NOTE(review): short lines are missing from this block as shown: braces,
 * the loop over count, the shift between the two byte reads (presumably
 * `s <<= 8`, first byte most significant, mirroring rdram_write_many_u16)
 * and the store into dst - confirm against the full source. */
604 static void rdram_read_many_u16(uint16_t *dst, uint32_t address, unsigned int count)
607 uint16_t s = rsp.RDRAM[((address++)^S8) & MEMMASK];
609 s |= rsp.RDRAM[((address++)^S8) & MEMMASK];
/* Write 16-bit values from src to RDRAM starting at `address`.
 * Each value is stored as two bytes, high byte first; byte addresses are
 * XORed with S8 and masked with MEMMASK before indexing rsp.RDRAM.
 * NOTE(review): the braces and the loop over `count` are missing from this
 * block as shown - confirm against the full source. */
617 static void rdram_write_many_u16(const uint16_t *src, uint32_t address, unsigned int count)
620 rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 8);
/* Low byte; also advances src to the next value. */
621 rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*(src++) & 0xff);
/* Read one 32-bit value from RDRAM at `address`, assembled from four
 * consecutive bytes; byte addresses are XORed with S8 and masked with
 * MEMMASK before indexing rsp.RDRAM.
 * NOTE(review): short lines are missing from this block as shown: braces,
 * the shifts between the byte reads (presumably `r <<= 8`, first byte most
 * significant, mirroring rdram_write_many_u32) and the return statement -
 * confirm against the full source. */
627 static uint32_t rdram_read_u32(uint32_t address)
629 uint32_t r = rsp.RDRAM[((address++) ^ S8) & MEMMASK];
631 r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK];
633 r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK];
635 r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK];
/* Write 32-bit values from src to RDRAM starting at `address`.
 * Each value is stored as four bytes, most significant first; byte
 * addresses are XORed with S8 and masked with MEMMASK before indexing
 * rsp.RDRAM.
 * NOTE(review): the braces and the loop over `count` are missing from this
 * block as shown - confirm against the full source. */
640 static void rdram_write_many_u32(const uint32_t *src, uint32_t address, unsigned int count)
643 rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 24);
644 rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 16);
645 rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 8);
/* Lowest byte; also advances src to the next value. */
646 rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*(src++) & 0xff);