1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 * Mupen64plus-rsp-hle - jpeg.c *
3 * Mupen64Plus homepage: http://code.google.com/p/mupen64plus/ *
4 * Copyright (C) 2012 Bobby Smiles *
5 * Copyright (C) 2009 Richard Goedeken *
6 * Copyright (C) 2002 Hacktarux *
8 * This program is free software; you can redistribute it and/or modify *
9 * it under the terms of the GNU General Public License as published by *
10 * the Free Software Foundation; either version 2 of the License, or *
11 * (at your option) any later version. *
13 * This program is distributed in the hope that it will be useful, *
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
16 * GNU General Public License for more details. *
18 * You should have received a copy of the GNU General Public License *
19 * along with this program; if not, write to the *
20 * Free Software Foundation, Inc., *
21 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
22 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
28 #define M64P_PLUGIN_PROTOTYPES 1
29 #include "m64p_types.h"
30 #include "m64p_plugin.h"
/* Number of int16_t coefficients in one sub-block (the IDCT code in this file
 * walks it as 8 rows of 8). */
33 #define SUBBLOCK_SIZE 64
/* Callback that emits one line of a tile: it receives pointers into the luma
 * (y) and chroma (u) samples of a macroblock and the RDRAM destination address. */
35 typedef void (*tile_line_emitter_t)(const int16_t *y, const int16_t *u, uint32_t address);
/* Transform applied to a whole sub-block; decode_macroblock_std calls it with
 * dst == src. */
36 typedef void (*subblock_transform_t)(int16_t* dst, const int16_t* src);
/* Forward declarations of the file-local helpers defined further down. */
38 /* rdram operations */
39 // FIXME: these functions deserve their own module
40 static void rdram_read_many_u16(uint16_t *dst, uint32_t address, unsigned int count);
41 static void rdram_write_many_u16(const uint16_t *src, uint32_t address, unsigned int count);
42 static uint32_t rdram_read_u32(uint32_t address);
43 static void rdram_write_many_u32(const uint32_t *src, uint32_t address, unsigned int count);
45 /* standard jpeg ucode decoder */
46 static void jpeg_decode_std(const char * const version,
47 const subblock_transform_t transform_luma,
48 const subblock_transform_t transform_chroma,
49 const tile_line_emitter_t emit_line);
51 /* helper functions */
52 static uint8_t clamp_u8(int16_t x);
53 static int16_t clamp_s12(int16_t x);
54 static int16_t clamp_s16(int32_t x);
55 static uint16_t clamp_RGBA_component(int16_t x);
57 /* pixel conversion & formatting */
58 static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v);
59 static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v);
61 /* tile line emitters */
62 static void EmitYUVTileLine(const int16_t *y, const int16_t *u, uint32_t address);
63 static void EmitRGBATileLine(const int16_t *y, const int16_t *u, uint32_t address);
65 /* macroblocks operations */
66 static void decode_macroblock_ob(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable);
67 static void decode_macroblock_std(
68 const subblock_transform_t transform_luma,
69 const subblock_transform_t transform_chroma,
70 int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE]);
71 static void EmitTilesMode0(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
72 static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
74 /* subblocks operations */
75 static void TransposeSubBlock(int16_t *dst, const int16_t *src);
76 static void ZigZagSubBlock(int16_t *dst, const int16_t *src);
77 static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table);
78 static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift);
79 static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale);
80 static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift);
81 static void InverseDCT1D(const float * const x, float *dst, unsigned int stride);
82 static void InverseDCTSubBlock(int16_t *dst, const int16_t *src);
83 static void RescaleYSubBlock(int16_t *dst, const int16_t *src);
84 static void RescaleUVSubBlock(int16_t *dst, const int16_t *src);
86 /* transposed dequantization table */
/* One factor per sub-block coefficient (SUBBLOCK_SIZE entries).
 * jpeg_decode_OB builds its working table from these values through
 * ScaleSubBlock() or RShiftSubBlock().
 * NOTE(review): the brace lines of this initializer are absent from this listing. */
87 static const int16_t DEFAULT_QTABLE[SUBBLOCK_SIZE] =
89 16, 12, 14, 14, 18, 24, 49, 72,
90 11, 12, 13, 17, 22, 35, 64, 92,
91 10, 14, 16, 22, 37, 55, 78, 95,
92 16, 19, 24, 29, 56, 64, 87, 98,
93 24, 26, 40, 51, 68, 81, 103, 112,
94 40, 58, 57, 87, 109, 104, 121, 100,
95 51, 60, 69, 80, 103, 113, 120, 103,
96 61, 55, 56, 62, 77, 92, 101, 99
/* Source indices used by ZigZagSubBlock(): through ReorderSubBlock(), output
 * element i is read from input element ZIGZAG_TABLE[i].
 * NOTE(review): the brace lines of this initializer are absent from this listing. */
100 static const unsigned int ZIGZAG_TABLE[SUBBLOCK_SIZE] =
102 0, 1, 5, 6, 14, 15, 27, 28,
103 2, 4, 7, 13, 16, 26, 29, 42,
104 3, 8, 12, 17, 25, 30, 41, 43,
105 9, 11, 18, 24, 31, 40, 44, 53,
106 10, 19, 23, 32, 39, 45, 52, 54,
107 20, 22, 33, 38, 46, 51, 55, 60,
108 21, 34, 37, 47, 50, 56, 59, 61,
109 35, 36, 48, 49, 57, 58, 62, 63
112 /* transposition indices */
/* Source indices used by TransposeSubBlock(): entry i is (i % 8) * 8 + i / 8,
 * so output element (row, col) is read from input element (col, row).
 * NOTE(review): the brace lines of this initializer are absent from this listing. */
113 static const unsigned int TRANSPOSE_TABLE[SUBBLOCK_SIZE] =
115 0, 8, 16, 24, 32, 40, 48, 56,
116 1, 9, 17, 25, 33, 41, 49, 57,
117 2, 10, 18, 26, 34, 42, 50, 58,
118 3, 11, 19, 27, 35, 43, 51, 59,
119 4, 12, 20, 28, 36, 44, 52, 60,
120 5, 13, 21, 29, 37, 45, 53, 61,
121 6, 14, 22, 30, 38, 46, 54, 62,
122 7, 15, 23, 31, 39, 47, 55, 63
127 /* IDCT related constants
128 * Cn = alpha * cos(n * PI / 16) (alpha is chosen such that C4 = 1) */
/* IDCT_C3 and IDCT_C6 are used directly by InverseDCT1D(); IDCT_K holds the
 * precomputed sums/differences of the Cn terms listed next to each entry.
 * NOTE(review): the brace lines of the IDCT_K initializer are absent from this listing. */
129 static const float IDCT_C3 = 1.175875602f;
130 static const float IDCT_C6 = 0.541196100f;
131 static const float IDCT_K[10] =
133 0.765366865f, /* C2-C6 */
134 -1.847759065f, /* -C2-C6 */
135 -0.390180644f, /* C5-C3 */
136 -1.961570561f, /* -C5-C3 */
137 1.501321110f, /* C1+C3-C5-C7 */
138 2.053119869f, /* C1+C3-C5+C7 */
139 3.072711027f, /* C1+C3+C5-C7 */
140 0.298631336f, /* -C1+C3+C5-C7 */
141 -0.899976223f, /* C7-C3 */
142 -2.562915448f /* -C1-C3 */
146 /* global functions */
148 /***************************************************************************
149 * JPEG decoding ucode found in Japanese exclusive version of Pokemon Stadium.
150 **************************************************************************/
151 void jpeg_decode_PS0()
153 jpeg_decode_std("PS0", RescaleYSubBlock, RescaleUVSubBlock, EmitYUVTileLine);
156 /***************************************************************************
157 * JPEG decoding ucode found in Ocarina of Time, Pokemon Stadium 1 and
159 **************************************************************************/
160 void jpeg_decode_PS()
162 jpeg_decode_std("PS", NULL, NULL, EmitRGBATileLine);
165 /***************************************************************************
166 * JPEG decoding ucode found in Ogre Battle and Bottom of the 9th.
167 **************************************************************************/
/* Takes its parameters from the current task: data_ptr is the RDRAM buffer
 * address, data_size the number of macroblocks and yield_data_size the
 * quantization scale. Each macroblock (6 sub-blocks) is read from RDRAM,
 * decoded by decode_macroblock_ob() and emitted with EmitTilesMode2() /
 * EmitYUVTileLine() at the same address it was read from; the address then
 * advances by 2*6*SUBBLOCK_SIZE bytes.
 *
 * NOTE(review): several lines of this function are absent from this listing:
 * the declarations of mb, y_dc, u_dc and v_dc (and their initial values), the
 * remaining DebugMessage arguments, and the conditions selecting between the
 * ScaleSubBlock() and RShiftSubBlock() calls. Presumably a positive qscale
 * scales DEFAULT_QTABLE up and a negative one shifts it right by -qscale;
 * confirm against the complete file before changing this code. */
168 void jpeg_decode_OB()
170 int16_t qtable[SUBBLOCK_SIZE];
177 const OSTask_t * const task = get_task();
179 uint32_t address = task->data_ptr;
180 const unsigned int macroblock_count = task->data_size;
181 const int qscale = task->yield_data_size;
183 DebugMessage(M64MSG_VERBOSE, "jpeg_decode_OB: *buffer=%x, #MB=%d, qscale=%d",
192 ScaleSubBlock(qtable, DEFAULT_QTABLE, qscale);
196 RShiftSubBlock(qtable, DEFAULT_QTABLE, -qscale);
200 for (mb = 0; mb < macroblock_count; ++mb)
202 int16_t macroblock[6*SUBBLOCK_SIZE];
204 rdram_read_many_u16((uint16_t*)macroblock, address, 6*SUBBLOCK_SIZE);
/* a qscale of 0 disables dequantization: no table is handed to the decoder */
205 decode_macroblock_ob(macroblock, &y_dc, &u_dc, &v_dc, (qscale != 0) ? qtable : NULL);
206 EmitTilesMode2(EmitYUVTileLine, macroblock, address);
208 address += (2*6*SUBBLOCK_SIZE);
213 /* local functions */
/* Standard JPEG ucode decoder shared by jpeg_decode_PS0() and jpeg_decode_PS().
 *
 * version          - tag inserted into the log messages.
 * transform_luma   - forwarded to decode_macroblock_std(); may be NULL.
 * transform_chroma - forwarded to decode_macroblock_std(); may be NULL.
 * emit_line        - tile line emitter handed to EmitTilesMode0/EmitTilesMode2.
 *
 * Six consecutive 32-bit words are read from RDRAM at task->data_ptr: buffer
 * address, macroblock count, mode and the Y, U and V quantization table
 * pointers. The three tables are loaded, then every macroblock of
 * (mode + 4) sub-blocks is read, decoded and emitted at the address it was
 * read from, after which the address advances by 2*macroblock_size bytes.
 *
 * NOTE(review): several lines of this function are absent from this listing:
 * the declarations of address, mb and mode, whatever follows the two warning
 * messages (presumably an early return), the remaining DebugMessage
 * arguments, and the condition choosing between EmitTilesMode0() and
 * EmitTilesMode2() (presumably mode 0 versus mode 2). Confirm against the
 * complete file before changing this code. */
214 static void jpeg_decode_std(const char * const version,
215 const subblock_transform_t transform_luma,
216 const subblock_transform_t transform_chroma,
217 const tile_line_emitter_t emit_line)
219 int16_t qtables[3][SUBBLOCK_SIZE];
222 uint32_t macroblock_count;
224 uint32_t qtableY_ptr;
225 uint32_t qtableU_ptr;
226 uint32_t qtableV_ptr;
227 unsigned int subblock_count;
228 unsigned int macroblock_size;
229 int16_t macroblock[6*SUBBLOCK_SIZE]; /* macroblock contains at most 6 subblocks */
230 const OSTask_t * const task = get_task();
/* bit 0 of the task flags triggers the "yielding not implemented" warning */
232 if (task->flags & 0x1)
234 DebugMessage(M64MSG_WARNING, "jpeg_decode_%s: task yielding not implemented", version);
238 address = rdram_read_u32(task->data_ptr);
239 macroblock_count = rdram_read_u32(task->data_ptr + 4);
240 mode = rdram_read_u32(task->data_ptr + 8);
241 qtableY_ptr = rdram_read_u32(task->data_ptr + 12);
242 qtableU_ptr = rdram_read_u32(task->data_ptr + 16);
243 qtableV_ptr = rdram_read_u32(task->data_ptr + 20);
245 DebugMessage(M64MSG_VERBOSE, "jpeg_decode_%s: *buffer=%x, #MB=%d, mode=%d, *Qy=%x, *Qu=%x, *Qv=%x",
/* only modes 0 and 2 are accepted */
254 if (mode != 0 && mode != 2)
256 DebugMessage(M64MSG_WARNING, "jpeg_decode_%s: invalid mode %d", version, mode);
/* mode 0 -> 4 sub-blocks per macroblock, mode 2 -> 6 */
260 subblock_count = mode + 4;
261 macroblock_size = subblock_count*SUBBLOCK_SIZE;
263 rdram_read_many_u16((uint16_t*)qtables[0], qtableY_ptr, SUBBLOCK_SIZE);
264 rdram_read_many_u16((uint16_t*)qtables[1], qtableU_ptr, SUBBLOCK_SIZE);
265 rdram_read_many_u16((uint16_t*)qtables[2], qtableV_ptr, SUBBLOCK_SIZE);
267 for (mb = 0; mb < macroblock_count; ++mb)
269 rdram_read_many_u16((uint16_t*)macroblock, address, macroblock_size);
270 decode_macroblock_std(transform_luma, transform_chroma,
271 macroblock, subblock_count, (const int16_t (*)[SUBBLOCK_SIZE])qtables);
275 EmitTilesMode0(emit_line, macroblock, address);
279 EmitTilesMode2(emit_line, macroblock, address);
/* macroblock_size counts 16-bit samples, hence the factor 2 for bytes */
282 address += 2*macroblock_size;
/* Saturate a signed 16-bit sample to the unsigned 8-bit range.
 *
 * Returns 0 for any negative x, 255 for any x above 255, and x itself
 * otherwise.
 *
 * Written with plain comparisons: the previous bit-trick form negated x and
 * right-shifted the result, which yielded 1 instead of 0 for x == -32768 and
 * depended on implementation-defined shifting of negative values. */
static uint8_t clamp_u8(int16_t x)
{
    if (x < 0)
    {
        return 0;
    }
    if (x > 0xff)
    {
        return 0xff;
    }
    return (uint8_t)x;
}
/* Limits x to the range [-0x800, 0x7f0]; note the upper bound is 0x7f0, not
 * 0x7ff.
 * NOTE(review): the return statement is absent from this listing; presumably
 * the limited x is returned unchanged. Confirm against the complete file. */
291 static int16_t clamp_s12(int16_t x)
293 if (x < -0x800) { x = -0x800; } else if (x > 0x7f0) { x = 0x7f0; }
/* Limits a 32-bit value to the signed 16-bit range [-32768, 32767].
 * NOTE(review): the return statement is absent from this listing; presumably
 * the limited x is returned. Confirm against the complete file. */
297 static int16_t clamp_s16(int32_t x)
299 if (x > 32767) { x = 32767; } else if (x < -32768) { x = -32768; }
/* Limits a colour component to the range [0, 0xff0] before GetRGBA() packs it.
 * NOTE(review): the return statement is absent from this listing. GetRGBA()
 * ORs the shifted components together without masking them, so the hidden
 * return may also mask off low bits; do not assume it is a plain return of x.
 * Confirm against the complete file. */
303 static uint16_t clamp_RGBA_component(int16_t x)
305 if (x > 0xff0) { x = 0xff0; } else if (x < 0) { x = 0; }
/* Packs two luma samples and one chroma pair into a 32-bit word.
 *
 * Every input is saturated with clamp_u8(); the bytes are laid out from most
 * to least significant as u, y1, v, y2. */
static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v)
{
    uint32_t word = clamp_u8(u);

    word = (word << 8) | clamp_u8(y1);
    word = (word << 8) | clamp_u8(v);
    word = (word << 8) | clamp_u8(y2);

    return word;
}
/* Converts one (y, u, v) sample triple into a packed 16-bit pixel.
 *
 * The luma sample is offset by 2048, the three colour components are derived
 * with the fixed coefficients below (evaluated in double precision because
 * the literals are doubles), truncated to int16_t and passed through
 * clamp_RGBA_component(). The result is r << 4, g >> 1 and b >> 6 ORed
 * together with the lowest bit set. */
static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v)
{
    const float lum      = (float)y + 2048.0f;
    const float chroma_u = (float)u;
    const float chroma_v = (float)v;

    const int16_t raw_r = (int16_t)(lum + 1.4025*chroma_v);
    const int16_t raw_g = (int16_t)(lum - 0.3443*chroma_u - 0.7144*chroma_v);
    const int16_t raw_b = (int16_t)(lum + 1.7729*chroma_u);

    const uint16_t r = clamp_RGBA_component(raw_r);
    const uint16_t g = clamp_RGBA_component(raw_g);
    const uint16_t b = clamp_RGBA_component(raw_b);

    uint16_t pixel = 1;

    pixel |= r << 4;
    pixel |= g >> 1;
    pixel |= b >> 6;

    return pixel;
}
330 static void EmitYUVTileLine(const int16_t *y, const int16_t *u, uint32_t address)
334 const int16_t * const v = u + SUBBLOCK_SIZE;
335 const int16_t * const y2 = y + SUBBLOCK_SIZE;
337 uyvy[0] = GetUYVY(y[0], y[1], u[0], v[0]);
338 uyvy[1] = GetUYVY(y[2], y[3], u[1], v[1]);
339 uyvy[2] = GetUYVY(y[4], y[5], u[2], v[2]);
340 uyvy[3] = GetUYVY(y[6], y[7], u[3], v[3]);
341 uyvy[4] = GetUYVY(y2[0], y2[1], u[4], v[4]);
342 uyvy[5] = GetUYVY(y2[2], y2[3], u[5], v[5]);
343 uyvy[6] = GetUYVY(y2[4], y2[5], u[6], v[6]);
344 uyvy[7] = GetUYVY(y2[6], y2[7], u[7], v[7]);
346 rdram_write_many_u32(uyvy, address, 8);
349 static void EmitRGBATileLine(const int16_t *y, const int16_t *u, uint32_t address)
353 const int16_t * const v = u + SUBBLOCK_SIZE;
354 const int16_t * const y2 = y + SUBBLOCK_SIZE;
356 rgba[0] = GetRGBA(y[0], u[0], v[0]);
357 rgba[1] = GetRGBA(y[1], u[0], v[0]);
358 rgba[2] = GetRGBA(y[2], u[1], v[1]);
359 rgba[3] = GetRGBA(y[3], u[1], v[1]);
360 rgba[4] = GetRGBA(y[4], u[2], v[2]);
361 rgba[5] = GetRGBA(y[5], u[2], v[2]);
362 rgba[6] = GetRGBA(y[6], u[3], v[3]);
363 rgba[7] = GetRGBA(y[7], u[3], v[3]);
364 rgba[8] = GetRGBA(y2[0], u[4], v[4]);
365 rgba[9] = GetRGBA(y2[1], u[4], v[4]);
366 rgba[10] = GetRGBA(y2[2], u[5], v[5]);
367 rgba[11] = GetRGBA(y2[3], u[5], v[5]);
368 rgba[12] = GetRGBA(y2[4], u[6], v[6]);
369 rgba[13] = GetRGBA(y2[5], u[6], v[6]);
370 rgba[14] = GetRGBA(y2[6], u[7], v[7]);
371 rgba[15] = GetRGBA(y2[7], u[7], v[7]);
373 rdram_write_many_u16(rgba, address, 16);
/* Emits a mode-0 macroblock: 8 iterations, one emit_line() call each.
 * Luma is read starting at offset 0 and chroma starting at offset
 * 2*SUBBLOCK_SIZE of the macroblock.
 *
 * NOTE(review): the statements that advance y_offset, u_offset and address
 * inside the loop, and the declaration of i, are absent from this listing.
 * The `¯oblock` text in the emit_line() call looks like a mis-decoded
 * `&macroblock` (an "&macr" entity); confirm both against the complete file. */
376 static void EmitTilesMode0(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
380 unsigned int y_offset = 0;
381 unsigned int u_offset = 2*SUBBLOCK_SIZE;
383 for (i = 0; i < 8; ++i)
385 emit_line(¯oblock[y_offset], ¯oblock[u_offset], address);
/* Emits a mode-2 macroblock: 8 iterations, two emit_line() calls each. The
 * second call uses the luma row 8 samples further on, the same chroma row,
 * and a destination 32 bytes after the first. Luma is read starting at
 * offset 0 and chroma starting at offset 4*SUBBLOCK_SIZE of the macroblock.
 *
 * NOTE(review): the statements that advance u_offset and address inside the
 * loop, and the declaration of i, are absent from this listing. The
 * `¯oblock` text in the emit_line() calls looks like a mis-decoded
 * `&macroblock` (an "&macr" entity); confirm both against the complete file. */
393 static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
397 unsigned int y_offset = 0;
398 unsigned int u_offset = 4*SUBBLOCK_SIZE;
400 for (i = 0; i < 8; ++i)
402 emit_line(¯oblock[y_offset], ¯oblock[u_offset], address);
403 emit_line(¯oblock[y_offset + 8], ¯oblock[u_offset], address + 32);
/* after the 4th iteration the luma offset skips an extra SUBBLOCK_SIZE samples */
405 y_offset += (i == 3) ? SUBBLOCK_SIZE+16 : 16;
/* Decodes the 6 sub-blocks of one macroblock in place.
 *
 * macroblock       - 6*SUBBLOCK_SIZE coefficients, overwritten with samples.
 * y_dc, u_dc, v_dc - running DC values owned by the caller; the first
 *                    coefficient of each sub-block is added to the matching
 *                    accumulator and replaced by its low 16 bits.
 * qtable           - dequantization factors, or NULL to skip dequantization.
 *
 * Per sub-block: update the DC term, reorder through ZigZagSubBlock() into a
 * temporary, optionally multiply by qtable (shift 0), transpose back into the
 * macroblock and run the inverse DCT in place.
 *
 * NOTE(review): the declaration of sb and the switch header the case labels
 * belong to are absent from this listing; the labels 0-5 presumably select on
 * sb (0-3 luma, 4 u, 5 v). Confirm against the complete file. */
411 static void decode_macroblock_ob(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable)
415 for (sb = 0; sb < 6; ++sb)
417 int16_t tmp_sb[SUBBLOCK_SIZE];
420 int32_t dc = (int32_t)macroblock[0];
423 case 0: case 1: case 2: case 3:
424 *y_dc += dc; macroblock[0] = *y_dc & 0xffff; break;
425 case 4: *u_dc += dc; macroblock[0] = *u_dc & 0xffff; break;
426 case 5: *v_dc += dc; macroblock[0] = *v_dc & 0xffff; break;
429 ZigZagSubBlock(tmp_sb, macroblock);
430 if (qtable != NULL) { MultSubBlocks(tmp_sb, tmp_sb, qtable, 0); }
431 TransposeSubBlock(macroblock, tmp_sb);
432 InverseDCTSubBlock(macroblock, macroblock);
/* step to the next sub-block of the macroblock */
434 macroblock += SUBBLOCK_SIZE;
/* Decodes subblock_count sub-blocks of one macroblock in place.
 *
 * transform_luma / transform_chroma - optional post-IDCT transforms (skipped
 *                                     when NULL), applied with dst == src.
 * macroblock     - subblock_count*SUBBLOCK_SIZE coefficients, overwritten.
 * subblock_count - number of sub-blocks; the last two are treated as chroma.
 * qtables        - three tables indexed by q; q is incremented on every
 *                  chroma sub-block before it is used.
 *
 * Per sub-block: multiply by qtables[q] with a left shift of 4, reorder
 * through ZigZagSubBlock() into a temporary, run the inverse DCT back into the
 * macroblock, then apply the optional transform.
 *
 * NOTE(review): the declarations of sb and q (including q's initial value,
 * presumably 0) and the branch structure between the chroma and luma
 * transform calls (presumably if/else) are absent from this listing.
 * Confirm against the complete file. */
438 static void decode_macroblock_std(
439 const subblock_transform_t transform_luma,
440 const subblock_transform_t transform_chroma,
441 int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE])
446 for (sb = 0; sb < subblock_count; ++sb)
448 int16_t tmp_sb[SUBBLOCK_SIZE];
/* true for the last two sub-blocks of the macroblock */
449 const int isChromaSubBlock = (subblock_count - sb <= 2);
451 if (isChromaSubBlock) { ++q; }
453 MultSubBlocks(macroblock, macroblock, qtables[q], 4);
454 ZigZagSubBlock(tmp_sb, macroblock);
455 InverseDCTSubBlock(macroblock, tmp_sb);
457 if (isChromaSubBlock)
459 if (transform_chroma != NULL)
460 transform_chroma(macroblock, macroblock);
464 if (transform_luma != NULL)
465 transform_luma(macroblock, macroblock);
468 macroblock += SUBBLOCK_SIZE;
472 static void TransposeSubBlock(int16_t *dst, const int16_t *src)
474 ReorderSubBlock(dst, src, TRANSPOSE_TABLE);
477 static void ZigZagSubBlock(int16_t *dst, const int16_t *src)
479 ReorderSubBlock(dst, src, ZIGZAG_TABLE);
482 static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table)
486 /* source and destination sublocks cannot overlap */
487 assert(abs(dst - src) > SUBBLOCK_SIZE);
489 for (i = 0; i < SUBBLOCK_SIZE; ++i)
491 dst[i] = src[table[i]];
495 static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift)
499 for (i = 0; i < SUBBLOCK_SIZE; ++i)
501 int32_t v = src1[i] * src2[i];
502 dst[i] = clamp_s16(v) << shift;
506 static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale)
510 for (i = 0; i < SUBBLOCK_SIZE; ++i)
512 int32_t v = src[i] * scale;
513 dst[i] = clamp_s16(v);
517 static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift)
521 for (i = 0; i < SUBBLOCK_SIZE; ++i)
523 dst[i] = src[i] >> shift;
527 /***************************************************************************
528 * Fast 2D IDCT using separable formulation and normalization
529 * Computations use single precision floats
530 * Implementation based on Wikipedia :
531 * http://fr.wikipedia.org/wiki/Transform%C3%A9e_en_cosinus_discr%C3%A8te
532 **************************************************************************/
/* One 8-point pass of the separable IDCT.
 *
 * x      - 8 input coefficients, x[0]..x[7].
 * dst    - receives 8 outputs, written at dst[0], dst[stride], ...,
 *          dst[7*stride].
 * stride - distance in floats between consecutive outputs.
 *
 * The odd inputs feed e[0..3] and the even inputs feed f[0..3]; outputs k and
 * 7-k are the sum and the difference of the same f/e combination.
 *
 * NOTE(review): the declarations of e and f and the assignments of f[0] and
 * f[1] are absent from this listing (presumably built from x[0] and x[4]).
 * Confirm against the complete file. */
533 static void InverseDCT1D(const float * const x, float *dst, unsigned int stride)
537 float x26, x1357, x15, x37, x17, x35;
/* products shared between several of the terms below */
539 x15 = IDCT_K[2] * (x[1] + x[5]);
540 x37 = IDCT_K[3] * (x[3] + x[7]);
541 x17 = IDCT_K[8] * (x[1] + x[7]);
542 x35 = IDCT_K[9] * (x[3] + x[5]);
543 x1357 = IDCT_C3 * (x[1] + x[3] + x[5] + x[7]);
544 x26 = IDCT_C6 * (x[2] + x[6]);
548 f[2] = x26 + IDCT_K[0]*x[2];
549 f[3] = x26 + IDCT_K[1]*x[6];
551 e[0] = x1357 + x15 + IDCT_K[4]*x[1] + x17;
552 e[1] = x1357 + x37 + IDCT_K[6]*x[3] + x35;
553 e[2] = x1357 + x15 + IDCT_K[5]*x[5] + x35;
554 e[3] = x1357 + x37 + IDCT_K[7]*x[7] + x17;
/* outputs 0..3 add the odd part, outputs 4..7 subtract it in mirrored order */
556 *dst = f[0] + f[2] + e[0]; dst += stride;
557 *dst = f[1] + f[3] + e[1]; dst += stride;
558 *dst = f[1] - f[3] + e[2]; dst += stride;
559 *dst = f[0] - f[2] + e[3]; dst += stride;
560 *dst = f[0] - f[2] - e[3]; dst += stride;
561 *dst = f[1] - f[3] - e[2]; dst += stride;
562 *dst = f[1] + f[3] - e[1]; dst += stride;
563 *dst = f[0] + f[2] - e[0]; dst += stride;
566 static void InverseDCTSubBlock(int16_t *dst, const int16_t *src)
569 float block[SUBBLOCK_SIZE];
572 /* idct 1d on rows (+transposition) */
573 for (i = 0; i < 8; ++i)
575 for (j = 0; j < 8; ++j)
577 x[j] = (float)src[i*8+j];
580 InverseDCT1D(x, &block[i], 8);
583 /* idct 1d on columns (thanks to previous transposition) */
584 for (i = 0; i < 8; ++i)
586 InverseDCT1D(&block[i*8], x, 1);
588 /* C4 = 1 normalization implies a division by 8 */
589 for (j = 0; j < 8; ++j)
591 dst[i+j*8] = (int16_t)x[j] >> 3;
596 static void RescaleYSubBlock(int16_t *dst, const int16_t *src)
600 for (i = 0; i < SUBBLOCK_SIZE; ++i)
602 dst[i] = (((uint32_t)(clamp_s12(src[i]) + 0x800) * 0xdb0) >> 16) + 0x10;
606 static void RescaleUVSubBlock(int16_t *dst, const int16_t *src)
610 for (i = 0; i < SUBBLOCK_SIZE; ++i)
612 dst[i] = (((int)clamp_s12(src[i]) * 0xe00) >> 16) + 0x80;
618 /* FIXME: assume presence of expansion pack */
/* Mask applied to every byte index before rsp.RDRAM is accessed by the
 * accessors below; 0x7fffff keeps the index within 8 MiB. */
619 #define MEMMASK 0x7fffff
/* Reads 16-bit values from RDRAM into dst.
 *
 * Bytes are fetched one at a time from rsp.RDRAM at consecutive addresses;
 * each address is XORed with S8 and masked with MEMMASK before indexing.
 *
 * NOTE(review): the loop over count, the statement between the two byte reads
 * and the store into dst are absent from this listing; presumably the first
 * byte becomes the high half of each value. Confirm against the complete
 * file. */
621 static void rdram_read_many_u16(uint16_t *dst, uint32_t address, unsigned int count)
625 uint16_t s = rsp.RDRAM[((address++)^S8) & MEMMASK];
627 s |= rsp.RDRAM[((address++)^S8) & MEMMASK];
/* Writes 16-bit values from src to RDRAM, high byte first.
 *
 * Each byte goes to rsp.RDRAM at consecutive addresses; every address is
 * XORed with S8 and masked with MEMMASK before indexing. src advances by one
 * element after its low byte has been written.
 *
 * NOTE(review): the loop over count is absent from this listing. Confirm
 * against the complete file. */
635 static void rdram_write_many_u16(const uint16_t *src, uint32_t address, unsigned int count)
639 rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 8);
640 rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*(src++) & 0xff);
/* Assembles a 32-bit value from four consecutive RDRAM bytes, the first byte
 * ending up most significant.
 *
 * Every address is XORed with S8 and masked with MEMMASK before indexing
 * rsp.RDRAM.
 *
 * NOTE(review): the return statement is absent from this listing; presumably
 * r is returned. Confirm against the complete file. */
646 static uint32_t rdram_read_u32(uint32_t address)
648 uint32_t r = rsp.RDRAM[((address++) ^ S8) & MEMMASK]; r <<= 8;
649 r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK]; r <<= 8;
650 r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK]; r <<= 8;
651 r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK];
/* Writes 32-bit values from src to RDRAM, most significant byte first.
 *
 * Each byte goes to rsp.RDRAM at consecutive addresses; every address is
 * XORed with S8 and masked with MEMMASK before indexing. src advances by one
 * element after its lowest byte has been written.
 *
 * NOTE(review): the loop over count is absent from this listing. Confirm
 * against the complete file. */
656 static void rdram_write_many_u32(const uint32_t *src, uint32_t address, unsigned int count)
660 rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 24);
661 rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 16);
662 rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 8);
663 rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*(src++) & 0xff);