source/mupen64plus-rsp-hle/src/jpeg.c

   1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
   2  *   Mupen64plus-rsp-hle - jpeg.c                                          *
   3  *   Mupen64Plus homepage: http://code.google.com/p/mupen64plus/           *
   4  *   Copyright (C) 2012 Bobby Smiles                                       *
   5  *   Copyright (C) 2009 Richard Goedeken                                   *
   6  *   Copyright (C) 2002 Hacktarux                                          *
   7  *                                                                         *
   8  *   This program is free software; you can redistribute it and/or modify  *
   9  *   it under the terms of the GNU General Public License as published by  *
  10  *   the Free Software Foundation; either version 2 of the License, or     *
  11  *   (at your option) any later version.                                   *
  12  *                                                                         *
  13  *   This program is distributed in the hope that it will be useful,       *
  14  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
  15  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
  16  *   GNU General Public License for more details.                          *
  17  *                                                                         *
  18  *   You should have received a copy of the GNU General Public License     *
  19  *   along with this program; if not, write to the                         *
  20  *   Free Software Foundation, Inc.,                                       *
  21  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
  22  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  23
  24 #include <assert.h>
  25 #include <stdlib.h>
  26 #include <stdint.h>
  27
  28 #define M64P_PLUGIN_PROTOTYPES 1
  29 #include "m64p_types.h"
  30 #include "m64p_plugin.h"
  31 #include "hle.h"
  32
  33 #define SUBBLOCK_SIZE 64
  34
  35 typedef void (*tile_line_emitter_t)(const int16_t *y, const int16_t *u, uint32_t address);
  36 typedef void (*subblock_transform_t)(int16_t* dst, const int16_t* src);
  37
  38 /* rdram operations */
  39 // FIXME: these functions deserve their own module
  40 static void rdram_read_many_u16(uint16_t *dst, uint32_t address, unsigned int count);
  41 static void rdram_write_many_u16(const uint16_t *src, uint32_t address, unsigned int count);
  42 static uint32_t rdram_read_u32(uint32_t address);
  43 static void rdram_write_many_u32(const uint32_t *src, uint32_t address, unsigned int count);
  44
  45 /* standard jpeg ucode decoder */
  46 static void jpeg_decode_std(const char * const version,
  47         const subblock_transform_t transform_luma,
  48         const subblock_transform_t transform_chroma,
  49         const tile_line_emitter_t emit_line);
  50
  51 /* helper functions */
  52 static uint8_t clamp_u8(int16_t x);
  53 static int16_t clamp_s12(int16_t x);
  54 static int16_t clamp_s16(int32_t x);
  55 static uint16_t clamp_RGBA_component(int16_t x);
  56
   57 /* pixel conversion & formatting */
  58 static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v);
  59 static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v);
  60
  61 /* tile line emitters */
  62 static void EmitYUVTileLine(const int16_t *y, const int16_t *u, uint32_t address);
  63 static void EmitRGBATileLine(const int16_t *y, const int16_t *u, uint32_t address);
  64
  65 /* macroblocks operations */
  66 static void decode_macroblock_ob(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable);
  67 static void decode_macroblock_std(
  68         const subblock_transform_t transform_luma,
  69         const subblock_transform_t transform_chroma,
  70         int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE]);
  71 static void EmitTilesMode0(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
  72 static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
  73
  74 /* subblocks operations */
  75 static void TransposeSubBlock(int16_t *dst, const int16_t *src);
  76 static void ZigZagSubBlock(int16_t *dst, const int16_t *src);
  77 static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table);
  78 static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift);
  79 static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale);
  80 static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift);
  81 static void InverseDCT1D(const float * const x, float *dst, unsigned int stride);
  82 static void InverseDCTSubBlock(int16_t *dst, const int16_t *src);
  83 static void RescaleYSubBlock(int16_t *dst, const int16_t *src);
  84 static void RescaleUVSubBlock(int16_t *dst, const int16_t *src);
  85
  86 /* transposed dequantization table */
  87 static const int16_t DEFAULT_QTABLE[SUBBLOCK_SIZE] =
  88 {
  89     16, 12, 14, 14,  18,  24,  49,  72,
  90     11, 12, 13, 17,  22,  35,  64,  92,
  91     10, 14, 16, 22,  37,  55,  78,  95,
  92     16, 19, 24, 29,  56,  64,  87,  98,
  93     24, 26, 40, 51,  68,  81, 103, 112,
  94     40, 58, 57, 87, 109, 104, 121, 100,
  95     51, 60, 69, 80, 103, 113, 120, 103,
  96     61, 55, 56, 62,  77,  92, 101,  99
  97 };
  98
  99 /* zig-zag indices */
 100 static const unsigned int ZIGZAG_TABLE[SUBBLOCK_SIZE] =
 101 {
 102      0,  1,  5,  6, 14, 15, 27, 28,
 103      2,  4,  7, 13, 16, 26, 29, 42,
 104      3,  8, 12, 17, 25, 30, 41, 43,
 105      9, 11, 18, 24, 31, 40, 44, 53,
 106     10, 19, 23, 32, 39, 45, 52, 54,
 107     20, 22, 33, 38, 46, 51, 55, 60,
 108     21, 34, 37, 47, 50, 56, 59, 61,
 109     35, 36, 48, 49, 57, 58, 62, 63
 110 };
 111
 112 /* transposition indices */
 113 static const unsigned int TRANSPOSE_TABLE[SUBBLOCK_SIZE] =
 114 {
 115     0,  8, 16, 24, 32, 40, 48, 56,
 116     1,  9, 17, 25, 33, 41, 49, 57,
 117     2, 10, 18, 26, 34, 42, 50, 58,
 118     3, 11, 19, 27, 35, 43, 51, 59,
 119     4, 12, 20, 28, 36, 44, 52, 60,
 120     5, 13, 21, 29, 37, 45, 53, 61,
 121     6, 14, 22, 30, 38, 46, 54, 62,
 122     7, 15, 23, 31, 39, 47, 55, 63
 123 };
 124
 125
 126
 127 /* IDCT related constants
 128  * Cn = alpha * cos(n * PI / 16) (alpha is chosen such as C4 = 1) */
 129 static const float IDCT_C3 = 1.175875602f;
 130 static const float IDCT_C6 = 0.541196100f;
 131 static const float IDCT_K[10] =
 132 {
 133   0.765366865f,   /*  C2-C6         */
 134  -1.847759065f,   /* -C2-C6         */
 135  -0.390180644f,   /*  C5-C3         */
 136  -1.961570561f,   /* -C5-C3         */
 137   1.501321110f,   /*  C1+C3-C5-C7   */
 138   2.053119869f,   /*  C1+C3-C5+C7   */
 139   3.072711027f,   /*  C1+C3+C5-C7   */
 140   0.298631336f,   /* -C1+C3+C5-C7   */
 141  -0.899976223f,   /*  C7-C3         */
 142  -2.562915448f    /* -C1-C3         */
 143 };
 144
 145
 146 /* global functions */
 147
 148 /***************************************************************************
 149  * JPEG decoding ucode found in Japanese exclusive version of Pokemon Stadium.
 150  **************************************************************************/
 151 void jpeg_decode_PS0()
 152 {
 153     jpeg_decode_std("PS0", RescaleYSubBlock, RescaleUVSubBlock, EmitYUVTileLine);
 154 }
 155
 156 /***************************************************************************
 157  * JPEG decoding ucode found in Ocarina of Time, Pokemon Stadium 1 and
 158  * Pokemon Stadium 2.
 159  **************************************************************************/
 160 void jpeg_decode_PS()
 161 {
 162     jpeg_decode_std("PS", NULL, NULL, EmitRGBATileLine);
 163 }
 164
 165 /***************************************************************************
 166  * JPEG decoding ucode found in Ogre Battle and Bottom of the 9th.
 167  **************************************************************************/
 168 void jpeg_decode_OB()
 169 {
 170     int16_t qtable[SUBBLOCK_SIZE];
 171     unsigned int mb;
 172
 173     int32_t y_dc = 0;
 174     int32_t u_dc = 0;
 175     int32_t v_dc = 0;
 176
 177     const OSTask_t * const task = get_task();
 178
 179     uint32_t           address          = task->data_ptr;
 180     const unsigned int macroblock_count = task->data_size;
 181     const int          qscale           = task->yield_data_size;
 182
 183     DebugMessage(M64MSG_VERBOSE, "jpeg_decode_OB: *buffer=%x, #MB=%d, qscale=%d",
 184             address,
 185             macroblock_count,
 186             qscale);
 187
 188     if (qscale != 0)
 189     {
 190         if (qscale > 0)
 191         {
 192             ScaleSubBlock(qtable, DEFAULT_QTABLE, qscale);
 193         }
 194         else
 195         {
 196             RShiftSubBlock(qtable, DEFAULT_QTABLE, -qscale);
 197         }
 198     }
 199
 200     for (mb = 0; mb < macroblock_count; ++mb)
 201     {
 202         int16_t macroblock[6*SUBBLOCK_SIZE];
 203
 204         rdram_read_many_u16((uint16_t*)macroblock, address, 6*SUBBLOCK_SIZE);
 205         decode_macroblock_ob(macroblock, &y_dc, &u_dc, &v_dc, (qscale != 0) ? qtable : NULL);
 206         EmitTilesMode2(EmitYUVTileLine, macroblock, address);
 207
 208         address += (2*6*SUBBLOCK_SIZE);
 209     }
 210 }
 211
 212
 213 /* local functions */
 214 static void jpeg_decode_std(const char * const version,
 215         const subblock_transform_t transform_luma,
 216         const subblock_transform_t transform_chroma,
 217         const tile_line_emitter_t emit_line)
 218 {
 219     int16_t qtables[3][SUBBLOCK_SIZE];
 220     unsigned int mb;
 221     uint32_t address;
 222     uint32_t macroblock_count;
 223     uint32_t mode;
 224     uint32_t qtableY_ptr;
 225     uint32_t qtableU_ptr;
 226     uint32_t qtableV_ptr;
 227     unsigned int subblock_count;
 228     unsigned int macroblock_size;
 229     int16_t macroblock[6*SUBBLOCK_SIZE]; /* macroblock contains at most 6 subblobcks */
 230     const OSTask_t * const task = get_task();
 231
 232     if (task->flags & 0x1)
 233     {
 234         DebugMessage(M64MSG_WARNING, "jpeg_decode_%s: task yielding not implemented", version);
 235         return;
 236     }
 237
 238     address          = rdram_read_u32(task->data_ptr);
 239     macroblock_count = rdram_read_u32(task->data_ptr + 4);
 240     mode             = rdram_read_u32(task->data_ptr + 8);
 241     qtableY_ptr      = rdram_read_u32(task->data_ptr + 12);
 242     qtableU_ptr      = rdram_read_u32(task->data_ptr + 16);
 243     qtableV_ptr      = rdram_read_u32(task->data_ptr + 20);
 244
 245     DebugMessage(M64MSG_VERBOSE, "jpeg_decode_%s: *buffer=%x, #MB=%d, mode=%d, *Qy=%x, *Qu=%x, *Qv=%x",
 246             version,
 247             address,
 248             macroblock_count,
 249             mode,
 250             qtableY_ptr,
 251             qtableU_ptr,
 252             qtableV_ptr);
 253
 254     if (mode != 0 && mode != 2)
 255     {
 256         DebugMessage(M64MSG_WARNING, "jpeg_decode_%s: invalid mode %d", version, mode);
 257         return;
 258     }
 259
 260     subblock_count = mode + 4;
 261     macroblock_size = subblock_count*SUBBLOCK_SIZE;
 262
 263     rdram_read_many_u16((uint16_t*)qtables[0], qtableY_ptr, SUBBLOCK_SIZE);
 264     rdram_read_many_u16((uint16_t*)qtables[1], qtableU_ptr, SUBBLOCK_SIZE);
 265     rdram_read_many_u16((uint16_t*)qtables[2], qtableV_ptr, SUBBLOCK_SIZE);
 266
 267     for (mb = 0; mb < macroblock_count; ++mb)
 268     {
 269         rdram_read_many_u16((uint16_t*)macroblock, address, macroblock_size);
 270         decode_macroblock_std(transform_luma, transform_chroma,
 271                 macroblock, subblock_count, (const int16_t (*)[SUBBLOCK_SIZE])qtables);
 272
 273         if (mode == 0)
 274         {
 275             EmitTilesMode0(emit_line, macroblock, address);
 276         }
 277         else
 278         {
 279             EmitTilesMode2(emit_line, macroblock, address);
 280         }
 281
 282         address += 2*macroblock_size;
 283     }
 284 }
 285
 286 static uint8_t clamp_u8(int16_t x)
 287 {
 288     return (x & (0xff00)) ? ((-x) >> 15) & 0xff : x;
 289 }
 290
 291 static int16_t clamp_s12(int16_t x)
 292 {
 293     if (x < -0x800) { x = -0x800; } else if (x > 0x7f0) { x = 0x7f0; }
 294     return x;
 295 }
 296
 297 static int16_t clamp_s16(int32_t x)
 298 {
 299     if (x > 32767) { x = 32767; } else if (x < -32768) { x = -32768; }
 300     return x;
 301 }
 302
 303 static uint16_t clamp_RGBA_component(int16_t x)
 304 {
 305     if (x > 0xff0) { x = 0xff0; } else if (x < 0) { x = 0; }
 306     return (x & 0xf80);
 307 }
 308
 309 static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v)
 310 {
 311     return (uint32_t)clamp_u8(u)  << 24
 312         |  (uint32_t)clamp_u8(y1) << 16
 313         |  (uint32_t)clamp_u8(v)  << 8
 314         |  (uint32_t)clamp_u8(y2);
 315 }
 316
 317 static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v)
 318 {
 319     const float fY = (float)y + 2048.0f;
 320     const float fU = (float)u;
 321     const float fV = (float)v;
 322
 323     const uint16_t r = clamp_RGBA_component((int16_t)(fY             + 1.4025*fV));
 324     const uint16_t g = clamp_RGBA_component((int16_t)(fY - 0.3443*fU - 0.7144*fV));
 325     const uint16_t b = clamp_RGBA_component((int16_t)(fY + 1.7729*fU            ));
 326
 327     return (r << 4) | (g >> 1) | (b >> 6) | 1;
 328 }
 329
 330 static void EmitYUVTileLine(const int16_t *y, const int16_t *u, uint32_t address)
 331 {
 332     uint32_t uyvy[8];
 333
 334     const int16_t * const v  = u + SUBBLOCK_SIZE;
 335     const int16_t * const y2 = y + SUBBLOCK_SIZE;
 336
 337     uyvy[0] = GetUYVY(y[0],  y[1],  u[0], v[0]);
 338     uyvy[1] = GetUYVY(y[2],  y[3],  u[1], v[1]);
 339     uyvy[2] = GetUYVY(y[4],  y[5],  u[2], v[2]);
 340     uyvy[3] = GetUYVY(y[6],  y[7],  u[3], v[3]);
 341     uyvy[4] = GetUYVY(y2[0], y2[1], u[4], v[4]);
 342     uyvy[5] = GetUYVY(y2[2], y2[3], u[5], v[5]);
 343     uyvy[6] = GetUYVY(y2[4], y2[5], u[6], v[6]);
 344     uyvy[7] = GetUYVY(y2[6], y2[7], u[7], v[7]);
 345
 346     rdram_write_many_u32(uyvy, address, 8);
 347 }
 348
 349 static void EmitRGBATileLine(const int16_t *y, const int16_t *u, uint32_t address)
 350 {
 351     uint16_t rgba[16];
 352
 353     const int16_t * const v  = u + SUBBLOCK_SIZE;
 354     const int16_t * const y2 = y + SUBBLOCK_SIZE;
 355
 356     rgba[0]  = GetRGBA(y[0],  u[0], v[0]);
 357     rgba[1]  = GetRGBA(y[1],  u[0], v[0]);
 358     rgba[2]  = GetRGBA(y[2],  u[1], v[1]);
 359     rgba[3]  = GetRGBA(y[3],  u[1], v[1]);
 360     rgba[4]  = GetRGBA(y[4],  u[2], v[2]);
 361     rgba[5]  = GetRGBA(y[5],  u[2], v[2]);
 362     rgba[6]  = GetRGBA(y[6],  u[3], v[3]);
 363     rgba[7]  = GetRGBA(y[7],  u[3], v[3]);
 364     rgba[8]  = GetRGBA(y2[0], u[4], v[4]);
 365     rgba[9]  = GetRGBA(y2[1], u[4], v[4]);
 366     rgba[10] = GetRGBA(y2[2], u[5], v[5]);
 367     rgba[11] = GetRGBA(y2[3], u[5], v[5]);
 368     rgba[12] = GetRGBA(y2[4], u[6], v[6]);
 369     rgba[13] = GetRGBA(y2[5], u[6], v[6]);
 370     rgba[14] = GetRGBA(y2[6], u[7], v[7]);
 371     rgba[15] = GetRGBA(y2[7], u[7], v[7]);
 372
 373     rdram_write_many_u16(rgba, address, 16);
 374 }
 375
 376 static void EmitTilesMode0(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
 377 {
 378     unsigned int i;
 379
 380     unsigned int y_offset = 0;
 381     unsigned int u_offset = 2*SUBBLOCK_SIZE;
 382
 383     for (i = 0; i < 8; ++i)
 384     {
 385         emit_line(&macroblock[y_offset], &macroblock[u_offset], address);
 386
 387         y_offset += 8;
 388         u_offset += 8;
 389         address += 32;
 390     }
 391 }
 392
 393 static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
 394 {
 395     unsigned int i;
 396
 397     unsigned int y_offset = 0;
 398     unsigned int u_offset = 4*SUBBLOCK_SIZE;
 399
 400     for (i = 0; i < 8; ++i)
 401     {
 402         emit_line(&macroblock[y_offset],     &macroblock[u_offset], address);
 403         emit_line(&macroblock[y_offset + 8], &macroblock[u_offset], address + 32);
 404
 405         y_offset += (i == 3) ? SUBBLOCK_SIZE+16 : 16;
 406         u_offset += 8;
 407         address += 64;
 408     }
 409 }
 410
 411 static void decode_macroblock_ob(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable)
 412 {
 413     int sb;
 414
 415     for (sb = 0; sb < 6; ++sb)
 416     {
 417         int16_t tmp_sb[SUBBLOCK_SIZE];
 418
 419         /* update DC */
 420         int32_t dc = (int32_t)macroblock[0];
 421         switch(sb)
 422         {
 423         case 0: case 1: case 2: case 3:
 424                 *y_dc += dc; macroblock[0] = *y_dc & 0xffff; break;
 425         case 4: *u_dc += dc; macroblock[0] = *u_dc & 0xffff; break;
 426         case 5: *v_dc += dc; macroblock[0] = *v_dc & 0xffff; break;
 427         }
 428
 429         ZigZagSubBlock(tmp_sb, macroblock);
 430         if (qtable != NULL) { MultSubBlocks(tmp_sb, tmp_sb, qtable, 0); }
 431         TransposeSubBlock(macroblock, tmp_sb);
 432         InverseDCTSubBlock(macroblock, macroblock);
 433
 434         macroblock += SUBBLOCK_SIZE;
 435     }
 436 }
 437
 438 static void decode_macroblock_std(
 439         const subblock_transform_t transform_luma,
 440         const subblock_transform_t transform_chroma,
 441         int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE])
 442 {
 443     unsigned int sb;
 444     unsigned int q = 0;
 445
 446     for (sb = 0; sb < subblock_count; ++sb)
 447     {
 448         int16_t tmp_sb[SUBBLOCK_SIZE];
 449         const int isChromaSubBlock = (subblock_count - sb <= 2);
 450
 451         if (isChromaSubBlock) { ++q; }
 452
 453         MultSubBlocks(macroblock, macroblock, qtables[q], 4);
 454         ZigZagSubBlock(tmp_sb, macroblock);
 455         InverseDCTSubBlock(macroblock, tmp_sb);
 456
 457         if (isChromaSubBlock)
 458         {
 459             if (transform_chroma != NULL)
 460                 transform_chroma(macroblock, macroblock);
 461         }
 462         else
 463         {
 464             if (transform_luma != NULL)
 465                 transform_luma(macroblock, macroblock);
 466         }
 467
 468         macroblock += SUBBLOCK_SIZE;
 469     }
 470 }
 471
 472 static void TransposeSubBlock(int16_t *dst, const int16_t *src)
 473 {
 474     ReorderSubBlock(dst, src, TRANSPOSE_TABLE);
 475 }
 476
 477 static void ZigZagSubBlock(int16_t *dst, const int16_t *src)
 478 {
 479     ReorderSubBlock(dst, src, ZIGZAG_TABLE);
 480 }
 481
 482 static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table)
 483 {
 484     unsigned int i;
 485
 486     /* source and destination sublocks cannot overlap */
 487     assert(abs(dst - src) > SUBBLOCK_SIZE);
 488
 489     for (i = 0; i < SUBBLOCK_SIZE; ++i)
 490     {
 491         dst[i] = src[table[i]];
 492     }
 493 }
 494
 495 static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift)
 496 {
 497     unsigned int i;
 498
 499     for (i = 0; i < SUBBLOCK_SIZE; ++i)
 500     {
 501         int32_t v = src1[i] * src2[i];
 502         dst[i] = clamp_s16(v) << shift;
 503     }
 504 }
 505
 506 static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale)
 507 {
 508     unsigned int i;
 509
 510     for (i = 0; i < SUBBLOCK_SIZE; ++i)
 511     {
 512         int32_t v = src[i] * scale;
 513         dst[i] = clamp_s16(v);
 514     }
 515 }
 516
 517 static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift)
 518 {
 519     unsigned int i;
 520
 521     for (i = 0; i < SUBBLOCK_SIZE; ++i)
 522     {
 523         dst[i] = src[i] >> shift;
 524     }
 525 }
 526
 527 /***************************************************************************
 528  * Fast 2D IDCT using separable formulation and normalization
 529  * Computations use single precision floats
 530  * Implementation based on Wikipedia :
 531  * http://fr.wikipedia.org/wiki/Transform%C3%A9e_en_cosinus_discr%C3%A8te
 532  **************************************************************************/
 533 static void InverseDCT1D(const float * const x, float *dst, unsigned int stride)
 534 {
 535     float e[4];
 536     float f[4];
 537     float x26, x1357, x15, x37, x17, x35;
 538
 539     x15   = IDCT_K[2] * (x[1] + x[5]);
 540     x37   = IDCT_K[3] * (x[3] + x[7]);
 541     x17   = IDCT_K[8] * (x[1] + x[7]);
 542     x35   = IDCT_K[9] * (x[3] + x[5]);
 543     x1357 = IDCT_C3   * (x[1] + x[3] + x[5] + x[7]);
 544     x26   = IDCT_C6   * (x[2] + x[6]);
 545
 546     f[0] = x[0] + x[4];
 547     f[1] = x[0] - x[4];
 548     f[2] = x26  + IDCT_K[0]*x[2];
 549     f[3] = x26  + IDCT_K[1]*x[6];
 550
 551     e[0] = x1357 + x15 + IDCT_K[4]*x[1] + x17;
 552     e[1] = x1357 + x37 + IDCT_K[6]*x[3] + x35;
 553     e[2] = x1357 + x15 + IDCT_K[5]*x[5] + x35;
 554     e[3] = x1357 + x37 + IDCT_K[7]*x[7] + x17;
 555
 556     *dst = f[0] + f[2] + e[0]; dst += stride;
 557     *dst = f[1] + f[3] + e[1]; dst += stride;
 558     *dst = f[1] - f[3] + e[2]; dst += stride;
 559     *dst = f[0] - f[2] + e[3]; dst += stride;
 560     *dst = f[0] - f[2] - e[3]; dst += stride;
 561     *dst = f[1] - f[3] - e[2]; dst += stride;
 562     *dst = f[1] + f[3] - e[1]; dst += stride;
 563     *dst = f[0] + f[2] - e[0]; dst += stride;
 564 }
 565
 566 static void InverseDCTSubBlock(int16_t *dst, const int16_t *src)
 567 {
 568     float x[8];
 569     float block[SUBBLOCK_SIZE];
 570     unsigned int i, j;
 571
 572     /* idct 1d on rows (+transposition) */
 573     for (i = 0; i < 8; ++i)
 574     {
 575         for (j = 0; j < 8; ++j)
 576         {
 577             x[j] = (float)src[i*8+j];
 578         }
 579
 580         InverseDCT1D(x, &block[i], 8);
 581     }
 582
 583     /* idct 1d on columns (thanks to previous transposition) */
 584     for (i = 0; i < 8; ++i)
 585     {
 586         InverseDCT1D(&block[i*8], x, 1);
 587
 588         /* C4 = 1 normalization implies a division by 8 */
 589         for (j = 0; j < 8; ++j)
 590         {
 591             dst[i+j*8] = (int16_t)x[j] >> 3;
 592         }
 593     }
 594 }
 595
 596 static void RescaleYSubBlock(int16_t *dst, const int16_t *src)
 597 {
 598     unsigned int i;
 599
 600     for (i = 0; i < SUBBLOCK_SIZE; ++i)
 601     {
 602         dst[i] = (((uint32_t)(clamp_s12(src[i]) + 0x800) * 0xdb0) >> 16) + 0x10;
 603     }
 604 }
 605
 606 static void RescaleUVSubBlock(int16_t *dst, const int16_t *src)
 607 {
 608     unsigned int i;
 609
 610     for (i = 0; i < SUBBLOCK_SIZE; ++i)
 611     {
 612         dst[i] = (((int)clamp_s12(src[i]) * 0xe00) >> 16) + 0x80;
 613     }
 614 }
 615
 616
 617
 618 /* FIXME: assume presence of expansion pack */
 619 #define MEMMASK 0x7fffff
 620
 621 static void rdram_read_many_u16(uint16_t *dst, uint32_t address, unsigned int count)
 622 {
 623     while (count != 0)
 624     {
 625         uint16_t s = rsp.RDRAM[((address++)^S8) & MEMMASK];
 626         s <<= 8;
 627         s |= rsp.RDRAM[((address++)^S8) & MEMMASK];
 628
 629         *(dst++) = s;
 630
 631         --count;
 632     }
 633 }
 634
 635 static void rdram_write_many_u16(const uint16_t *src, uint32_t address, unsigned int count)
 636 {
 637     while (count != 0)
 638     {
 639         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 8);
 640         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*(src++) & 0xff);
 641
 642         --count;
 643     }
 644 }
 645
 646 static uint32_t rdram_read_u32(uint32_t address)
 647 {
 648     uint32_t r = rsp.RDRAM[((address++) ^ S8) & MEMMASK]; r <<= 8;
 649     r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK]; r <<= 8;
 650     r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK]; r <<= 8;
 651     r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK];
 652
 653     return r;
 654 }
 655
 656 static void rdram_write_many_u32(const uint32_t *src, uint32_t address, unsigned int count)
 657 {
 658     while (count != 0)
 659     {
 660         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 24);
 661         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 16);
 662         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 8);
 663         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*(src++) & 0xff);
 664
 665         --count;
 666     }
 667 }
 668