source/mupen64plus-rsp-hle/src/jpeg.c

   1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
   2  *   Mupen64plus-rsp-hle - jpeg.c                                          *
   3  *   Mupen64Plus homepage: http://code.google.com/p/mupen64plus/           *
   4  *   Copyright (C) 2012 Bobby Smiles                                       *
   5  *   Copyright (C) 2009 Richard Goedeken                                   *
   6  *   Copyright (C) 2002 Hacktarux                                          *
   7  *                                                                         *
   8  *   This program is free software; you can redistribute it and/or modify  *
   9  *   it under the terms of the GNU General Public License as published by  *
  10  *   the Free Software Foundation; either version 2 of the License, or     *
  11  *   (at your option) any later version.                                   *
  12  *                                                                         *
  13  *   This program is distributed in the hope that it will be useful,       *
  14  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
  15  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
  16  *   GNU General Public License for more details.                          *
  17  *                                                                         *
  18  *   You should have received a copy of the GNU General Public License     *
  19  *   along with this program; if not, write to the                         *
  20  *   Free Software Foundation, Inc.,                                       *
  21  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
  22  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  23
  24 #include <stdint.h>
  25 #include <assert.h>
  26 #include <stdlib.h>
  27
  28 #define M64P_PLUGIN_PROTOTYPES 1
  29 #include "m64p_types.h"
  30 #include "m64p_plugin.h"
  31 #include "hle.h"
  32 #include "jpeg.h"
  33
  34 #define SUBBLOCK_SIZE 64
  35
  36 typedef void (*tile_line_emitter_t)(const int16_t *y, const int16_t *u, uint32_t address);
  37 typedef void (*subblock_transform_t)(int16_t *dst, const int16_t *src);
  38
  39 /* rdram operations
  40  * FIXME: these functions deserve their own module
  41  */
  42 static void rdram_read_many_u16(uint16_t *dst, uint32_t address, unsigned int count);
  43 static void rdram_write_many_u16(const uint16_t *src, uint32_t address, unsigned int count);
  44 static uint32_t rdram_read_u32(uint32_t address);
  45 static void rdram_write_many_u32(const uint32_t *src, uint32_t address, unsigned int count);
  46
  47 /* standard jpeg ucode decoder */
  48 static void jpeg_decode_std(const char *const version,
  49                             const subblock_transform_t transform_luma,
  50                             const subblock_transform_t transform_chroma,
  51                             const tile_line_emitter_t emit_line);
  52
  53 /* helper functions */
  54 static uint8_t clamp_u8(int16_t x);
  55 static int16_t clamp_s12(int16_t x);
  56 static uint16_t clamp_RGBA_component(int16_t x);
  57
   58 /* pixel conversion & formatting */
  59 static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v);
  60 static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v);
  61
  62 /* tile line emitters */
  63 static void EmitYUVTileLine(const int16_t *y, const int16_t *u, uint32_t address);
  64 static void EmitRGBATileLine(const int16_t *y, const int16_t *u, uint32_t address);
  65
  66 /* macroblocks operations */
  67 static void decode_macroblock_ob(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable);
  68 static void decode_macroblock_std(const subblock_transform_t transform_luma,
  69                                   const subblock_transform_t transform_chroma,
  70                                   int16_t *macroblock,
  71                                   unsigned int subblock_count,
  72                                   const int16_t qtables[3][SUBBLOCK_SIZE]);
  73 static void EmitTilesMode0(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
  74 static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
  75
  76 /* subblocks operations */
  77 static void TransposeSubBlock(int16_t *dst, const int16_t *src);
  78 static void ZigZagSubBlock(int16_t *dst, const int16_t *src);
  79 static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table);
  80 static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift);
  81 static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale);
  82 static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift);
  83 static void InverseDCT1D(const float *const x, float *dst, unsigned int stride);
  84 static void InverseDCTSubBlock(int16_t *dst, const int16_t *src);
  85 static void RescaleYSubBlock(int16_t *dst, const int16_t *src);
  86 static void RescaleUVSubBlock(int16_t *dst, const int16_t *src);
  87
  88 /* transposed dequantization table */
  89 static const int16_t DEFAULT_QTABLE[SUBBLOCK_SIZE] = {
  90     16, 12, 14, 14,  18,  24,  49,  72,
  91     11, 12, 13, 17,  22,  35,  64,  92,
  92     10, 14, 16, 22,  37,  55,  78,  95,
  93     16, 19, 24, 29,  56,  64,  87,  98,
  94     24, 26, 40, 51,  68,  81, 103, 112,
  95     40, 58, 57, 87, 109, 104, 121, 100,
  96     51, 60, 69, 80, 103, 113, 120, 103,
  97     61, 55, 56, 62,  77,  92, 101,  99
  98 };
  99
 100 /* zig-zag indices */
 101 static const unsigned int ZIGZAG_TABLE[SUBBLOCK_SIZE] = {
 102      0,  1,  5,  6, 14, 15, 27, 28,
 103      2,  4,  7, 13, 16, 26, 29, 42,
 104      3,  8, 12, 17, 25, 30, 41, 43,
 105      9, 11, 18, 24, 31, 40, 44, 53,
 106     10, 19, 23, 32, 39, 45, 52, 54,
 107     20, 22, 33, 38, 46, 51, 55, 60,
 108     21, 34, 37, 47, 50, 56, 59, 61,
 109     35, 36, 48, 49, 57, 58, 62, 63
 110 };
 111
 112 /* transposition indices */
 113 static const unsigned int TRANSPOSE_TABLE[SUBBLOCK_SIZE] = {
 114     0,  8, 16, 24, 32, 40, 48, 56,
 115     1,  9, 17, 25, 33, 41, 49, 57,
 116     2, 10, 18, 26, 34, 42, 50, 58,
 117     3, 11, 19, 27, 35, 43, 51, 59,
 118     4, 12, 20, 28, 36, 44, 52, 60,
 119     5, 13, 21, 29, 37, 45, 53, 61,
 120     6, 14, 22, 30, 38, 46, 54, 62,
 121     7, 15, 23, 31, 39, 47, 55, 63
 122 };
 123
 124
 125
 126 /* IDCT related constants
 127  * Cn = alpha * cos(n * PI / 16) (alpha is chosen such as C4 = 1) */
 128 static const float IDCT_C3 = 1.175875602f;
 129 static const float IDCT_C6 = 0.541196100f;
 130 static const float IDCT_K[10] = {
 131      0.765366865f,   /*  C2-C6         */
 132     -1.847759065f,   /* -C2-C6         */
 133     -0.390180644f,   /*  C5-C3         */
 134     -1.961570561f,   /* -C5-C3         */
 135      1.501321110f,   /*  C1+C3-C5-C7   */
 136      2.053119869f,   /*  C1+C3-C5+C7   */
 137      3.072711027f,   /*  C1+C3+C5-C7   */
 138      0.298631336f,   /* -C1+C3+C5-C7   */
 139     -0.899976223f,   /*  C7-C3         */
 140     -2.562915448f    /* -C1-C3         */
 141 };
 142
 143
 144 /* global functions */
 145
 146 /***************************************************************************
 147  * JPEG decoding ucode found in Japanese exclusive version of Pokemon Stadium.
 148  **************************************************************************/
 149 void jpeg_decode_PS0(void)
 150 {
 151     jpeg_decode_std("PS0", RescaleYSubBlock, RescaleUVSubBlock, EmitYUVTileLine);
 152 }
 153
 154 /***************************************************************************
 155  * JPEG decoding ucode found in Ocarina of Time, Pokemon Stadium 1 and
 156  * Pokemon Stadium 2.
 157  **************************************************************************/
 158 void jpeg_decode_PS(void)
 159 {
 160     jpeg_decode_std("PS", NULL, NULL, EmitRGBATileLine);
 161 }
 162
 163 /***************************************************************************
 164  * JPEG decoding ucode found in Ogre Battle and Bottom of the 9th.
 165  **************************************************************************/
 166 void jpeg_decode_OB(void)
 167 {
 168     int16_t qtable[SUBBLOCK_SIZE];
 169     unsigned int mb;
 170
 171     int32_t y_dc = 0;
 172     int32_t u_dc = 0;
 173     int32_t v_dc = 0;
 174
 175     const OSTask_t *const task = get_task();
 176
 177     uint32_t           address          = task->data_ptr;
 178     const unsigned int macroblock_count = task->data_size;
 179     const int          qscale           = task->yield_data_size;
 180
 181     DebugMessage(M64MSG_VERBOSE, "jpeg_decode_OB: *buffer=%x, #MB=%d, qscale=%d",
 182                  address,
 183                  macroblock_count,
 184                  qscale);
 185
 186     if (qscale != 0) {
 187         if (qscale > 0)
 188             ScaleSubBlock(qtable, DEFAULT_QTABLE, qscale);
 189         else
 190             RShiftSubBlock(qtable, DEFAULT_QTABLE, -qscale);
 191     }
 192
 193     for (mb = 0; mb < macroblock_count; ++mb) {
 194         int16_t macroblock[6 * SUBBLOCK_SIZE];
 195
 196         rdram_read_many_u16((uint16_t *)macroblock, address, 6 * SUBBLOCK_SIZE);
 197         decode_macroblock_ob(macroblock, &y_dc, &u_dc, &v_dc, (qscale != 0) ? qtable : NULL);
 198         EmitTilesMode2(EmitYUVTileLine, macroblock, address);
 199
 200         address += (2 * 6 * SUBBLOCK_SIZE);
 201     }
 202 }
 203
 204
 205 /* local functions */
 206 static void jpeg_decode_std(const char *const version,
 207                             const subblock_transform_t transform_luma,
 208                             const subblock_transform_t transform_chroma,
 209                             const tile_line_emitter_t emit_line)
 210 {
 211     int16_t qtables[3][SUBBLOCK_SIZE];
 212     unsigned int mb;
 213     uint32_t address;
 214     uint32_t macroblock_count;
 215     uint32_t mode;
 216     uint32_t qtableY_ptr;
 217     uint32_t qtableU_ptr;
 218     uint32_t qtableV_ptr;
 219     unsigned int subblock_count;
 220     unsigned int macroblock_size;
 221     /* macroblock contains at most 6 subblocks */
 222     int16_t macroblock[6 * SUBBLOCK_SIZE];
 223     const OSTask_t *const task = get_task();
 224
 225     if (task->flags & 0x1) {
 226         DebugMessage(M64MSG_WARNING, "jpeg_decode_%s: task yielding not implemented", version);
 227         return;
 228     }
 229
 230     address          = rdram_read_u32(task->data_ptr);
 231     macroblock_count = rdram_read_u32(task->data_ptr + 4);
 232     mode             = rdram_read_u32(task->data_ptr + 8);
 233     qtableY_ptr      = rdram_read_u32(task->data_ptr + 12);
 234     qtableU_ptr      = rdram_read_u32(task->data_ptr + 16);
 235     qtableV_ptr      = rdram_read_u32(task->data_ptr + 20);
 236
 237     DebugMessage(M64MSG_VERBOSE, "jpeg_decode_%s: *buffer=%x, #MB=%d, mode=%d, *Qy=%x, *Qu=%x, *Qv=%x",
 238                  version,
 239                  address,
 240                  macroblock_count,
 241                  mode,
 242                  qtableY_ptr,
 243                  qtableU_ptr,
 244                  qtableV_ptr);
 245
 246     if (mode != 0 && mode != 2) {
 247         DebugMessage(M64MSG_WARNING, "jpeg_decode_%s: invalid mode %d", version, mode);
 248         return;
 249     }
 250
 251     subblock_count = mode + 4;
 252     macroblock_size = subblock_count * SUBBLOCK_SIZE;
 253
 254     rdram_read_many_u16((uint16_t *)qtables[0], qtableY_ptr, SUBBLOCK_SIZE);
 255     rdram_read_many_u16((uint16_t *)qtables[1], qtableU_ptr, SUBBLOCK_SIZE);
 256     rdram_read_many_u16((uint16_t *)qtables[2], qtableV_ptr, SUBBLOCK_SIZE);
 257
 258     for (mb = 0; mb < macroblock_count; ++mb) {
 259         rdram_read_many_u16((uint16_t *)macroblock, address, macroblock_size);
 260         decode_macroblock_std(transform_luma, transform_chroma,
 261                               macroblock, subblock_count, (const int16_t (*)[SUBBLOCK_SIZE])qtables);
 262
 263         if (mode == 0)
 264             EmitTilesMode0(emit_line, macroblock, address);
 265         else
 266             EmitTilesMode2(emit_line, macroblock, address);
 267
 268         address += 2 * macroblock_size;
 269     }
 270 }
 271
 272 static uint8_t clamp_u8(int16_t x)
 273 {
 274     return (x & (0xff00)) ? ((-x) >> 15) & 0xff : x;
 275 }
 276
 277 static int16_t clamp_s12(int16_t x)
 278 {
 279     if (x < -0x800)
 280         x = -0x800;
 281     else if (x > 0x7f0)
 282         x = 0x7f0;
 283     return x;
 284 }
 285
 286 static uint16_t clamp_RGBA_component(int16_t x)
 287 {
 288     if (x > 0xff0)
 289         x = 0xff0;
 290     else if (x < 0)
 291         x = 0;
 292     return (x & 0xf80);
 293 }
 294
 295 static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v)
 296 {
 297     return (uint32_t)clamp_u8(u)  << 24 |
 298            (uint32_t)clamp_u8(y1) << 16 |
 299            (uint32_t)clamp_u8(v)  << 8 |
 300            (uint32_t)clamp_u8(y2);
 301 }
 302
 303 static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v)
 304 {
 305     const float fY = (float)y + 2048.0f;
 306     const float fU = (float)u;
 307     const float fV = (float)v;
 308
 309     const uint16_t r = clamp_RGBA_component((int16_t)(fY               + 1.4025 * fV));
 310     const uint16_t g = clamp_RGBA_component((int16_t)(fY - 0.3443 * fU - 0.7144 * fV));
 311     const uint16_t b = clamp_RGBA_component((int16_t)(fY + 1.7729 * fU));
 312
 313     return (r << 4) | (g >> 1) | (b >> 6) | 1;
 314 }
 315
 316 static void EmitYUVTileLine(const int16_t *y, const int16_t *u, uint32_t address)
 317 {
 318     uint32_t uyvy[8];
 319
 320     const int16_t *const v  = u + SUBBLOCK_SIZE;
 321     const int16_t *const y2 = y + SUBBLOCK_SIZE;
 322
 323     uyvy[0] = GetUYVY(y[0],  y[1],  u[0], v[0]);
 324     uyvy[1] = GetUYVY(y[2],  y[3],  u[1], v[1]);
 325     uyvy[2] = GetUYVY(y[4],  y[5],  u[2], v[2]);
 326     uyvy[3] = GetUYVY(y[6],  y[7],  u[3], v[3]);
 327     uyvy[4] = GetUYVY(y2[0], y2[1], u[4], v[4]);
 328     uyvy[5] = GetUYVY(y2[2], y2[3], u[5], v[5]);
 329     uyvy[6] = GetUYVY(y2[4], y2[5], u[6], v[6]);
 330     uyvy[7] = GetUYVY(y2[6], y2[7], u[7], v[7]);
 331
 332     rdram_write_many_u32(uyvy, address, 8);
 333 }
 334
 335 static void EmitRGBATileLine(const int16_t *y, const int16_t *u, uint32_t address)
 336 {
 337     uint16_t rgba[16];
 338
 339     const int16_t *const v  = u + SUBBLOCK_SIZE;
 340     const int16_t *const y2 = y + SUBBLOCK_SIZE;
 341
 342     rgba[0]  = GetRGBA(y[0],  u[0], v[0]);
 343     rgba[1]  = GetRGBA(y[1],  u[0], v[0]);
 344     rgba[2]  = GetRGBA(y[2],  u[1], v[1]);
 345     rgba[3]  = GetRGBA(y[3],  u[1], v[1]);
 346     rgba[4]  = GetRGBA(y[4],  u[2], v[2]);
 347     rgba[5]  = GetRGBA(y[5],  u[2], v[2]);
 348     rgba[6]  = GetRGBA(y[6],  u[3], v[3]);
 349     rgba[7]  = GetRGBA(y[7],  u[3], v[3]);
 350     rgba[8]  = GetRGBA(y2[0], u[4], v[4]);
 351     rgba[9]  = GetRGBA(y2[1], u[4], v[4]);
 352     rgba[10] = GetRGBA(y2[2], u[5], v[5]);
 353     rgba[11] = GetRGBA(y2[3], u[5], v[5]);
 354     rgba[12] = GetRGBA(y2[4], u[6], v[6]);
 355     rgba[13] = GetRGBA(y2[5], u[6], v[6]);
 356     rgba[14] = GetRGBA(y2[6], u[7], v[7]);
 357     rgba[15] = GetRGBA(y2[7], u[7], v[7]);
 358
 359     rdram_write_many_u16(rgba, address, 16);
 360 }
 361
 362 static void EmitTilesMode0(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
 363 {
 364     unsigned int i;
 365
 366     unsigned int y_offset = 0;
 367     unsigned int u_offset = 2 * SUBBLOCK_SIZE;
 368
 369     for (i = 0; i < 8; ++i) {
 370         emit_line(&macroblock[y_offset], &macroblock[u_offset], address);
 371
 372         y_offset += 8;
 373         u_offset += 8;
 374         address += 32;
 375     }
 376 }
 377
 378 static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
 379 {
 380     unsigned int i;
 381
 382     unsigned int y_offset = 0;
 383     unsigned int u_offset = 4 * SUBBLOCK_SIZE;
 384
 385     for (i = 0; i < 8; ++i) {
 386         emit_line(&macroblock[y_offset],     &macroblock[u_offset], address);
 387         emit_line(&macroblock[y_offset + 8], &macroblock[u_offset], address + 32);
 388
 389         y_offset += (i == 3) ? SUBBLOCK_SIZE + 16 : 16;
 390         u_offset += 8;
 391         address += 64;
 392     }
 393 }
 394
 395 static void decode_macroblock_ob(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable)
 396 {
 397     int sb;
 398
 399     for (sb = 0; sb < 6; ++sb) {
 400         int16_t tmp_sb[SUBBLOCK_SIZE];
 401
 402         /* update DC */
 403         int32_t dc = (int32_t)macroblock[0];
 404         switch (sb) {
 405         case 0:
 406         case 1:
 407         case 2:
 408         case 3:
 409             *y_dc += dc;
 410             macroblock[0] = *y_dc & 0xffff;
 411             break;
 412         case 4:
 413             *u_dc += dc;
 414             macroblock[0] = *u_dc & 0xffff;
 415             break;
 416         case 5:
 417             *v_dc += dc;
 418             macroblock[0] = *v_dc & 0xffff;
 419             break;
 420         }
 421
 422         ZigZagSubBlock(tmp_sb, macroblock);
 423         if (qtable != NULL)
 424             MultSubBlocks(tmp_sb, tmp_sb, qtable, 0);
 425         TransposeSubBlock(macroblock, tmp_sb);
 426         InverseDCTSubBlock(macroblock, macroblock);
 427
 428         macroblock += SUBBLOCK_SIZE;
 429     }
 430 }
 431
 432 static void decode_macroblock_std(const subblock_transform_t transform_luma,
 433                                   const subblock_transform_t transform_chroma,
 434                                   int16_t *macroblock,
 435                                   unsigned int subblock_count,
 436                                   const int16_t qtables[3][SUBBLOCK_SIZE])
 437 {
 438     unsigned int sb;
 439     unsigned int q = 0;
 440
 441     for (sb = 0; sb < subblock_count; ++sb) {
 442         int16_t tmp_sb[SUBBLOCK_SIZE];
 443         const int isChromaSubBlock = (subblock_count - sb <= 2);
 444
 445         if (isChromaSubBlock)
 446             ++q;
 447
 448         MultSubBlocks(macroblock, macroblock, qtables[q], 4);
 449         ZigZagSubBlock(tmp_sb, macroblock);
 450         InverseDCTSubBlock(macroblock, tmp_sb);
 451
 452         if (isChromaSubBlock) {
 453             if (transform_chroma != NULL)
 454                 transform_chroma(macroblock, macroblock);
 455         } else {
 456             if (transform_luma != NULL)
 457                 transform_luma(macroblock, macroblock);
 458         }
 459
 460         macroblock += SUBBLOCK_SIZE;
 461     }
 462 }
 463
 464 static void TransposeSubBlock(int16_t *dst, const int16_t *src)
 465 {
 466     ReorderSubBlock(dst, src, TRANSPOSE_TABLE);
 467 }
 468
 469 static void ZigZagSubBlock(int16_t *dst, const int16_t *src)
 470 {
 471     ReorderSubBlock(dst, src, ZIGZAG_TABLE);
 472 }
 473
 474 static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table)
 475 {
 476     unsigned int i;
 477
 478     /* source and destination sublocks cannot overlap */
 479     assert(abs(dst - src) > SUBBLOCK_SIZE);
 480
 481     for (i = 0; i < SUBBLOCK_SIZE; ++i)
 482         dst[i] = src[table[i]];
 483 }
 484
 485 static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift)
 486 {
 487     unsigned int i;
 488
 489     for (i = 0; i < SUBBLOCK_SIZE; ++i) {
 490         int32_t v = src1[i] * src2[i];
 491         dst[i] = clamp_s16(v) << shift;
 492     }
 493 }
 494
 495 static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale)
 496 {
 497     unsigned int i;
 498
 499     for (i = 0; i < SUBBLOCK_SIZE; ++i) {
 500         int32_t v = src[i] * scale;
 501         dst[i] = clamp_s16(v);
 502     }
 503 }
 504
 505 static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift)
 506 {
 507     unsigned int i;
 508
 509     for (i = 0; i < SUBBLOCK_SIZE; ++i)
 510         dst[i] = src[i] >> shift;
 511 }
 512
 513 /***************************************************************************
 514  * Fast 2D IDCT using separable formulation and normalization
 515  * Computations use single precision floats
 516  * Implementation based on Wikipedia :
 517  * http://fr.wikipedia.org/wiki/Transform%C3%A9e_en_cosinus_discr%C3%A8te
 518  **************************************************************************/
 519 static void InverseDCT1D(const float *const x, float *dst, unsigned int stride)
 520 {
 521     float e[4];
 522     float f[4];
 523     float x26, x1357, x15, x37, x17, x35;
 524
 525     x15   = IDCT_K[2] * (x[1] + x[5]);
 526     x37   = IDCT_K[3] * (x[3] + x[7]);
 527     x17   = IDCT_K[8] * (x[1] + x[7]);
 528     x35   = IDCT_K[9] * (x[3] + x[5]);
 529     x1357 = IDCT_C3   * (x[1] + x[3] + x[5] + x[7]);
 530     x26   = IDCT_C6   * (x[2] + x[6]);
 531
 532     f[0] = x[0] + x[4];
 533     f[1] = x[0] - x[4];
 534     f[2] = x26  + IDCT_K[0] * x[2];
 535     f[3] = x26  + IDCT_K[1] * x[6];
 536
 537     e[0] = x1357 + x15 + IDCT_K[4] * x[1] + x17;
 538     e[1] = x1357 + x37 + IDCT_K[6] * x[3] + x35;
 539     e[2] = x1357 + x15 + IDCT_K[5] * x[5] + x35;
 540     e[3] = x1357 + x37 + IDCT_K[7] * x[7] + x17;
 541
 542     *dst = f[0] + f[2] + e[0];
 543     dst += stride;
 544     *dst = f[1] + f[3] + e[1];
 545     dst += stride;
 546     *dst = f[1] - f[3] + e[2];
 547     dst += stride;
 548     *dst = f[0] - f[2] + e[3];
 549     dst += stride;
 550     *dst = f[0] - f[2] - e[3];
 551     dst += stride;
 552     *dst = f[1] - f[3] - e[2];
 553     dst += stride;
 554     *dst = f[1] + f[3] - e[1];
 555     dst += stride;
 556     *dst = f[0] + f[2] - e[0];
 557 }
 558
 559 static void InverseDCTSubBlock(int16_t *dst, const int16_t *src)
 560 {
 561     float x[8];
 562     float block[SUBBLOCK_SIZE];
 563     unsigned int i, j;
 564
 565     /* idct 1d on rows (+transposition) */
 566     for (i = 0; i < 8; ++i) {
 567         for (j = 0; j < 8; ++j)
 568             x[j] = (float)src[i * 8 + j];
 569
 570         InverseDCT1D(x, &block[i], 8);
 571     }
 572
 573     /* idct 1d on columns (thanks to previous transposition) */
 574     for (i = 0; i < 8; ++i) {
 575         InverseDCT1D(&block[i * 8], x, 1);
 576
 577         /* C4 = 1 normalization implies a division by 8 */
 578         for (j = 0; j < 8; ++j)
 579             dst[i + j * 8] = (int16_t)x[j] >> 3;
 580     }
 581 }
 582
 583 static void RescaleYSubBlock(int16_t *dst, const int16_t *src)
 584 {
 585     unsigned int i;
 586
 587     for (i = 0; i < SUBBLOCK_SIZE; ++i)
 588         dst[i] = (((uint32_t)(clamp_s12(src[i]) + 0x800) * 0xdb0) >> 16) + 0x10;
 589 }
 590
 591 static void RescaleUVSubBlock(int16_t *dst, const int16_t *src)
 592 {
 593     unsigned int i;
 594
 595     for (i = 0; i < SUBBLOCK_SIZE; ++i)
 596         dst[i] = (((int)clamp_s12(src[i]) * 0xe00) >> 16) + 0x80;
 597 }
 598
 599
 600
 601 /* FIXME: assume presence of expansion pack */
 602 #define MEMMASK 0x7fffff
 603
 604 static void rdram_read_many_u16(uint16_t *dst, uint32_t address, unsigned int count)
 605 {
 606     while (count != 0) {
 607         uint16_t s = rsp.RDRAM[((address++)^S8) & MEMMASK];
 608         s <<= 8;
 609         s |= rsp.RDRAM[((address++)^S8) & MEMMASK];
 610
 611         *(dst++) = s;
 612
 613         --count;
 614     }
 615 }
 616
 617 static void rdram_write_many_u16(const uint16_t *src, uint32_t address, unsigned int count)
 618 {
 619     while (count != 0) {
 620         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 8);
 621         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*(src++) & 0xff);
 622
 623         --count;
 624     }
 625 }
 626
 627 static uint32_t rdram_read_u32(uint32_t address)
 628 {
 629     uint32_t r = rsp.RDRAM[((address++) ^ S8) & MEMMASK];
 630     r <<= 8;
 631     r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK];
 632     r <<= 8;
 633     r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK];
 634     r <<= 8;
 635     r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK];
 636
 637     return r;
 638 }
 639
 640 static void rdram_write_many_u32(const uint32_t *src, uint32_t address, unsigned int count)
 641 {
 642     while (count != 0) {
 643         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 24);
 644         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 16);
 645         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 8);
 646         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*(src++) & 0xff);
 647
 648         --count;
 649     }
 650 }
 651