source/mupen64plus-rsp-hle/src/jpeg.c

   1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
   2  *   Mupen64plus-rsp-hle - jpeg.c                                          *
   3  *   Mupen64Plus homepage: http://code.google.com/p/mupen64plus/           *
   4  *   Copyright (C) 2012 Bobby Smiles                                       *
   5  *   Copyright (C) 2009 Richard Goedeken                                   *
   6  *   Copyright (C) 2002 Hacktarux                                          *
   7  *                                                                         *
   8  *   This program is free software; you can redistribute it and/or modify  *
   9  *   it under the terms of the GNU General Public License as published by  *
  10  *   the Free Software Foundation; either version 2 of the License, or     *
  11  *   (at your option) any later version.                                   *
  12  *                                                                         *
  13  *   This program is distributed in the hope that it will be useful,       *
  14  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
  15  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
  16  *   GNU General Public License for more details.                          *
  17  *                                                                         *
  18  *   You should have received a copy of the GNU General Public License     *
  19  *   along with this program; if not, write to the                         *
  20  *   Free Software Foundation, Inc.,                                       *
  21  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
  22  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  23
  24 #include <assert.h>
  25 #include <stdlib.h>
  26 #include <stdint.h>
  27
  28 #define M64P_PLUGIN_PROTOTYPES 1
  29 #include "m64p_types.h"
  30 #include "m64p_plugin.h"
  31 #include "hle.h"
  32
/* number of coefficients in one subblock (8 rows of 8 values) */
#define SUBBLOCK_SIZE 64

/* callback writing one tile line to RDRAM at `address`, given pointers into
 * the luma (y) and chroma (u) data of a decoded macroblock */
typedef void (*tile_line_emitter_t)(const int16_t *y, const int16_t *u, uint32_t address);
/* callback decoding, in place, a macroblock made of `subblock_count` subblocks
 * with the help of the three quantization tables in `qtables` */
typedef void (*std_macroblock_decoder_t)(int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE]);
  37
/* rdram operations */
// FIXME: these functions deserve their own module
static void rdram_read_many_u16(uint16_t *dst, uint32_t address, unsigned int count);
static void rdram_write_many_u16(const uint16_t *src, uint32_t address, unsigned int count);
static uint32_t rdram_read_u32(uint32_t address);
static void rdram_write_many_u32(const uint32_t *src, uint32_t address, unsigned int count);

/* standard jpeg ucode decoder */
static void jpeg_decode_std(const char * const version, const std_macroblock_decoder_t decode_mb, const tile_line_emitter_t emit_line);

/* helper functions */
static uint8_t clamp_u8(int16_t x);
static int16_t clamp_s12(int16_t x);
static int16_t clamp_s16(int32_t x);
static uint16_t clamp_RGBA_component(int16_t x);

/* pixel conversion & formatting */
static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v);
static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v);

/* tile line emitters */
static void EmitYUVTileLine(const int16_t *y, const int16_t *u, uint32_t address);
static void EmitRGBATileLine(const int16_t *y, const int16_t *u, uint32_t address);

/* macroblocks operations */
static void DecodeMacroblock1(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable);
static void DecodeMacroblock2(int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE]);
static void DecodeMacroblock3(int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE]);
static void EmitTilesMode0(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);

/* subblocks operations */
static void TransposeSubBlock(int16_t *dst, const int16_t *src);
static void ZigZagSubBlock(int16_t *dst, const int16_t *src);
static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table);
static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift);
static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale);
static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift);
static void InverseDCT1D(const float * const x, float *dst, unsigned int stride);
static void InverseDCTSubBlock(int16_t *dst, const int16_t *src);
static void RescaleYSubBlock(int16_t *dst, const int16_t *src);
static void RescaleUVSubBlock(int16_t *dst, const int16_t *src);
  80
/* transposed dequantization table
 * jpeg_decode_OB derives its working table from this one, either multiplying
 * it by the task's qscale or right-shifting it by -qscale */
static const int16_t DEFAULT_QTABLE[SUBBLOCK_SIZE] =
{
    16, 12, 14, 14,  18,  24,  49,  72,
    11, 12, 13, 17,  22,  35,  64,  92,
    10, 14, 16, 22,  37,  55,  78,  95,
    16, 19, 24, 29,  56,  64,  87,  98,
    24, 26, 40, 51,  68,  81, 103, 112,
    40, 58, 57, 87, 109, 104, 121, 100,
    51, 60, 69, 80, 103, 113, 120, 103,
    61, 55, 56, 62,  77,  92, 101,  99
};
  93
/* zig-zag indices
 * entry i is the source index copied to destination index i by ZigZagSubBlock */
static const unsigned int ZIGZAG_TABLE[SUBBLOCK_SIZE] =
{
     0,  1,  5,  6, 14, 15, 27, 28,
     2,  4,  7, 13, 16, 26, 29, 42,
     3,  8, 12, 17, 25, 30, 41, 43,
     9, 11, 18, 24, 31, 40, 44, 53,
    10, 19, 23, 32, 39, 45, 52, 54,
    20, 22, 33, 38, 46, 51, 55, 60,
    21, 34, 37, 47, 50, 56, 59, 61,
    35, 36, 48, 49, 57, 58, 62, 63
};
 106
/* transposition indices
 * entry i is the source index copied to destination index i by
 * TransposeSubBlock (row r, column c reads from row c, column r) */
static const unsigned int TRANSPOSE_TABLE[SUBBLOCK_SIZE] =
{
    0,  8, 16, 24, 32, 40, 48, 56,
    1,  9, 17, 25, 33, 41, 49, 57,
    2, 10, 18, 26, 34, 42, 50, 58,
    3, 11, 19, 27, 35, 43, 51, 59,
    4, 12, 20, 28, 36, 44, 52, 60,
    5, 13, 21, 29, 37, 45, 53, 61,
    6, 14, 22, 30, 38, 46, 54, 62,
    7, 15, 23, 31, 39, 47, 55, 63
};
 119
 120
 121
/* IDCT related constants
 * Cn = alpha * cos(n * PI / 16) (alpha is chosen such that C4 = 1)
 * All of them are consumed by InverseDCT1D. */
static const float IDCT_C3 = 1.175875602f;
static const float IDCT_C6 = 0.541196100f;
static const float IDCT_K[10] =
{
  0.765366865f,   /*  C2-C6         */
 -1.847759065f,   /* -C2-C6         */
 -0.390180644f,   /*  C5-C3         */
 -1.961570561f,   /* -C5-C3         */
  1.501321110f,   /*  C1+C3-C5-C7   */
  2.053119869f,   /*  C1+C3-C5+C7   */
  3.072711027f,   /*  C1+C3+C5-C7   */
  0.298631336f,   /* -C1+C3+C5-C7   */
 -0.899976223f,   /*  C7-C3         */
 -2.562915448f    /* -C1-C3         */
};
 139
 140
 141 /* global functions */
 142
 143 /***************************************************************************
 144  * JPEG decoding ucode found in Japanese exclusive version of Pokemon Stadium.
 145  **************************************************************************/
 146 void jpeg_decode_PS0()
 147 {
 148     jpeg_decode_std("PS0", DecodeMacroblock3, EmitYUVTileLine);
 149 }
 150
 151 /***************************************************************************
 152  * JPEG decoding ucode found in Ocarina of Time, Pokemon Stadium 1 and
 153  * Pokemon Stadium 2.
 154  **************************************************************************/
 155 void jpeg_decode_PS()
 156 {
 157     jpeg_decode_std("PS", DecodeMacroblock2, EmitRGBATileLine);
 158 }
 159
 160 /***************************************************************************
 161  * JPEG decoding ucode found in Ogre Battle and Bottom of the 9th.
 162  **************************************************************************/
 163 void jpeg_decode_OB()
 164 {
 165     int16_t qtable[SUBBLOCK_SIZE];
 166     unsigned int mb;
 167
 168     int32_t y_dc = 0;
 169     int32_t u_dc = 0;
 170     int32_t v_dc = 0;
 171
 172     const OSTask_t * const task = get_task();
 173
 174     uint32_t           address          = task->data_ptr;
 175     const unsigned int macroblock_count = task->data_size;
 176     const int          qscale           = task->yield_data_size;
 177
 178     DebugMessage(M64MSG_VERBOSE, "jpeg_decode_OB: *buffer=%x, #MB=%d, qscale=%d",
 179             address,
 180             macroblock_count,
 181             qscale);
 182
 183     if (qscale != 0)
 184     {
 185         if (qscale > 0)
 186         {
 187             ScaleSubBlock(qtable, DEFAULT_QTABLE, qscale);
 188         }
 189         else
 190         {
 191             RShiftSubBlock(qtable, DEFAULT_QTABLE, -qscale);
 192         }
 193     }
 194
 195     for (mb = 0; mb < macroblock_count; ++mb)
 196     {
 197         int16_t macroblock[6*SUBBLOCK_SIZE];
 198
 199         rdram_read_many_u16((uint16_t*)macroblock, address, 6*SUBBLOCK_SIZE);
 200         DecodeMacroblock1(macroblock, &y_dc, &u_dc, &v_dc, (qscale != 0) ? qtable : NULL);
 201         EmitTilesMode2(EmitYUVTileLine, macroblock, address);
 202
 203         address += (2*6*SUBBLOCK_SIZE);
 204     }
 205 }
 206
 207
 208 /* local functions */
 209 static void jpeg_decode_std(const char * const version, const std_macroblock_decoder_t decode_mb, const tile_line_emitter_t emit_line)
 210 {
 211     int16_t qtables[3][SUBBLOCK_SIZE];
 212     unsigned int mb;
 213     uint32_t address;
 214     uint32_t macroblock_count;
 215     uint32_t mode;
 216     uint32_t qtableY_ptr;
 217     uint32_t qtableU_ptr;
 218     uint32_t qtableV_ptr;
 219     unsigned int subblock_count;
 220     unsigned int macroblock_size;
 221     int16_t *macroblock;
 222     const OSTask_t * const task = get_task();
 223
 224     if (task->flags & 0x1)
 225     {
 226         DebugMessage(M64MSG_WARNING, "jpeg_decode_%s: task yielding not implemented", version);
 227         return;
 228     }
 229
 230     address          = rdram_read_u32(task->data_ptr);
 231     macroblock_count = rdram_read_u32(task->data_ptr + 4);
 232     mode             = rdram_read_u32(task->data_ptr + 8);
 233     qtableY_ptr      = rdram_read_u32(task->data_ptr + 12);
 234     qtableU_ptr      = rdram_read_u32(task->data_ptr + 16);
 235     qtableV_ptr      = rdram_read_u32(task->data_ptr + 20);
 236
 237     DebugMessage(M64MSG_VERBOSE, "jpeg_decode_%s: *buffer=%x, #MB=%d, mode=%d, *Qy=%x, *Qu=%x, *Qv=%x",
 238             version,
 239             address,
 240             macroblock_count,
 241             mode,
 242             qtableY_ptr,
 243             qtableU_ptr,
 244             qtableV_ptr);
 245
 246     if (mode != 0 && mode != 2)
 247     {
 248         DebugMessage(M64MSG_WARNING, "jpeg_decode_%s: invalid mode %d", version, mode);
 249         return;
 250     }
 251
 252     subblock_count = mode + 4;
 253     macroblock_size = 2*subblock_count*SUBBLOCK_SIZE;
 254
 255     rdram_read_many_u16((uint16_t*)qtables[0], qtableY_ptr, SUBBLOCK_SIZE);
 256     rdram_read_many_u16((uint16_t*)qtables[1], qtableU_ptr, SUBBLOCK_SIZE);
 257     rdram_read_many_u16((uint16_t*)qtables[2], qtableV_ptr, SUBBLOCK_SIZE);
 258
 259     macroblock = malloc(sizeof(*macroblock) * macroblock_size);
 260     if (!macroblock)
 261     {
 262         DebugMessage(M64MSG_WARNING, "jpeg_decode_%s: could not allocate macroblock", version);
 263         return;
 264     }
 265
 266     for (mb = 0; mb < macroblock_count; ++mb)
 267     {
 268         rdram_read_many_u16((uint16_t*)macroblock, address, macroblock_size >> 1);
 269         decode_mb(macroblock, subblock_count, (const int16_t (*)[SUBBLOCK_SIZE])qtables);
 270
 271         if (mode == 0)
 272         {
 273             EmitTilesMode0(emit_line, macroblock, address);
 274         }
 275         else
 276         {
 277             EmitTilesMode2(emit_line, macroblock, address);
 278         }
 279
 280         address += macroblock_size;
 281     }
 282     free(macroblock);
 283 }
 284
 285 static uint8_t clamp_u8(int16_t x)
 286 {
 287     return (x & (0xff00)) ? ((-x) >> 15) & 0xff : x;
 288 }
 289
 290 static int16_t clamp_s12(int16_t x)
 291 {
 292     if (x < -0x800) { x = -0x800; } else if (x > 0x7f0) { x = 0x7f0; }
 293     return x;
 294 }
 295
 296 static int16_t clamp_s16(int32_t x)
 297 {
 298     if (x > 32767) { x = 32767; } else if (x < -32768) { x = -32768; }
 299     return x;
 300 }
 301
 302 static uint16_t clamp_RGBA_component(int16_t x)
 303 {
 304     if (x > 0xff0) { x = 0xff0; } else if (x < 0) { x = 0; }
 305     return (x & 0xf80);
 306 }
 307
 308 static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v)
 309 {
 310     return (uint32_t)clamp_u8(u)  << 24
 311         |  (uint32_t)clamp_u8(y1) << 16
 312         |  (uint32_t)clamp_u8(v)  << 8
 313         |  (uint32_t)clamp_u8(y2);
 314 }
 315
 316 static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v)
 317 {
 318     const float fY = (float)y + 2048.0f;
 319     const float fU = (float)u;
 320     const float fV = (float)v;
 321
 322     const uint16_t r = clamp_RGBA_component((int16_t)(fY             + 1.4025*fV));
 323     const uint16_t g = clamp_RGBA_component((int16_t)(fY - 0.3443*fU - 0.7144*fV));
 324     const uint16_t b = clamp_RGBA_component((int16_t)(fY + 1.7729*fU            ));
 325
 326     return (r << 4) | (g >> 1) | (b >> 6) | 1;
 327 }
 328
 329 static void EmitYUVTileLine(const int16_t *y, const int16_t *u, uint32_t address)
 330 {
 331     uint32_t uyvy[8];
 332
 333     const int16_t * const v  = u + SUBBLOCK_SIZE;
 334     const int16_t * const y2 = y + SUBBLOCK_SIZE;
 335
 336     uyvy[0] = GetUYVY(y[0],  y[1],  u[0], v[0]);
 337     uyvy[1] = GetUYVY(y[2],  y[3],  u[1], v[1]);
 338     uyvy[2] = GetUYVY(y[4],  y[5],  u[2], v[2]);
 339     uyvy[3] = GetUYVY(y[6],  y[7],  u[3], v[3]);
 340     uyvy[4] = GetUYVY(y2[0], y2[1], u[4], v[4]);
 341     uyvy[5] = GetUYVY(y2[2], y2[3], u[5], v[5]);
 342     uyvy[6] = GetUYVY(y2[4], y2[5], u[6], v[6]);
 343     uyvy[7] = GetUYVY(y2[6], y2[7], u[7], v[7]);
 344
 345     rdram_write_many_u32(uyvy, address, 8);
 346 }
 347
 348 static void EmitRGBATileLine(const int16_t *y, const int16_t *u, uint32_t address)
 349 {
 350     uint16_t rgba[16];
 351
 352     const int16_t * const v  = u + SUBBLOCK_SIZE;
 353     const int16_t * const y2 = y + SUBBLOCK_SIZE;
 354
 355     rgba[0]  = GetRGBA(y[0],  u[0], v[0]);
 356     rgba[1]  = GetRGBA(y[1],  u[0], v[0]);
 357     rgba[2]  = GetRGBA(y[2],  u[1], v[1]);
 358     rgba[3]  = GetRGBA(y[3],  u[1], v[1]);
 359     rgba[4]  = GetRGBA(y[4],  u[2], v[2]);
 360     rgba[5]  = GetRGBA(y[5],  u[2], v[2]);
 361     rgba[6]  = GetRGBA(y[6],  u[3], v[3]);
 362     rgba[7]  = GetRGBA(y[7],  u[3], v[3]);
 363     rgba[8]  = GetRGBA(y2[0], u[4], v[4]);
 364     rgba[9]  = GetRGBA(y2[1], u[4], v[4]);
 365     rgba[10] = GetRGBA(y2[2], u[5], v[5]);
 366     rgba[11] = GetRGBA(y2[3], u[5], v[5]);
 367     rgba[12] = GetRGBA(y2[4], u[6], v[6]);
 368     rgba[13] = GetRGBA(y2[5], u[6], v[6]);
 369     rgba[14] = GetRGBA(y2[6], u[7], v[7]);
 370     rgba[15] = GetRGBA(y2[7], u[7], v[7]);
 371
 372     rdram_write_many_u16(rgba, address, 16);
 373 }
 374
 375 static void EmitTilesMode0(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
 376 {
 377     unsigned int i;
 378
 379     unsigned int y_offset = 0;
 380     unsigned int u_offset = 2*SUBBLOCK_SIZE;
 381
 382     for (i = 0; i < 8; ++i)
 383     {
 384         emit_line(&macroblock[y_offset], &macroblock[u_offset], address);
 385
 386         y_offset += 8;
 387         u_offset += 8;
 388         address += 32;
 389     }
 390 }
 391
 392 static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
 393 {
 394     unsigned int i;
 395
 396     unsigned int y_offset = 0;
 397     unsigned int u_offset = 4*SUBBLOCK_SIZE;
 398
 399     for (i = 0; i < 8; ++i)
 400     {
 401         emit_line(&macroblock[y_offset],     &macroblock[u_offset], address);
 402         emit_line(&macroblock[y_offset + 8], &macroblock[u_offset], address + 32);
 403
 404         y_offset += (i == 3) ? SUBBLOCK_SIZE+16 : 16;
 405         u_offset += 8;
 406         address += 64;
 407     }
 408 }
 409
 410 static void DecodeMacroblock1(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable)
 411 {
 412     int sb;
 413
 414     for (sb = 0; sb < 6; ++sb)
 415     {
 416         int16_t tmp_sb[SUBBLOCK_SIZE];
 417
 418         /* update DC */
 419         int32_t dc = (int32_t)macroblock[0];
 420         switch(sb)
 421         {
 422         case 0: case 1: case 2: case 3:
 423                 *y_dc += dc; macroblock[0] = *y_dc & 0xffff; break;
 424         case 4: *u_dc += dc; macroblock[0] = *u_dc & 0xffff; break;
 425         case 5: *v_dc += dc; macroblock[0] = *v_dc & 0xffff; break;
 426         }
 427
 428         ZigZagSubBlock(tmp_sb, macroblock);
 429         if (qtable != NULL) { MultSubBlocks(tmp_sb, tmp_sb, qtable, 0); }
 430         TransposeSubBlock(macroblock, tmp_sb);
 431         InverseDCTSubBlock(macroblock, macroblock);
 432
 433         macroblock += SUBBLOCK_SIZE;
 434     }
 435 }
 436
 437 static void DecodeMacroblock2(int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE])
 438 {
 439     unsigned int sb;
 440     unsigned int q = 0;
 441
 442     for (sb = 0; sb < subblock_count; ++sb)
 443     {
 444         int16_t tmp_sb[SUBBLOCK_SIZE];
 445         const int isChromaSubBlock = (subblock_count - sb <= 2);
 446
 447         if (isChromaSubBlock) { ++q; }
 448
 449         MultSubBlocks(macroblock, macroblock, qtables[q], 4);
 450         ZigZagSubBlock(tmp_sb, macroblock);
 451         InverseDCTSubBlock(macroblock, tmp_sb);
 452
 453         macroblock += SUBBLOCK_SIZE;
 454     }
 455
 456 }
 457
 458 static void DecodeMacroblock3(int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE])
 459 {
 460     unsigned int sb;
 461     unsigned int q = 0;
 462
 463     for (sb = 0; sb < subblock_count; ++sb)
 464     {
 465         int16_t tmp_sb[SUBBLOCK_SIZE];
 466         const int isChromaSubBlock = (subblock_count - sb <= 2);
 467
 468         if (isChromaSubBlock) { ++q; }
 469
 470         MultSubBlocks(macroblock, macroblock, qtables[q], 4);
 471         ZigZagSubBlock(tmp_sb, macroblock);
 472         InverseDCTSubBlock(macroblock, tmp_sb);
 473
 474         if (isChromaSubBlock)
 475         {
 476             RescaleUVSubBlock(macroblock, macroblock);
 477         }
 478         else
 479         {
 480             RescaleYSubBlock(macroblock, macroblock);
 481         }
 482
 483         macroblock += SUBBLOCK_SIZE;
 484     }
 485 }
 486
 487 static void TransposeSubBlock(int16_t *dst, const int16_t *src)
 488 {
 489     ReorderSubBlock(dst, src, TRANSPOSE_TABLE);
 490 }
 491
 492 static void ZigZagSubBlock(int16_t *dst, const int16_t *src)
 493 {
 494     ReorderSubBlock(dst, src, ZIGZAG_TABLE);
 495 }
 496
 497 static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table)
 498 {
 499     unsigned int i;
 500
 501     /* source and destination sublocks cannot overlap */
 502     assert(abs(dst - src) > SUBBLOCK_SIZE);
 503
 504     for (i = 0; i < SUBBLOCK_SIZE; ++i)
 505     {
 506         dst[i] = src[table[i]];
 507     }
 508 }
 509
 510 static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift)
 511 {
 512     unsigned int i;
 513
 514     for (i = 0; i < SUBBLOCK_SIZE; ++i)
 515     {
 516         int32_t v = src1[i] * src2[i];
 517         dst[i] = clamp_s16(v) << shift;
 518     }
 519 }
 520
 521 static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale)
 522 {
 523     unsigned int i;
 524
 525     for (i = 0; i < SUBBLOCK_SIZE; ++i)
 526     {
 527         int32_t v = src[i] * scale;
 528         dst[i] = clamp_s16(v);
 529     }
 530 }
 531
 532 static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift)
 533 {
 534     unsigned int i;
 535
 536     for (i = 0; i < SUBBLOCK_SIZE; ++i)
 537     {
 538         dst[i] = src[i] >> shift;
 539     }
 540 }
 541
 542 /***************************************************************************
 543  * Fast 2D IDCT using separable formulation and normalization
 544  * Computations use single precision floats
 545  * Implementation based on Wikipedia :
 546  * http://fr.wikipedia.org/wiki/Transform%C3%A9e_en_cosinus_discr%C3%A8te
 547  **************************************************************************/
 548 static void InverseDCT1D(const float * const x, float *dst, unsigned int stride)
 549 {
 550     float e[4];
 551     float f[4];
 552     float x26, x1357, x15, x37, x17, x35;
 553
 554     x15   = IDCT_K[2] * (x[1] + x[5]);
 555     x37   = IDCT_K[3] * (x[3] + x[7]);
 556     x17   = IDCT_K[8] * (x[1] + x[7]);
 557     x35   = IDCT_K[9] * (x[3] + x[5]);
 558     x1357 = IDCT_C3   * (x[1] + x[3] + x[5] + x[7]);
 559     x26   = IDCT_C6   * (x[2] + x[6]);
 560
 561     f[0] = x[0] + x[4];
 562     f[1] = x[0] - x[4];
 563     f[2] = x26  + IDCT_K[0]*x[2];
 564     f[3] = x26  + IDCT_K[1]*x[6];
 565
 566     e[0] = x1357 + x15 + IDCT_K[4]*x[1] + x17;
 567     e[1] = x1357 + x37 + IDCT_K[6]*x[3] + x35;
 568     e[2] = x1357 + x15 + IDCT_K[5]*x[5] + x35;
 569     e[3] = x1357 + x37 + IDCT_K[7]*x[7] + x17;
 570
 571     *dst = f[0] + f[2] + e[0]; dst += stride;
 572     *dst = f[1] + f[3] + e[1]; dst += stride;
 573     *dst = f[1] - f[3] + e[2]; dst += stride;
 574     *dst = f[0] - f[2] + e[3]; dst += stride;
 575     *dst = f[0] - f[2] - e[3]; dst += stride;
 576     *dst = f[1] - f[3] - e[2]; dst += stride;
 577     *dst = f[1] + f[3] - e[1]; dst += stride;
 578     *dst = f[0] + f[2] - e[0]; dst += stride;
 579 }
 580
 581 static void InverseDCTSubBlock(int16_t *dst, const int16_t *src)
 582 {
 583     float x[8];
 584     float block[SUBBLOCK_SIZE];
 585     unsigned int i, j;
 586
 587     /* idct 1d on rows (+transposition) */
 588     for (i = 0; i < 8; ++i)
 589     {
 590         for (j = 0; j < 8; ++j)
 591         {
 592             x[j] = (float)src[i*8+j];
 593         }
 594
 595         InverseDCT1D(x, &block[i], 8);
 596     }
 597
 598     /* idct 1d on columns (thanks to previous transposition) */
 599     for (i = 0; i < 8; ++i)
 600     {
 601         InverseDCT1D(&block[i*8], x, 1);
 602
 603         /* C4 = 1 normalization implies a division by 8 */
 604         for (j = 0; j < 8; ++j)
 605         {
 606             dst[i+j*8] = (int16_t)x[j] >> 3;
 607         }
 608     }
 609 }
 610
 611 static void RescaleYSubBlock(int16_t *dst, const int16_t *src)
 612 {
 613     unsigned int i;
 614
 615     for (i = 0; i < SUBBLOCK_SIZE; ++i)
 616     {
 617         dst[i] = (((uint32_t)(clamp_s12(src[i]) + 0x800) * 0xdb0) >> 16) + 0x10;
 618     }
 619 }
 620
 621 static void RescaleUVSubBlock(int16_t *dst, const int16_t *src)
 622 {
 623     unsigned int i;
 624
 625     for (i = 0; i < SUBBLOCK_SIZE; ++i)
 626     {
 627         dst[i] = (((int)clamp_s12(src[i]) * 0xe00) >> 16) + 0x80;
 628     }
 629 }
 630
 631
 632
/* FIXME: assume presence of expansion pack */
/* mask applied to every byte address before indexing rsp.RDRAM */
#define MEMMASK 0x7fffff
 635
 636 static void rdram_read_many_u16(uint16_t *dst, uint32_t address, unsigned int count)
 637 {
 638     while (count != 0)
 639     {
 640         uint16_t s = rsp.RDRAM[((address++)^S8) & MEMMASK];
 641         s <<= 8;
 642         s |= rsp.RDRAM[((address++)^S8) & MEMMASK];
 643
 644         *(dst++) = s;
 645
 646         --count;
 647     }
 648 }
 649
 650 static void rdram_write_many_u16(const uint16_t *src, uint32_t address, unsigned int count)
 651 {
 652     while (count != 0)
 653     {
 654         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 8);
 655         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*(src++) & 0xff);
 656
 657         --count;
 658     }
 659 }
 660
 661 static uint32_t rdram_read_u32(uint32_t address)
 662 {
 663     uint32_t r = rsp.RDRAM[((address++) ^ S8) & MEMMASK]; r <<= 8;
 664     r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK]; r <<= 8;
 665     r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK]; r <<= 8;
 666     r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK];
 667
 668     return r;
 669 }
 670
 671 static void rdram_write_many_u32(const uint32_t *src, uint32_t address, unsigned int count)
 672 {
 673     while (count != 0)
 674     {
 675         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 24);
 676         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 16);
 677         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 8);
 678         rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*(src++) & 0xff);
 679
 680         --count;
 681     }
 682 }
 683