[mupen64plus-pandora.git] / source / mupen64plus-rsp-hle / src / jpeg.c

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *   Mupen64plus-rsp-hle - jpeg.c                                          *
 *   Mupen64Plus homepage: http://code.google.com/p/mupen64plus/           *
 *   Copyright (C) 2012 Bobby Smiles                                       *
 *   Copyright (C) 2009 Richard Goedeken                                   *
 *   Copyright (C) 2002 Hacktarux                                          *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

#include <stdint.h>
#include <assert.h>
#include <stdlib.h>

#define M64P_PLUGIN_PROTOTYPES 1
#include "m64p_types.h"
#include "m64p_plugin.h"
#include "hle.h"
#include "jpeg.h"

#define SUBBLOCK_SIZE 64

/* Callback writing one line of a tile to rdram at `address`.
 * `y` points to the luma samples of the line and `u` to the matching chroma
 * samples; both emitters in this file read the V samples at u + SUBBLOCK_SIZE
 * and the right-hand luma samples at y + SUBBLOCK_SIZE. */
typedef void (*tile_line_emitter_t)(const int16_t *y, const int16_t *u, uint32_t address);
/* Callback applied to a subblock (SUBBLOCK_SIZE samples) after the inverse DCT.
 * It is invoked with dst == src by decode_macroblock_std. */
typedef void (*subblock_transform_t)(int16_t *dst, const int16_t *src);

/* rdram operations
 * FIXME: these functions deserve their own module
 */
static void rdram_read_many_u16(uint16_t *dst, uint32_t address, unsigned int count);
static void rdram_write_many_u16(const uint16_t *src, uint32_t address, unsigned int count);
static uint32_t rdram_read_u32(uint32_t address);
static void rdram_write_many_u32(const uint32_t *src, uint32_t address, unsigned int count);

/* standard jpeg ucode decoder */
static void jpeg_decode_std(const char *const version,
                            const subblock_transform_t transform_luma,
                            const subblock_transform_t transform_chroma,
                            const tile_line_emitter_t emit_line);

/* helper functions */
static uint8_t clamp_u8(int16_t x);
static int16_t clamp_s12(int16_t x);
static uint16_t clamp_RGBA_component(int16_t x);

/* pixel conversion & formatting */
static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v);
static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v);

/* tile line emitters */
static void EmitYUVTileLine(const int16_t *y, const int16_t *u, uint32_t address);
static void EmitRGBATileLine(const int16_t *y, const int16_t *u, uint32_t address);

/* macroblocks operations */
static void decode_macroblock_ob(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable);
static void decode_macroblock_std(const subblock_transform_t transform_luma,
                                  const subblock_transform_t transform_chroma,
                                  int16_t *macroblock,
                                  unsigned int subblock_count,
                                  const int16_t qtables[3][SUBBLOCK_SIZE]);
static void EmitTilesMode0(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);

/* subblocks operations */
static void TransposeSubBlock(int16_t *dst, const int16_t *src);
static void ZigZagSubBlock(int16_t *dst, const int16_t *src);
static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table);
static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift);
static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale);
static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift);
static void InverseDCT1D(const float *const x, float *dst, unsigned int stride);
static void InverseDCTSubBlock(int16_t *dst, const int16_t *src);
static void RescaleYSubBlock(int16_t *dst, const int16_t *src);
static void RescaleUVSubBlock(int16_t *dst, const int16_t *src);

/* Transposed dequantization table.
 * Used by jpeg_decode_OB as the base table, which is either multiplied by
 * qscale or right-shifted by -qscale before use.
 * NOTE(review): the values look like the transpose of the example luminance
 * quantization table of the JPEG standard - confirm. */
static const int16_t DEFAULT_QTABLE[SUBBLOCK_SIZE] = {
    16, 12, 14, 14,  18,  24,  49,  72,
    11, 12, 13, 17,  22,  35,  64,  92,
    10, 14, 16, 22,  37,  55,  78,  95,
    16, 19, 24, 29,  56,  64,  87,  98,
    24, 26, 40, 51,  68,  81, 103, 112,
    40, 58, 57, 87, 109, 104, 121, 100,
    51, 60, 69, 80, 103, 113, 120, 103,
    61, 55, 56, 62,  77,  92, 101,  99
};

/* Zig-zag indices.
 * Entry i is the index, in the source subblock, of the coefficient that
 * ReorderSubBlock stores at position i of the destination subblock. */
static const unsigned int ZIGZAG_TABLE[SUBBLOCK_SIZE] = {
     0,  1,  5,  6, 14, 15, 27, 28,
     2,  4,  7, 13, 16, 26, 29, 42,
     3,  8, 12, 17, 25, 30, 41, 43,
     9, 11, 18, 24, 31, 40, 44, 53,
    10, 19, 23, 32, 39, 45, 52, 54,
    20, 22, 33, 38, 46, 51, 55, 60,
    21, 34, 37, 47, 50, 56, 59, 61,
    35, 36, 48, 49, 57, 58, 62, 63
};

/* Transposition indices.
 * Entry (row * 8 + col) holds (col * 8 + row), so reordering an 8x8 subblock
 * through this table swaps its rows and columns. */
static const unsigned int TRANSPOSE_TABLE[SUBBLOCK_SIZE] = {
    0,  8, 16, 24, 32, 40, 48, 56,
    1,  9, 17, 25, 33, 41, 49, 57,
    2, 10, 18, 26, 34, 42, 50, 58,
    3, 11, 19, 27, 35, 43, 51, 59,
    4, 12, 20, 28, 36, 44, 52, 60,
    5, 13, 21, 29, 37, 45, 53, 61,
    6, 14, 22, 30, 38, 46, 54, 62,
    7, 15, 23, 31, 39, 47, 55, 63
};


/* IDCT related constants
 * Cn = alpha * cos(n * PI / 16), where alpha is chosen such that C4 = 1
 * (i.e. alpha = sqrt(2)).
 * IDCT_K holds the pre-computed sums and differences of these Cn values that
 * InverseDCT1D uses as multipliers; each entry is annotated with its formula. */
static const float IDCT_C3 = 1.175875602f;
static const float IDCT_C6 = 0.541196100f;
static const float IDCT_K[10] = {
     0.765366865f,   /*  C2-C6         */
    -1.847759065f,   /* -C2-C6         */
    -0.390180644f,   /*  C5-C3         */
    -1.961570561f,   /* -C5-C3         */
     1.501321110f,   /*  C1+C3-C5-C7   */
     2.053119869f,   /*  C1+C3-C5+C7   */
     3.072711027f,   /*  C1+C3+C5-C7   */
     0.298631336f,   /* -C1+C3+C5-C7   */
    -0.899976223f,   /*  C7-C3         */
    -2.562915448f    /* -C1-C3         */
};


/* global functions */

/***************************************************************************
 * JPEG decoding ucode found in Japanese exclusive version of Pokemon Stadium.
 *
 * Runs the standard decoder with RescaleYSubBlock / RescaleUVSubBlock applied
 * to the decoded luma / chroma subblocks, and emits UYVY tile lines.
 **************************************************************************/
void jpeg_decode_PS0(void)
{
    jpeg_decode_std("PS0", RescaleYSubBlock, RescaleUVSubBlock, EmitYUVTileLine);
}

/***************************************************************************
 * JPEG decoding ucode found in Ocarina of Time, Pokemon Stadium 1 and
 * Pokemon Stadium 2.
 *
 * Runs the standard decoder without any per-subblock post-transform (both
 * transform callbacks are NULL) and emits 16-bit RGBA tile lines.
 **************************************************************************/
void jpeg_decode_PS(void)
{
    jpeg_decode_std("PS", NULL, NULL, EmitRGBATileLine);
}

/***************************************************************************
 * JPEG decoding ucode found in Ogre Battle and Bottom of the 9th.
 *
 * Task parameters, as read from the current task:
 *   data_ptr        - rdram address of the macroblock buffer
 *   data_size       - number of macroblocks to decode
 *   yield_data_size - quantization scale (qscale)
 *
 * Each macroblock (6 subblocks) is read from rdram, decoded, and its pixels
 * are written back as UYVY tile lines starting at the address it was read
 * from. The DC predictors are carried over from one macroblock to the next.
 **************************************************************************/
void jpeg_decode_OB(void)
{
    enum { MACROBLOCK_SAMPLES = 6 * SUBBLOCK_SIZE };

    const OSTask_t *const task = get_task();

    const unsigned int macroblock_count = task->data_size;
    const int          qscale           = task->yield_data_size;
    uint32_t           address          = task->data_ptr;

    int16_t macroblock[MACROBLOCK_SAMPLES];
    int16_t qtable[SUBBLOCK_SIZE];
    /* stays NULL when qscale == 0, which disables dequantization */
    const int16_t *active_qtable = NULL;

    /* DC predictors, shared by all macroblocks of the task */
    int32_t y_dc = 0;
    int32_t u_dc = 0;
    int32_t v_dc = 0;

    unsigned int remaining;

    DebugMessage(M64MSG_VERBOSE, "jpeg_decode_OB: *buffer=%x, #MB=%d, qscale=%d",
                 address,
                 macroblock_count,
                 qscale);

    /* positive qscale multiplies the default table, negative one shifts it right */
    if (qscale > 0) {
        ScaleSubBlock(qtable, DEFAULT_QTABLE, qscale);
        active_qtable = qtable;
    } else if (qscale < 0) {
        RShiftSubBlock(qtable, DEFAULT_QTABLE, -qscale);
        active_qtable = qtable;
    }

    for (remaining = macroblock_count; remaining != 0; --remaining) {
        rdram_read_many_u16((uint16_t *)macroblock, address, MACROBLOCK_SAMPLES);
        decode_macroblock_ob(macroblock, &y_dc, &u_dc, &v_dc, active_qtable);
        EmitTilesMode2(EmitYUVTileLine, macroblock, address);

        /* each sample occupies 2 bytes in rdram */
        address += 2 * MACROBLOCK_SAMPLES;
    }
}


/* local functions */
static void jpeg_decode_std(const char *const version,
                            const subblock_transform_t transform_luma,
                            const subblock_transform_t transform_chroma,
                            const tile_line_emitter_t emit_line)
{
    int16_t qtables[3][SUBBLOCK_SIZE];
    unsigned int mb;
    uint32_t address;
    uint32_t macroblock_count;
    uint32_t mode;
    uint32_t qtableY_ptr;
    uint32_t qtableU_ptr;
    uint32_t qtableV_ptr;
    unsigned int subblock_count;
    unsigned int macroblock_size;
    /* macroblock contains at most 6 subblocks */
    int16_t macroblock[6 * SUBBLOCK_SIZE];
    const OSTask_t *const task = get_task();

    if (task->flags & 0x1) {
        DebugMessage(M64MSG_WARNING, "jpeg_decode_%s: task yielding not implemented", version);
        return;
    }

    address          = rdram_read_u32(task->data_ptr);
    macroblock_count = rdram_read_u32(task->data_ptr + 4);
    mode             = rdram_read_u32(task->data_ptr + 8);
    qtableY_ptr      = rdram_read_u32(task->data_ptr + 12);
    qtableU_ptr      = rdram_read_u32(task->data_ptr + 16);
    qtableV_ptr      = rdram_read_u32(task->data_ptr + 20);

    DebugMessage(M64MSG_VERBOSE, "jpeg_decode_%s: *buffer=%x, #MB=%d, mode=%d, *Qy=%x, *Qu=%x, *Qv=%x",
                 version,
                 address,
                 macroblock_count,
                 mode,
                 qtableY_ptr,
                 qtableU_ptr,
                 qtableV_ptr);

    if (mode != 0 && mode != 2) {
        DebugMessage(M64MSG_WARNING, "jpeg_decode_%s: invalid mode %d", version, mode);
        return;
    }

    subblock_count = mode + 4;
    macroblock_size = subblock_count * SUBBLOCK_SIZE;

    rdram_read_many_u16((uint16_t *)qtables[0], qtableY_ptr, SUBBLOCK_SIZE);
    rdram_read_many_u16((uint16_t *)qtables[1], qtableU_ptr, SUBBLOCK_SIZE);
    rdram_read_many_u16((uint16_t *)qtables[2], qtableV_ptr, SUBBLOCK_SIZE);

    for (mb = 0; mb < macroblock_count; ++mb) {
        rdram_read_many_u16((uint16_t *)macroblock, address, macroblock_size);
        decode_macroblock_std(transform_luma, transform_chroma,
                              macroblock, subblock_count, (const int16_t (*)[SUBBLOCK_SIZE])qtables);

        if (mode == 0)
            EmitTilesMode0(emit_line, macroblock, address);
        else
            EmitTilesMode2(emit_line, macroblock, address);

        address += 2 * macroblock_size;
    }
}

/* Saturate a signed 16-bit value to the unsigned 8-bit range [0, 255].
 *
 * Written with explicit comparisons: the previous bit-trick form relied on an
 * arithmetic right shift of a negative value (implementation-defined) and
 * returned 1 instead of 0 for x == INT16_MIN.
 */
static uint8_t clamp_u8(int16_t x)
{
    if (x < 0)
        return 0;
    if (x > 0xff)
        return 0xff;
    return (uint8_t)x;
}

/* Restrict x to the range [-0x800, 0x7f0].
 * Note that the upper bound is 0x7f0, not 0x7ff. */
static int16_t clamp_s12(int16_t x)
{
    const int16_t lower = -0x800;
    const int16_t upper = 0x7f0;

    return (x < lower) ? lower
         : (x > upper) ? upper
         : x;
}

/* Restrict x to [0, 0xff0], then keep only bits 7..11 (mask 0xf80),
 * i.e. the 5 most significant bits of the 12-bit component. */
static uint16_t clamp_RGBA_component(int16_t x)
{
    int16_t bounded = x;

    if (bounded < 0)
        bounded = 0;
    else if (bounded > 0xff0)
        bounded = 0xff0;

    return (uint16_t)(bounded & 0xf80);
}

/* Pack two luma samples and one chroma pair into a 32-bit UYVY word.
 * Each component is saturated to 8 bits; U ends up in the most significant
 * byte, followed by Y1, V and finally Y2 in the least significant byte. */
static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v)
{
    const uint32_t byte_u  = clamp_u8(u);
    const uint32_t byte_y1 = clamp_u8(y1);
    const uint32_t byte_v  = clamp_u8(v);
    const uint32_t byte_y2 = clamp_u8(y2);

    return (byte_u << 24) | (byte_y1 << 16) | (byte_v << 8) | byte_y2;
}

/* Convert one YUV sample to a 16-bit RGBA pixel.
 *
 * y is biased by 2048 before the conversion. Each resulting colour component
 * is reduced by clamp_RGBA_component and the three are packed with red in the
 * top bits, then green, then blue; bit 0 (alpha) is always set.
 *
 * The intermediate colour values are saturated to the int16_t range before
 * being converted: converting an out-of-range floating-point value to an
 * integer type is undefined behaviour, and extreme y/u/v inputs can exceed
 * that range. In-range values are converted exactly as before.
 */
static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v)
{
    const float fY = (float)y + 2048.0f;
    const float fU = (float)u;
    const float fV = (float)v;

    double   channel[3];
    uint16_t component[3];
    unsigned int i;

    channel[0] = fY               + 1.4025 * fV;   /* red   */
    channel[1] = fY - 0.3443 * fU - 0.7144 * fV;   /* green */
    channel[2] = fY + 1.7729 * fU;                 /* blue  */

    for (i = 0; i < 3; ++i) {
        double c = channel[i];

        if (c > 32767.0)
            c = 32767.0;
        else if (c < -32768.0)
            c = -32768.0;

        component[i] = clamp_RGBA_component((int16_t)c);
    }

    return (component[0] << 4) | (component[1] >> 1) | (component[2] >> 6) | 1;
}

/* Emit one 16-pixel line as 8 UYVY words (32 bytes) at `address`.
 *
 * y points to 8 luma samples of the left subblock; the 8 samples of the right
 * subblock are read SUBBLOCK_SIZE entries further. u points to 8 U samples;
 * the matching V samples are read SUBBLOCK_SIZE entries further. Each chroma
 * pair is shared by two horizontally adjacent pixels.
 */
static void EmitYUVTileLine(const int16_t *y, const int16_t *u, uint32_t address)
{
    const int16_t *const y_right = y + SUBBLOCK_SIZE;
    const int16_t *const v       = u + SUBBLOCK_SIZE;

    uint32_t uyvy[8];
    unsigned int i;

    for (i = 0; i < 4; ++i) {
        /* left half of the line */
        uyvy[i]     = GetUYVY(y[2 * i],       y[2 * i + 1],       u[i],     v[i]);
        /* right half of the line */
        uyvy[i + 4] = GetUYVY(y_right[2 * i], y_right[2 * i + 1], u[i + 4], v[i + 4]);
    }

    rdram_write_many_u32(uyvy, address, 8);
}

/* Emit one 16-pixel line as 16 RGBA 16-bit pixels (32 bytes) at `address`.
 *
 * y points to 8 luma samples of the left subblock; the 8 samples of the right
 * subblock are read SUBBLOCK_SIZE entries further. u points to 8 U samples;
 * the matching V samples are read SUBBLOCK_SIZE entries further. Each chroma
 * pair is shared by two horizontally adjacent pixels.
 */
static void EmitRGBATileLine(const int16_t *y, const int16_t *u, uint32_t address)
{
    const int16_t *const y_right = y + SUBBLOCK_SIZE;
    const int16_t *const v       = u + SUBBLOCK_SIZE;

    uint16_t rgba[16];
    unsigned int px;

    for (px = 0; px < 8; ++px) {
        const unsigned int c = px / 2;  /* chroma index shared by a pixel pair */

        rgba[px]     = GetRGBA(y[px],       u[c],     v[c]);
        rgba[px + 8] = GetRGBA(y_right[px], u[c + 4], v[c + 4]);
    }

    rdram_write_many_u16(rgba, address, 16);
}

/* Emit the 8 lines of a mode 0 macroblock (2 luma subblocks followed by the
 * chroma subblocks). Every line has its own chroma row and occupies 32 bytes
 * in rdram. */
static void EmitTilesMode0(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
{
    const int16_t *luma   = macroblock;
    const int16_t *chroma = macroblock + 2 * SUBBLOCK_SIZE;
    unsigned int line;

    for (line = 0; line < 8; ++line) {
        emit_line(luma, chroma, address);

        /* advance by one 8-sample row / one 32-byte output line */
        luma    += 8;
        chroma  += 8;
        address += 32;
    }
}

/* Emit the 16 lines of a mode 2 macroblock (4 luma subblocks followed by the
 * chroma subblocks). Lines are emitted in pairs sharing one chroma row; the
 * first 8 lines come from luma subblocks 0/1, the last 8 from subblocks 2/3.
 * Every line occupies 32 bytes in rdram. */
static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
{
    const int16_t *chroma = macroblock + 4 * SUBBLOCK_SIZE;
    unsigned int half;
    unsigned int pair;

    for (half = 0; half < 2; ++half) {
        /* top half uses subblocks 0 and 1, bottom half subblocks 2 and 3 */
        const int16_t *luma = macroblock + half * 2 * SUBBLOCK_SIZE;

        for (pair = 0; pair < 4; ++pair) {
            emit_line(luma,     chroma, address);
            emit_line(luma + 8, chroma, address + 32);

            luma    += 16;
            chroma  += 8;
            address += 64;
        }
    }
}

/* Decode in place the 6 subblocks of an "OB" macroblock.
 *
 * The first coefficient of every subblock is a delta that is added to the
 * running DC predictor of its plane (subblocks 0-3: *y_dc, 4: *u_dc,
 * 5: *v_dc); the predictors are updated for the caller. The subblock is then
 * reordered through the zig-zag table, optionally multiplied by qtable
 * (skipped when qtable is NULL), transposed and inverse-DCT transformed.
 */
static void decode_macroblock_ob(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable)
{
    int16_t *subblock = macroblock;
    int sb;

    for (sb = 0; sb < 6; ++sb, subblock += SUBBLOCK_SIZE) {
        int16_t reordered[SUBBLOCK_SIZE];
        int32_t *predictor;

        /* pick the DC predictor of the plane this subblock belongs to */
        if (sb < 4)
            predictor = y_dc;
        else if (sb == 4)
            predictor = u_dc;
        else
            predictor = v_dc;

        /* update DC */
        *predictor += (int32_t)subblock[0];
        subblock[0] = *predictor & 0xffff;

        ZigZagSubBlock(reordered, subblock);
        if (qtable != NULL)
            MultSubBlocks(reordered, reordered, qtable, 0);
        TransposeSubBlock(subblock, reordered);
        InverseDCTSubBlock(subblock, subblock);
    }
}

static void decode_macroblock_std(const subblock_transform_t transform_luma,
                                  const subblock_transform_t transform_chroma,
                                  int16_t *macroblock,
                                  unsigned int subblock_count,
                                  const int16_t qtables[3][SUBBLOCK_SIZE])
{
    unsigned int sb;
    unsigned int q = 0;

    for (sb = 0; sb < subblock_count; ++sb) {
        int16_t tmp_sb[SUBBLOCK_SIZE];
        const int isChromaSubBlock = (subblock_count - sb <= 2);

        if (isChromaSubBlock)
            ++q;

        MultSubBlocks(macroblock, macroblock, qtables[q], 4);
        ZigZagSubBlock(tmp_sb, macroblock);
        InverseDCTSubBlock(macroblock, tmp_sb);

        if (isChromaSubBlock) {
            if (transform_chroma != NULL)
                transform_chroma(macroblock, macroblock);
        } else {
            if (transform_luma != NULL)
                transform_luma(macroblock, macroblock);
        }

        macroblock += SUBBLOCK_SIZE;
    }
}

/* Write into dst the 8x8 subblock src with rows and columns swapped.
 * dst and src must not overlap (see ReorderSubBlock). */
static void TransposeSubBlock(int16_t *dst, const int16_t *src)
{
    ReorderSubBlock(dst, src, TRANSPOSE_TABLE);
}

/* Write into dst the subblock src reordered through ZIGZAG_TABLE.
 * dst and src must not overlap (see ReorderSubBlock). */
static void ZigZagSubBlock(int16_t *dst, const int16_t *src)
{
    ReorderSubBlock(dst, src, ZIGZAG_TABLE);
}

/* Permute a subblock: dst[i] = src[table[i]] for every i in [0, SUBBLOCK_SIZE).
 *
 * dst   - destination subblock (SUBBLOCK_SIZE entries)
 * src   - source subblock (SUBBLOCK_SIZE entries), must not overlap dst
 * table - SUBBLOCK_SIZE source indices, each lower than SUBBLOCK_SIZE
 */
static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table)
{
    unsigned int i;

    /* source and destination subblocks cannot overlap: their start addresses
     * must be at least SUBBLOCK_SIZE entries apart. Two directly adjacent
     * subblocks (distance == SUBBLOCK_SIZE) are valid, and the pointer
     * difference is compared directly rather than through abs(), which would
     * truncate it to int. */
    assert(dst - src >= SUBBLOCK_SIZE || src - dst >= SUBBLOCK_SIZE);

    for (i = 0; i < SUBBLOCK_SIZE; ++i)
        dst[i] = src[table[i]];
}

/* Element-wise product of two subblocks.
 *
 * Each product is saturated to 16 bits with clamp_s16, then shifted left by
 * `shift` bits and stored in dst. dst may be the same subblock as src1 or
 * src2. `shift` must be lower than 32.
 *
 * The shift is carried out on an unsigned value: left-shifting a negative
 * signed integer is undefined behaviour in C. The stored bit pattern is the
 * same as the one two's complement targets produced before.
 */
static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift)
{
    unsigned int i;

    for (i = 0; i < SUBBLOCK_SIZE; ++i) {
        const int32_t product = src1[i] * src2[i];
        const int16_t clamped = clamp_s16(product);

        dst[i] = (int16_t)((uint32_t)clamped << shift);
    }
}

/* Multiply every entry of the subblock src by `scale`, saturate the result to
 * 16 bits with clamp_s16 and store it in dst. */
static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale)
{
    unsigned int remaining;

    for (remaining = SUBBLOCK_SIZE; remaining != 0; --remaining) {
        const int32_t scaled = *src * scale;

        *dst = clamp_s16(scaled);
        ++src;
        ++dst;
    }
}

/* Shift every entry of the subblock src right by `shift` bits and store the
 * result in dst.
 *
 * The shift count is capped at 15. For 16-bit inputs a shift by 15 already
 * yields the same result as any larger count would, and shifting by the width
 * of int or more is undefined behaviour in C. The count originates from task
 * data and is therefore not guaranteed to be small.
 */
static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift)
{
    unsigned int i;

    if (shift > 15)
        shift = 15;

    for (i = 0; i < SUBBLOCK_SIZE; ++i)
        dst[i] = src[i] >> shift;
}

/***************************************************************************
 * Fast 2D IDCT using separable formulation and normalization
 * Computations use single precision floats
 * Implementation based on Wikipedia :
 * http://fr.wikipedia.org/wiki/Transform%C3%A9e_en_cosinus_discr%C3%A8te
 *
 * InverseDCT1D is the 1D building block: it reads 8 coefficients from x and
 * writes 8 output samples to dst[0], dst[stride], ..., dst[7 * stride].
 * The result is not normalized; InverseDCTSubBlock applies the final scaling
 * after running it along both dimensions.
 * x and dst must not overlap: x is read-only here but every output depends on
 * several inputs.
 **************************************************************************/
static void InverseDCT1D(const float *const x, float *dst, unsigned int stride)
{
    float e[4];   /* contributions of the odd-indexed inputs  (x1, x3, x5, x7) */
    float f[4];   /* contributions of the even-indexed inputs (x0, x2, x4, x6) */
    float x26, x1357, x15, x37, x17, x35;

    /* products shared by several outputs */
    x15   = IDCT_K[2] * (x[1] + x[5]);
    x37   = IDCT_K[3] * (x[3] + x[7]);
    x17   = IDCT_K[8] * (x[1] + x[7]);
    x35   = IDCT_K[9] * (x[3] + x[5]);
    x1357 = IDCT_C3   * (x[1] + x[3] + x[5] + x[7]);
    x26   = IDCT_C6   * (x[2] + x[6]);

    f[0] = x[0] + x[4];
    f[1] = x[0] - x[4];
    f[2] = x26  + IDCT_K[0] * x[2];
    f[3] = x26  + IDCT_K[1] * x[6];

    e[0] = x1357 + x15 + IDCT_K[4] * x[1] + x17;
    e[1] = x1357 + x37 + IDCT_K[6] * x[3] + x35;
    e[2] = x1357 + x15 + IDCT_K[5] * x[5] + x35;
    e[3] = x1357 + x37 + IDCT_K[7] * x[7] + x17;

    /* outputs 0..3 add the odd part, outputs 4..7 mirror them by subtracting it */
    *dst = f[0] + f[2] + e[0];
    dst += stride;
    *dst = f[1] + f[3] + e[1];
    dst += stride;
    *dst = f[1] - f[3] + e[2];
    dst += stride;
    *dst = f[0] - f[2] + e[3];
    dst += stride;
    *dst = f[0] - f[2] - e[3];
    dst += stride;
    *dst = f[1] - f[3] - e[2];
    dst += stride;
    *dst = f[1] + f[3] - e[1];
    dst += stride;
    *dst = f[0] + f[2] - e[0];
}

/* 2D inverse DCT of an 8x8 subblock.
 *
 * src is read completely into a float work buffer before anything is written
 * to dst, so dst may be the same subblock as src (decode_macroblock_ob relies
 * on this).
 * NOTE(review): the float result is converted to int16_t before the shift;
 * values outside the int16_t range are not handled here - confirm inputs are
 * bounded.
 */
static void InverseDCTSubBlock(int16_t *dst, const int16_t *src)
{
    float x[8];
    float block[SUBBLOCK_SIZE];
    unsigned int i, j;

    /* idct 1d on rows (+transposition) */
    for (i = 0; i < 8; ++i) {
        for (j = 0; j < 8; ++j)
            x[j] = (float)src[i * 8 + j];

        /* a stride of 8 stores the transformed row i as column i of block */
        InverseDCT1D(x, &block[i], 8);
    }

    /* idct 1d on columns (thanks to previous transposition) */
    for (i = 0; i < 8; ++i) {
        InverseDCT1D(&block[i * 8], x, 1);

        /* C4 = 1 normalization implies a division by 8 */
        /* writing to column i of dst undoes the transposition of the first pass */
        for (j = 0; j < 8; ++j)
            dst[i + j * 8] = (int16_t)x[j] >> 3;
    }
}

/* Rescale the luma samples of a subblock: each sample is clamped with
 * clamp_s12, biased by 0x800 to make it non-negative, multiplied by 0xdb0,
 * shifted right by 16 bits and finally offset by 0x10. */
static void RescaleYSubBlock(int16_t *dst, const int16_t *src)
{
    unsigned int remaining;

    for (remaining = SUBBLOCK_SIZE; remaining != 0; --remaining) {
        const uint32_t biased = (uint32_t)(clamp_s12(*src) + 0x800);

        *dst = ((biased * 0xdb0) >> 16) + 0x10;
        ++src;
        ++dst;
    }
}

/* Rescale the chroma samples of a subblock: each sample is clamped with
 * clamp_s12, multiplied by 0xe00, shifted right by 16 bits and finally offset
 * by 0x80. */
static void RescaleUVSubBlock(int16_t *dst, const int16_t *src)
{
    unsigned int remaining;

    for (remaining = SUBBLOCK_SIZE; remaining != 0; --remaining) {
        const int clamped = (int)clamp_s12(*src);

        *dst = ((clamped * 0xe00) >> 16) + 0x80;
        ++src;
        ++dst;
    }
}


/* FIXME: assume presence of expansion pack */
#define MEMMASK 0x7fffff

/* Read `count` 16-bit values from rdram, starting at `address`, into dst.
 * Each value is assembled from two consecutive bytes, most significant byte
 * first; every byte address is xor-ed with S8 and masked with MEMMASK. */
static void rdram_read_many_u16(uint16_t *dst, uint32_t address, unsigned int count)
{
    unsigned int i;

    for (i = 0; i < count; ++i, address += 2) {
        uint16_t value = rsp.RDRAM[(address ^ S8) & MEMMASK];

        value <<= 8;
        value |= rsp.RDRAM[((address + 1) ^ S8) & MEMMASK];

        dst[i] = value;
    }
}

/* Write `count` 16-bit values from src to rdram, starting at `address`.
 * Each value is stored as two consecutive bytes, most significant byte first;
 * every byte address is xor-ed with S8 and masked with MEMMASK. */
static void rdram_write_many_u16(const uint16_t *src, uint32_t address, unsigned int count)
{
    unsigned int i;

    for (i = 0; i < count; ++i, address += 2) {
        rsp.RDRAM[(address ^ S8) & MEMMASK]       = (uint8_t)(src[i] >> 8);
        rsp.RDRAM[((address + 1) ^ S8) & MEMMASK] = (uint8_t)(src[i] & 0xff);
    }
}

/* Read one 32-bit value from rdram at `address`.
 * The value is assembled from four consecutive bytes, most significant byte
 * first; every byte address is xor-ed with S8 and masked with MEMMASK. */
static uint32_t rdram_read_u32(uint32_t address)
{
    uint32_t value = 0;
    unsigned int i;

    for (i = 0; i < 4; ++i, ++address) {
        value <<= 8;
        value |= rsp.RDRAM[(address ^ S8) & MEMMASK];
    }

    return value;
}

/* Write `count` 32-bit values from src to rdram, starting at `address`.
 * Each value is stored as four consecutive bytes, most significant byte
 * first; every byte address is xor-ed with S8 and masked with MEMMASK. */
static void rdram_write_many_u32(const uint32_t *src, uint32_t address, unsigned int count)
{
    unsigned int i;

    for (i = 0; i < count; ++i, address += 4) {
        rsp.RDRAM[(address ^ S8) & MEMMASK]       = (uint8_t)(src[i] >> 24);
        rsp.RDRAM[((address + 1) ^ S8) & MEMMASK] = (uint8_t)(src[i] >> 16);
        rsp.RDRAM[((address + 2) ^ S8) & MEMMASK] = (uint8_t)(src[i] >> 8);
        rsp.RDRAM[((address + 3) ^ S8) & MEMMASK] = (uint8_t)(src[i] & 0xff);
    }
}
Commit	Line	Data
	1	/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
	2	* Mupen64plus-rsp-hle - jpeg.c *
	3	* Mupen64Plus homepage: http://code.google.com/p/mupen64plus/ *
	4	* Copyright (C) 2012 Bobby Smiles *
	5	* Copyright (C) 2009 Richard Goedeken *
	6	* Copyright (C) 2002 Hacktarux *
	7	* *
	8	* This program is free software; you can redistribute it and/or modify *
	9	* it under the terms of the GNU General Public License as published by *
	10	* the Free Software Foundation; either version 2 of the License, or *
	11	* (at your option) any later version. *
	12	* *
	13	* This program is distributed in the hope that it will be useful, *
	14	* but WITHOUT ANY WARRANTY; without even the implied warranty of *
	15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
	16	* GNU General Public License for more details. *
	17	* *
	18	* You should have received a copy of the GNU General Public License *
	19	* along with this program; if not, write to the *
	20	* Free Software Foundation, Inc., *
	21	* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
	22	* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
	23
	24	#include <stdint.h>
	25	#include <assert.h>
	26	#include <stdlib.h>
	27
	28	#define M64P_PLUGIN_PROTOTYPES 1
	29	#include "m64p_types.h"
	30	#include "m64p_plugin.h"
	31	#include "hle.h"
	32	#include "jpeg.h"
	33
	34	#define SUBBLOCK_SIZE 64
	35
	36	typedef void (tile_line_emitter_t)(const int16_t y, const int16_t *u, uint32_t address);
	37	typedef void (subblock_transform_t)(int16_t dst, const int16_t *src);
	38
	39	/* rdram operations
	40	* FIXME: these functions deserve their own module
	41	*/
	42	static void rdram_read_many_u16(uint16_t *dst, uint32_t address, unsigned int count);
	43	static void rdram_write_many_u16(const uint16_t *src, uint32_t address, unsigned int count);
	44	static uint32_t rdram_read_u32(uint32_t address);
	45	static void rdram_write_many_u32(const uint32_t *src, uint32_t address, unsigned int count);
	46
	47	/* standard jpeg ucode decoder */
	48	static void jpeg_decode_std(const char *const version,
	49	const subblock_transform_t transform_luma,
	50	const subblock_transform_t transform_chroma,
	51	const tile_line_emitter_t emit_line);
	52
	53	/* helper functions */
	54	static uint8_t clamp_u8(int16_t x);
	55	static int16_t clamp_s12(int16_t x);
	56	static uint16_t clamp_RGBA_component(int16_t x);
	57
	58	/* pixel conversion & foratting */
	59	static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v);
	60	static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v);
	61
	62	/* tile line emitters */
	63	static void EmitYUVTileLine(const int16_t y, const int16_t u, uint32_t address);
	64	static void EmitRGBATileLine(const int16_t y, const int16_t u, uint32_t address);
	65
	66	/* macroblocks operations */
	67	static void decode_macroblock_ob(int16_t macroblock, int32_t y_dc, int32_t u_dc, int32_t v_dc, const int16_t *qtable);
	68	static void decode_macroblock_std(const subblock_transform_t transform_luma,
	69	const subblock_transform_t transform_chroma,
	70	int16_t *macroblock,
	71	unsigned int subblock_count,
	72	const int16_t qtables[3][SUBBLOCK_SIZE]);
	73	static void EmitTilesMode0(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
	74	static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
	75
	76	/* subblocks operations */
	77	static void TransposeSubBlock(int16_t dst, const int16_t src);
	78	static void ZigZagSubBlock(int16_t dst, const int16_t src);
	79	static void ReorderSubBlock(int16_t dst, const int16_t src, const unsigned int *table);
	80	static void MultSubBlocks(int16_t dst, const int16_t src1, const int16_t *src2, unsigned int shift);
	81	static void ScaleSubBlock(int16_t dst, const int16_t src, int16_t scale);
	82	static void RShiftSubBlock(int16_t dst, const int16_t src, unsigned int shift);
	83	static void InverseDCT1D(const float const x, float dst, unsigned int stride);
	84	static void InverseDCTSubBlock(int16_t dst, const int16_t src);
	85	static void RescaleYSubBlock(int16_t dst, const int16_t src);
	86	static void RescaleUVSubBlock(int16_t dst, const int16_t src);
	87
	88	/* transposed dequantization table */
	89	static const int16_t DEFAULT_QTABLE[SUBBLOCK_SIZE] = {
	90	16, 12, 14, 14, 18, 24, 49, 72,
	91	11, 12, 13, 17, 22, 35, 64, 92,
	92	10, 14, 16, 22, 37, 55, 78, 95,
	93	16, 19, 24, 29, 56, 64, 87, 98,
	94	24, 26, 40, 51, 68, 81, 103, 112,
	95	40, 58, 57, 87, 109, 104, 121, 100,
	96	51, 60, 69, 80, 103, 113, 120, 103,
	97	61, 55, 56, 62, 77, 92, 101, 99
	98	};
	99
	100	/* zig-zag indices */
	101	static const unsigned int ZIGZAG_TABLE[SUBBLOCK_SIZE] = {
	102	0, 1, 5, 6, 14, 15, 27, 28,
	103	2, 4, 7, 13, 16, 26, 29, 42,
	104	3, 8, 12, 17, 25, 30, 41, 43,
	105	9, 11, 18, 24, 31, 40, 44, 53,
	106	10, 19, 23, 32, 39, 45, 52, 54,
	107	20, 22, 33, 38, 46, 51, 55, 60,
	108	21, 34, 37, 47, 50, 56, 59, 61,
	109	35, 36, 48, 49, 57, 58, 62, 63
	110	};
	111
	112	/* transposition indices */
	113	static const unsigned int TRANSPOSE_TABLE[SUBBLOCK_SIZE] = {
	114	0, 8, 16, 24, 32, 40, 48, 56,
	115	1, 9, 17, 25, 33, 41, 49, 57,
	116	2, 10, 18, 26, 34, 42, 50, 58,
	117	3, 11, 19, 27, 35, 43, 51, 59,
	118	4, 12, 20, 28, 36, 44, 52, 60,
	119	5, 13, 21, 29, 37, 45, 53, 61,
	120	6, 14, 22, 30, 38, 46, 54, 62,
	121	7, 15, 23, 31, 39, 47, 55, 63
	122	};
	123
	124
	125
	126	/* IDCT related constants
	127	* Cn = alpha * cos(n * PI / 16) (alpha is chosen such as C4 = 1) */
	128	static const float IDCT_C3 = 1.175875602f;
	129	static const float IDCT_C6 = 0.541196100f;
	130	static const float IDCT_K[10] = {
	131	0.765366865f, /* C2-C6 */
	132	-1.847759065f, /* -C2-C6 */
	133	-0.390180644f, /* C5-C3 */
	134	-1.961570561f, /* -C5-C3 */
	135	1.501321110f, /* C1+C3-C5-C7 */
	136	2.053119869f, /* C1+C3-C5+C7 */
	137	3.072711027f, /* C1+C3+C5-C7 */
	138	0.298631336f, /* -C1+C3+C5-C7 */
	139	-0.899976223f, /* C7-C3 */
	140	-2.562915448f /* -C1-C3 */
	141	};
	142
	143
	144	/* global functions */
	145
	146	/***************************************************************************
	147	* JPEG decoding ucode found in Japanese exclusive version of Pokemon Stadium.
	148	**************************************************************************/
	149	void jpeg_decode_PS0(void)
	150	{
	151	jpeg_decode_std("PS0", RescaleYSubBlock, RescaleUVSubBlock, EmitYUVTileLine);
	152	}
	153
	154	/***************************************************************************
	155	* JPEG decoding ucode found in Ocarina of Time, Pokemon Stadium 1 and
	156	* Pokemon Stadium 2.
	157	**************************************************************************/
	158	void jpeg_decode_PS(void)
	159	{
	160	jpeg_decode_std("PS", NULL, NULL, EmitRGBATileLine);
	161	}
	162
	163	/***************************************************************************
	164	* JPEG decoding ucode found in Ogre Battle and Bottom of the 9th.
	165	**************************************************************************/
	166	void jpeg_decode_OB(void)
	167	{
	168	int16_t qtable[SUBBLOCK_SIZE];
	169	unsigned int mb;
	170
	171	int32_t y_dc = 0;
	172	int32_t u_dc = 0;
	173	int32_t v_dc = 0;
	174
	175	const OSTask_t *const task = get_task();
	176
	177	uint32_t address = task->data_ptr;
	178	const unsigned int macroblock_count = task->data_size;
	179	const int qscale = task->yield_data_size;
	180
	181	DebugMessage(M64MSG_VERBOSE, "jpeg_decode_OB: *buffer=%x, #MB=%d, qscale=%d",
	182	address,
	183	macroblock_count,
	184	qscale);
	185
	186	if (qscale != 0) {
	187	if (qscale > 0)
	188	ScaleSubBlock(qtable, DEFAULT_QTABLE, qscale);
	189	else
	190	RShiftSubBlock(qtable, DEFAULT_QTABLE, -qscale);
	191	}
	192
	193	for (mb = 0; mb < macroblock_count; ++mb) {
	194	int16_t macroblock[6 * SUBBLOCK_SIZE];
	195
	196	rdram_read_many_u16((uint16_t )macroblock, address, 6 SUBBLOCK_SIZE);
	197	decode_macroblock_ob(macroblock, &y_dc, &u_dc, &v_dc, (qscale != 0) ? qtable : NULL);
	198	EmitTilesMode2(EmitYUVTileLine, macroblock, address);
	199
	200	address += (2 * 6 * SUBBLOCK_SIZE);
	201	}
	202	}
	203
	204
	205	/* local functions */
	206	static void jpeg_decode_std(const char *const version,
	207	const subblock_transform_t transform_luma,
	208	const subblock_transform_t transform_chroma,
	209	const tile_line_emitter_t emit_line)
	210	{
	211	int16_t qtables[3][SUBBLOCK_SIZE];
	212	unsigned int mb;
	213	uint32_t address;
	214	uint32_t macroblock_count;
	215	uint32_t mode;
	216	uint32_t qtableY_ptr;
	217	uint32_t qtableU_ptr;
	218	uint32_t qtableV_ptr;
	219	unsigned int subblock_count;
	220	unsigned int macroblock_size;
	221	/* macroblock contains at most 6 subblocks */
	222	int16_t macroblock[6 * SUBBLOCK_SIZE];
	223	const OSTask_t *const task = get_task();
	224
	225	if (task->flags & 0x1) {
	226	DebugMessage(M64MSG_WARNING, "jpeg_decode_%s: task yielding not implemented", version);
	227	return;
	228	}
	229
	230	address = rdram_read_u32(task->data_ptr);
	231	macroblock_count = rdram_read_u32(task->data_ptr + 4);
	232	mode = rdram_read_u32(task->data_ptr + 8);
	233	qtableY_ptr = rdram_read_u32(task->data_ptr + 12);
	234	qtableU_ptr = rdram_read_u32(task->data_ptr + 16);
	235	qtableV_ptr = rdram_read_u32(task->data_ptr + 20);
	236
	237	DebugMessage(M64MSG_VERBOSE, "jpeg_decode_%s: buffer=%x, #MB=%d, mode=%d, Qy=%x, Qu=%x, Qv=%x",
	238	version,
	239	address,
	240	macroblock_count,
	241	mode,
	242	qtableY_ptr,
	243	qtableU_ptr,
	244	qtableV_ptr);
	245
	246	if (mode != 0 && mode != 2) {
	247	DebugMessage(M64MSG_WARNING, "jpeg_decode_%s: invalid mode %d", version, mode);
	248	return;
	249	}
	250
	251	subblock_count = mode + 4;
	252	macroblock_size = subblock_count * SUBBLOCK_SIZE;
	253
	254	rdram_read_many_u16((uint16_t *)qtables[0], qtableY_ptr, SUBBLOCK_SIZE);
	255	rdram_read_many_u16((uint16_t *)qtables[1], qtableU_ptr, SUBBLOCK_SIZE);
	256	rdram_read_many_u16((uint16_t *)qtables[2], qtableV_ptr, SUBBLOCK_SIZE);
	257
	258	for (mb = 0; mb < macroblock_count; ++mb) {
	259	rdram_read_many_u16((uint16_t *)macroblock, address, macroblock_size);
	260	decode_macroblock_std(transform_luma, transform_chroma,
	261	macroblock, subblock_count, (const int16_t (*)[SUBBLOCK_SIZE])qtables);
	262
	263	if (mode == 0)
	264	EmitTilesMode0(emit_line, macroblock, address);
	265	else
	266	EmitTilesMode2(emit_line, macroblock, address);
	267
	268	address += 2 * macroblock_size;
	269	}
	270	}
	271
	272	static uint8_t clamp_u8(int16_t x)
	273	{
	274	return (x & (0xff00)) ? ((-x) >> 15) & 0xff : x;
	275	}
	276
	277	static int16_t clamp_s12(int16_t x)
	278	{
	279	if (x < -0x800)
	280	x = -0x800;
	281	else if (x > 0x7f0)
	282	x = 0x7f0;
	283	return x;
	284	}
	285
	286	static uint16_t clamp_RGBA_component(int16_t x)
	287	{
	288	if (x > 0xff0)
	289	x = 0xff0;
	290	else if (x < 0)
	291	x = 0;
	292	return (x & 0xf80);
	293	}
	294
	295	static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v)
	296	{
	297	return (uint32_t)clamp_u8(u) << 24 \|
	298	(uint32_t)clamp_u8(y1) << 16 \|
	299	(uint32_t)clamp_u8(v) << 8 \|
	300	(uint32_t)clamp_u8(y2);
	301	}
	302
	303	static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v)
	304	{
	305	const float fY = (float)y + 2048.0f;
	306	const float fU = (float)u;
	307	const float fV = (float)v;
	308
	309	const uint16_t r = clamp_RGBA_component((int16_t)(fY + 1.4025 * fV));
	310	const uint16_t g = clamp_RGBA_component((int16_t)(fY - 0.3443 * fU - 0.7144 * fV));
	311	const uint16_t b = clamp_RGBA_component((int16_t)(fY + 1.7729 * fU));
	312
	313	return (r << 4) \| (g >> 1) \| (b >> 6) \| 1;
	314	}
	315
	316	static void EmitYUVTileLine(const int16_t y, const int16_t u, uint32_t address)
	317	{
	318	uint32_t uyvy[8];
	319
	320	const int16_t *const v = u + SUBBLOCK_SIZE;
	321	const int16_t *const y2 = y + SUBBLOCK_SIZE;
	322
	323	uyvy[0] = GetUYVY(y[0], y[1], u[0], v[0]);
	324	uyvy[1] = GetUYVY(y[2], y[3], u[1], v[1]);
	325	uyvy[2] = GetUYVY(y[4], y[5], u[2], v[2]);
	326	uyvy[3] = GetUYVY(y[6], y[7], u[3], v[3]);
	327	uyvy[4] = GetUYVY(y2[0], y2[1], u[4], v[4]);
	328	uyvy[5] = GetUYVY(y2[2], y2[3], u[5], v[5]);
	329	uyvy[6] = GetUYVY(y2[4], y2[5], u[6], v[6]);
	330	uyvy[7] = GetUYVY(y2[6], y2[7], u[7], v[7]);
	331
	332	rdram_write_many_u32(uyvy, address, 8);
	333	}
	334
	335	static void EmitRGBATileLine(const int16_t y, const int16_t u, uint32_t address)
	336	{
	337	uint16_t rgba[16];
	338
	339	const int16_t *const v = u + SUBBLOCK_SIZE;
	340	const int16_t *const y2 = y + SUBBLOCK_SIZE;
	341
	342	rgba[0] = GetRGBA(y[0], u[0], v[0]);
	343	rgba[1] = GetRGBA(y[1], u[0], v[0]);
	344	rgba[2] = GetRGBA(y[2], u[1], v[1]);
	345	rgba[3] = GetRGBA(y[3], u[1], v[1]);
	346	rgba[4] = GetRGBA(y[4], u[2], v[2]);
	347	rgba[5] = GetRGBA(y[5], u[2], v[2]);
	348	rgba[6] = GetRGBA(y[6], u[3], v[3]);
	349	rgba[7] = GetRGBA(y[7], u[3], v[3]);
	350	rgba[8] = GetRGBA(y2[0], u[4], v[4]);
	351	rgba[9] = GetRGBA(y2[1], u[4], v[4]);
	352	rgba[10] = GetRGBA(y2[2], u[5], v[5]);
	353	rgba[11] = GetRGBA(y2[3], u[5], v[5]);
	354	rgba[12] = GetRGBA(y2[4], u[6], v[6]);
	355	rgba[13] = GetRGBA(y2[5], u[6], v[6]);
	356	rgba[14] = GetRGBA(y2[6], u[7], v[7]);
	357	rgba[15] = GetRGBA(y2[7], u[7], v[7]);
	358
	359	rdram_write_many_u16(rgba, address, 16);
	360	}
	361
	362	static void EmitTilesMode0(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
	363	{
	364	unsigned int i;
	365
	366	unsigned int y_offset = 0;
	367	unsigned int u_offset = 2 * SUBBLOCK_SIZE;
	368
	369	for (i = 0; i < 8; ++i) {
	370	emit_line(&macroblock[y_offset], &macroblock[u_offset], address);
	371
	372	y_offset += 8;
	373	u_offset += 8;
	374	address += 32;
	375	}
	376	}
	377
	378	static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
	379	{
	380	unsigned int i;
	381
	382	unsigned int y_offset = 0;
	383	unsigned int u_offset = 4 * SUBBLOCK_SIZE;
	384
	385	for (i = 0; i < 8; ++i) {
	386	emit_line(&macroblock[y_offset], &macroblock[u_offset], address);
	387	emit_line(&macroblock[y_offset + 8], &macroblock[u_offset], address + 32);
	388
	389	y_offset += (i == 3) ? SUBBLOCK_SIZE + 16 : 16;
	390	u_offset += 8;
	391	address += 64;
	392	}
	393	}
	394
	395	static void decode_macroblock_ob(int16_t macroblock, int32_t y_dc, int32_t u_dc, int32_t v_dc, const int16_t *qtable)
	396	{
	397	int sb;
	398
	399	for (sb = 0; sb < 6; ++sb) {
	400	int16_t tmp_sb[SUBBLOCK_SIZE];
	401
	402	/* update DC */
	403	int32_t dc = (int32_t)macroblock[0];
	404	switch (sb) {
	405	case 0:
	406	case 1:
	407	case 2:
	408	case 3:
	409	*y_dc += dc;
	410	macroblock[0] = *y_dc & 0xffff;
	411	break;
	412	case 4:
	413	*u_dc += dc;
	414	macroblock[0] = *u_dc & 0xffff;
	415	break;
	416	case 5:
	417	*v_dc += dc;
	418	macroblock[0] = *v_dc & 0xffff;
	419	break;
	420	}
	421
	422	ZigZagSubBlock(tmp_sb, macroblock);
	423	if (qtable != NULL)
	424	MultSubBlocks(tmp_sb, tmp_sb, qtable, 0);
	425	TransposeSubBlock(macroblock, tmp_sb);
	426	InverseDCTSubBlock(macroblock, macroblock);
	427
	428	macroblock += SUBBLOCK_SIZE;
	429	}
	430	}
	431
	432	static void decode_macroblock_std(const subblock_transform_t transform_luma,
	433	const subblock_transform_t transform_chroma,
	434	int16_t *macroblock,
	435	unsigned int subblock_count,
	436	const int16_t qtables[3][SUBBLOCK_SIZE])
	437	{
	438	unsigned int sb;
	439	unsigned int q = 0;
	440
	441	for (sb = 0; sb < subblock_count; ++sb) {
	442	int16_t tmp_sb[SUBBLOCK_SIZE];
	443	const int isChromaSubBlock = (subblock_count - sb <= 2);
	444
	445	if (isChromaSubBlock)
	446	++q;
	447
	448	MultSubBlocks(macroblock, macroblock, qtables[q], 4);
	449	ZigZagSubBlock(tmp_sb, macroblock);
	450	InverseDCTSubBlock(macroblock, tmp_sb);
	451
	452	if (isChromaSubBlock) {
	453	if (transform_chroma != NULL)
	454	transform_chroma(macroblock, macroblock);
	455	} else {
	456	if (transform_luma != NULL)
	457	transform_luma(macroblock, macroblock);
	458	}
	459
	460	macroblock += SUBBLOCK_SIZE;
	461	}
	462	}
	463
	464	static void TransposeSubBlock(int16_t dst, const int16_t src)
	465	{
	466	ReorderSubBlock(dst, src, TRANSPOSE_TABLE);
	467	}
	468
	469	static void ZigZagSubBlock(int16_t dst, const int16_t src)
	470	{
	471	ReorderSubBlock(dst, src, ZIGZAG_TABLE);
	472	}
	473
	474	static void ReorderSubBlock(int16_t dst, const int16_t src, const unsigned int *table)
	475	{
	476	unsigned int i;
	477
	478	/* source and destination sublocks cannot overlap */
	479	assert(abs(dst - src) > SUBBLOCK_SIZE);
	480
	481	for (i = 0; i < SUBBLOCK_SIZE; ++i)
	482	dst[i] = src[table[i]];
	483	}
	484
	485	static void MultSubBlocks(int16_t dst, const int16_t src1, const int16_t *src2, unsigned int shift)
	486	{
	487	unsigned int i;
	488
	489	for (i = 0; i < SUBBLOCK_SIZE; ++i) {
	490	int32_t v = src1[i] * src2[i];
	491	dst[i] = clamp_s16(v) << shift;
	492	}
	493	}
	494
	495	static void ScaleSubBlock(int16_t dst, const int16_t src, int16_t scale)
	496	{
	497	unsigned int i;
	498
	499	for (i = 0; i < SUBBLOCK_SIZE; ++i) {
	500	int32_t v = src[i] * scale;
	501	dst[i] = clamp_s16(v);
	502	}
	503	}
	504
	505	static void RShiftSubBlock(int16_t dst, const int16_t src, unsigned int shift)
	506	{
	507	unsigned int i;
	508
	509	for (i = 0; i < SUBBLOCK_SIZE; ++i)
	510	dst[i] = src[i] >> shift;
	511	}
	512
	513	/***************************************************************************
	514	* Fast 2D IDCT using separable formulation and normalization
	515	* Computations use single precision floats
	516	* Implementation based on Wikipedia :
	517	* http://fr.wikipedia.org/wiki/Transform%C3%A9e_en_cosinus_discr%C3%A8te
	518	**************************************************************************/
	519	static void InverseDCT1D(const float const x, float dst, unsigned int stride)
	520	{
	521	float e[4];
	522	float f[4];
	523	float x26, x1357, x15, x37, x17, x35;
	524
	525	x15 = IDCT_K[2] * (x[1] + x[5]);
	526	x37 = IDCT_K[3] * (x[3] + x[7]);
	527	x17 = IDCT_K[8] * (x[1] + x[7]);
	528	x35 = IDCT_K[9] * (x[3] + x[5]);
	529	x1357 = IDCT_C3 * (x[1] + x[3] + x[5] + x[7]);
	530	x26 = IDCT_C6 * (x[2] + x[6]);
	531
	532	f[0] = x[0] + x[4];
	533	f[1] = x[0] - x[4];
	534	f[2] = x26 + IDCT_K[0] * x[2];
	535	f[3] = x26 + IDCT_K[1] * x[6];
	536
	537	e[0] = x1357 + x15 + IDCT_K[4] * x[1] + x17;
	538	e[1] = x1357 + x37 + IDCT_K[6] * x[3] + x35;
	539	e[2] = x1357 + x15 + IDCT_K[5] * x[5] + x35;
	540	e[3] = x1357 + x37 + IDCT_K[7] * x[7] + x17;
	541
	542	*dst = f[0] + f[2] + e[0];
	543	dst += stride;
	544	*dst = f[1] + f[3] + e[1];
	545	dst += stride;
	546	*dst = f[1] - f[3] + e[2];
	547	dst += stride;
	548	*dst = f[0] - f[2] + e[3];
	549	dst += stride;
	550	*dst = f[0] - f[2] - e[3];
	551	dst += stride;
	552	*dst = f[1] - f[3] - e[2];
	553	dst += stride;
	554	*dst = f[1] + f[3] - e[1];
	555	dst += stride;
	556	*dst = f[0] + f[2] - e[0];
	557	}
	558
	559	static void InverseDCTSubBlock(int16_t dst, const int16_t src)
	560	{
	561	float x[8];
	562	float block[SUBBLOCK_SIZE];
	563	unsigned int i, j;
	564
	565	/* idct 1d on rows (+transposition) */
	566	for (i = 0; i < 8; ++i) {
	567	for (j = 0; j < 8; ++j)
	568	x[j] = (float)src[i * 8 + j];
	569
	570	InverseDCT1D(x, &block[i], 8);
	571	}
	572
	573	/* idct 1d on columns (thanks to previous transposition) */
	574	for (i = 0; i < 8; ++i) {
	575	InverseDCT1D(&block[i * 8], x, 1);
	576
	577	/* C4 = 1 normalization implies a division by 8 */
	578	for (j = 0; j < 8; ++j)
	579	dst[i + j * 8] = (int16_t)x[j] >> 3;
	580	}
	581	}
	582
	583	static void RescaleYSubBlock(int16_t dst, const int16_t src)
	584	{
	585	unsigned int i;
	586
	587	for (i = 0; i < SUBBLOCK_SIZE; ++i)
	588	dst[i] = (((uint32_t)(clamp_s12(src[i]) + 0x800) * 0xdb0) >> 16) + 0x10;
	589	}
	590
	591	static void RescaleUVSubBlock(int16_t dst, const int16_t src)
	592	{
	593	unsigned int i;
	594
	595	for (i = 0; i < SUBBLOCK_SIZE; ++i)
	596	dst[i] = (((int)clamp_s12(src[i]) * 0xe00) >> 16) + 0x80;
	597	}
	598
	599
	600
	601	/* FIXME: assume presence of expansion pack */
	602	#define MEMMASK 0x7fffff
	603
	604	static void rdram_read_many_u16(uint16_t *dst, uint32_t address, unsigned int count)
	605	{
	606	while (count != 0) {
	607	uint16_t s = rsp.RDRAM[((address++)^S8) & MEMMASK];
	608	s <<= 8;
	609	s \|= rsp.RDRAM[((address++)^S8) & MEMMASK];
	610
	611	*(dst++) = s;
	612
	613	--count;
	614	}
	615	}
	616
	617	static void rdram_write_many_u16(const uint16_t *src, uint32_t address, unsigned int count)
	618	{
	619	while (count != 0) {
	620	rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 8);
	621	rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*(src++) & 0xff);
	622
	623	--count;
	624	}
	625	}
	626
	627	static uint32_t rdram_read_u32(uint32_t address)
	628	{
	629	uint32_t r = rsp.RDRAM[((address++) ^ S8) & MEMMASK];
	630	r <<= 8;
	631	r \|= rsp.RDRAM[((address++) ^ S8) & MEMMASK];
	632	r <<= 8;
	633	r \|= rsp.RDRAM[((address++) ^ S8) & MEMMASK];
	634	r <<= 8;
	635	r \|= rsp.RDRAM[((address++) ^ S8) & MEMMASK];
	636
	637	return r;
	638	}
	639
	640	static void rdram_write_many_u32(const uint32_t *src, uint32_t address, unsigned int count)
	641	{
	642	while (count != 0) {
	643	rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 24);
	644	rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 16);
	645	rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 8);
	646	rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*(src++) & 0xff);
	647
	648	--count;
	649	}
	650	}
	651