RSP LLE plugin. Compile and run (slowly, eat 50% CPU) on the OpenPandora
[mupen64plus-pandora.git] / source / mupen64plus-rsp-z64 / src / rsp.cpp
diff --git a/source/mupen64plus-rsp-z64/src/rsp.cpp b/source/mupen64plus-rsp-z64/src/rsp.cpp
new file mode 100644 (file)
index 0000000..04e65ec
--- /dev/null
@@ -0,0 +1,3093 @@
+/*
+ * z64
+ *
+ * Copyright (C) 2007  ziggy
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+**/
+
+/*
+Nintendo/SGI Reality Signal Processor (RSP) emulator
+
+Written by Ville Linde
+*/
+// #include "z64.h"
+#include "rsp.h"
+#include "rsp_opinfo.h"
+#include <math.h>       // sqrt
+#include <assert.h>
+#include <string.h>
+
+#define INLINE inline
+
+#define LOG_INSTRUCTION_EXECUTION               0
+#define SAVE_DISASM                             0
+#define SAVE_DMEM                               0
+
+#define PRINT_VECREG(x)         printf("V%d: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X\n", (x), \
+    (UINT16)VREG_S((x),0), (UINT16)VREG_S((x),1), \
+    (UINT16)VREG_S((x),2), (UINT16)VREG_S((x),3), \
+    (UINT16)VREG_S((x),4), (UINT16)VREG_S((x),5), \
+    (UINT16)VREG_S((x),6), (UINT16)VREG_S((x),7))
+
+
+extern offs_t rsp_dasm_one(char *buffer, offs_t pc, UINT32 op);
+
+#if LOG_INSTRUCTION_EXECUTION
+static FILE *exec_output;
+#endif
+
+
+// INLINE void sp_set_status(UINT32 status)
+// {
+//      if (status & 0x1)
+//      {
+//              cpu_trigger(6789);
+
+//              cpunum_set_input_line(1, INPUT_LINE_HALT, ASSERT_LINE);
+//              rsp_sp_status |= SP_STATUS_HALT;
+//      }
+//      if (status & 0x2)
+//      {
+//              rsp_sp_status |= SP_STATUS_BROKE;
+
+//              if (rsp_sp_status & SP_STATUS_INTR_BREAK)
+//              {
+//                      signal_rcp_interrupt(SP_INTERRUPT);
+//              }
+//      }
+// }
+
+
+#if 0
+enum
+{
+    RSP_PC = 1,
+    RSP_R0,
+    RSP_R1,
+    RSP_R2,
+    RSP_R3,
+    RSP_R4,
+    RSP_R5,
+    RSP_R6,
+    RSP_R7,
+    RSP_R8,
+    RSP_R9,
+    RSP_R10,
+    RSP_R11,
+    RSP_R12,
+    RSP_R13,
+    RSP_R14,
+    RSP_R15,
+    RSP_R16,
+    RSP_R17,
+    RSP_R18,
+    RSP_R19,
+    RSP_R20,
+    RSP_R21,
+    RSP_R22,
+    RSP_R23,
+    RSP_R24,
+    RSP_R25,
+    RSP_R26,
+    RSP_R27,
+    RSP_R28,
+    RSP_R29,
+    RSP_R30,
+    RSP_R31,
+};
+#endif
+
+
+#ifdef RSPTIMING
+uint64_t rsptimings[512];
+int rspcounts[512];
+#endif
+
+
+#define JUMP_ABS(addr)                  { rsp.nextpc = 0x04001000 | (((addr) << 2) & 0xfff); }
+#define JUMP_ABS_L(addr,l)              { rsp.nextpc = 0x04001000 | (((addr) << 2) & 0xfff); rsp.r[l] = sp_pc + 4; }
+#define JUMP_REL(offset)                { rsp.nextpc = 0x04001000 | ((sp_pc + ((offset) << 2)) & 0xfff); }
+#define JUMP_REL_L(offset,l)            { rsp.nextpc = 0x04001000 | ((sp_pc + ((offset) << 2)) & 0xfff); rsp.r[l] = sp_pc + 4; }
+#define JUMP_PC(addr)                   { rsp.nextpc = 0x04001000 | ((addr) & 0xfff); }
+#define JUMP_PC_L(addr,l)               { rsp.nextpc = 0x04001000 | ((addr) & 0xfff); rsp.r[l] = sp_pc + 4; }
+#define LINK(l) rsp.r[l] = sp_pc + 4
+
+
+#define VDREG           ((op >> 6) & 0x1f)
+#define VS1REG          ((op >> 11) & 0x1f)
+#define VS2REG          ((op >> 16) & 0x1f)
+#define EL              ((op >> 21) & 0xf)
+
+#define S_VREG_B(offset)            (((15 - (offset)) & 0x07) << 3)
+#define S_VREG_S(offset)            (((7 - (offset)) & 0x03) << 4)
+#define S_VREG_L(offset)            (((3 - (offset)) & 0x01) << 5)
+
+#define M_VREG_B(offset)            ((UINT64)0x00FF << S_VREG_B(offset))
+#define M_VREG_S(offset)            ((UINT64)0x0000FFFFul << S_VREG_S(offset))
+#define M_VREG_L(offset)            ((UINT64)0x00000000FFFFFFFFull << S_VREG_L(offset))
+
+#define R_VREG_B(reg, offset)       ((rsp.v[(reg)].d[(15 - (offset)) >> 3] >> S_VREG_B(offset)) & 0x00FF)
+#define R_VREG_S(reg, offset)       (INT16)((rsp.v[(reg)].d[(7 - (offset)) >> 2] >> S_VREG_S(offset)) & 0x0000FFFFul)
+#define R_VREG_L(reg, offset)       ((rsp.v[(reg)].d[(3 - (offset)) >> 1] >> S_VREG_L(offset)) & 0x00000000FFFFFFFFull)
+
+#define W_VREG_B(reg, offset, val)  (rsp.v[(reg)].d[(15 - (offset)) >> 3] = (rsp.v[(reg)].d[(15 - (offset)) >> 3] & ~M_VREG_B(offset)) | (M_VREG_B(offset) & ((UINT64)(val) << S_VREG_B(offset))))
+#define W_VREG_S(reg, offset, val)  (rsp.v[(reg)].d[(7 - (offset)) >> 2] = (rsp.v[(reg)].d[(7 - (offset)) >> 2] & ~M_VREG_S(offset)) | (M_VREG_S(offset) & ((UINT64)(val) << S_VREG_S(offset))))
+#define W_VREG_L(reg, offset, val)  (rsp.v[(reg)].d[(3 - (offset)) >> 1] = (rsp.v[(reg)].d[(3 - (offset)) >> 1] & ~M_VREG_L(offset)) | (M_VREG_L(offset) & ((UINT64)(val) << S_VREG_L(offset))))
+
+
+#define VEC_EL_1(x,z)           (z)
+#define VEC_EL_2(x,z)           (vector_elements_2[(x)][(z)])
+
+#define ACCUM(x)                rsp.accum[((x))].q
+
+#define S_ACCUM_H               (3 << 4)
+#define S_ACCUM_M               (2 << 4)
+#define S_ACCUM_L               (1 << 4)
+
+#define M_ACCUM_H               (((INT64)0x0000FFFF) << S_ACCUM_H)
+#define M_ACCUM_M               (((INT64)0x0000FFFF) << S_ACCUM_M)
+#define M_ACCUM_L               (((INT64)0x0000FFFF) << S_ACCUM_L)
+
+#define R_ACCUM_H(x)            ((INT16)((ACCUM(x) >> S_ACCUM_H) & 0x00FFFF))
+#define R_ACCUM_M(x)            ((INT16)((ACCUM(x) >> S_ACCUM_M) & 0x00FFFF))
+#define R_ACCUM_L(x)            ((INT16)((ACCUM(x) >> S_ACCUM_L) & 0x00FFFF))
+
+#define W_ACCUM_H(x, y)         (ACCUM(x) = (ACCUM(x) & ~M_ACCUM_H) | (M_ACCUM_H & ((INT64)(y) << S_ACCUM_H)))
+#define W_ACCUM_M(x, y)         (ACCUM(x) = (ACCUM(x) & ~M_ACCUM_M) | (M_ACCUM_M & ((INT64)(y) << S_ACCUM_M)))
+#define W_ACCUM_L(x, y)         (ACCUM(x) = (ACCUM(x) & ~M_ACCUM_L) | (M_ACCUM_L & ((INT64)(y) << S_ACCUM_L)))
+
+
+
+RSP_REGS rsp;
+static int rsp_icount;
+// RSP Interface
+
+#define rsp_sp_status (*(UINT32*)z64_rspinfo.SP_STATUS_REG)
+#define sp_mem_addr (*(UINT32*)z64_rspinfo.SP_MEM_ADDR_REG)
+#define sp_dram_addr (*(UINT32*)z64_rspinfo.SP_DRAM_ADDR_REG)
+#define sp_semaphore (*(UINT32*)z64_rspinfo.SP_SEMAPHORE_REG)
+
+#define sp_dma_rlength (*(UINT32*)z64_rspinfo.SP_RD_LEN_REG)
+#define sp_dma_wlength (*(UINT32*)z64_rspinfo.SP_WR_LEN_REG)
+
+INT32 sp_dma_length;
+
+/*****************************************************************************/
+
+UINT32 get_cop0_reg(int reg)
+{
+    if (reg >= 0 && reg < 8)
+    {
+        return sp_read_reg(reg);
+    }
+    else if (reg >= 8 && reg < 16)
+    {
+        return n64_dp_reg_r(reg - 8, 0x00000000);
+    }
+    else
+    {
+        log(M64MSG_ERROR, "RSP: get_cop0_reg: %d", reg);
+       return ~0;
+    }
+}
+
+void set_cop0_reg(int reg, UINT32 data)
+{
+    if (reg >= 0 && reg < 8)
+    {
+        sp_write_reg(reg, data);
+    }
+    else if (reg >= 8 && reg < 16)
+    {
+        n64_dp_reg_w(reg - 8, data, 0x00000000);
+    }
+    else
+    {
+        log(M64MSG_ERROR, "RSP: set_cop0_reg: %d, %08X\n", reg, data);
+    }
+}
+
+static int got_unimp;
+void unimplemented_opcode(UINT32 op)
+{
+    got_unimp = 1;
+#ifdef MAME_DEBUG
+    char string[200];
+    rsp_dasm_one(string, rsp.ppc, op);
+    printf("%08X: %s\n", rsp.ppc, string);
+#endif
+
+#if SAVE_DISASM
+    {
+        char string[200];
+        int i;
+        FILE *dasm;
+        dasm = fopen("rsp_disasm.txt", "wt");
+
+        for (i=0; i < 0x1000; i+=4)
+        {
+            UINT32 opcode = ROPCODE(0x04001000 + i);
+            rsp_dasm_one(string, 0x04001000 + i, opcode);
+            fprintf(dasm, "%08X: %08X   %s\n", 0x04001000 + i, opcode, string);
+        }
+        fclose(dasm);
+    }
+#endif
+#if SAVE_DMEM
+    {
+        int i;
+        FILE *dmem;
+        dmem = fopen("rsp_dmem.bin", "wb");
+
+        for (i=0; i < 0x1000; i++)
+        {
+            fputc(READ8(0x04000000 + i), dmem);
+        }
+        fclose(dmem);
+    }
+#endif
+
+    log(M64MSG_ERROR, "RSP: unknown opcode %02X (%d) (%08X) at %08X\n", op >> 26, op >> 26, op, rsp.ppc);
+}
+
+/*****************************************************************************/
+
+const int vector_elements_1[16][8] =
+{
+    { 0, 1, 2, 3, 4, 5, 6, 7 },             // none
+    { 0, 1, 2, 3, 4, 5, 6 ,7 },             // ???
+    { 1, 3, 5, 7, 0, 2, 4, 6 },             // 0q
+    { 0, 2, 4, 6, 1, 3, 5, 7 },             // 1q
+    { 1, 2, 3, 5, 6, 7, 0, 4 },             // 0h
+    { 0, 2, 3, 4, 6, 7, 1, 5 },             // 1h
+    { 0, 1, 3, 4, 5, 7, 2, 6 },             // 2h
+    { 0, 1, 2, 4, 5, 6, 3, 7 },             // 3h
+    { 1, 2, 3, 4, 5, 6, 7, 0 },             // 0
+    { 0, 2, 3, 4, 5, 6, 7, 1 },             // 1
+    { 0, 1, 3, 4, 5, 6, 7, 2 },             // 2
+    { 0, 1, 2, 4, 5, 6, 7, 3 },             // 3
+    { 0, 1, 2, 3, 5, 6, 7, 4 },             // 4
+    { 0, 1, 2, 3, 4, 6, 7, 5 },             // 5
+    { 0, 1, 2, 3, 4, 5, 7, 6 },             // 6
+    { 0, 1, 2, 3, 4, 5, 6, 7 },             // 7
+};
+
+const int vector_elements_2[16][8] =
+{
+    { 0, 1, 2, 3, 4, 5, 6, 7 },             // none
+    { 0, 1, 2, 3, 4, 5, 6, 7 },             // ???
+    { 0, 0, 2, 2, 4, 4, 6, 6 },             // 0q
+    { 1, 1, 3, 3, 5, 5, 7, 7 },             // 1q
+    { 0, 0, 0, 0, 4, 4, 4, 4 },             // 0h
+    { 1, 1, 1, 1, 5, 5, 5, 5 },             // 1h
+    { 2, 2, 2, 2, 6, 6, 6, 6 },             // 2h
+    { 3, 3, 3, 3, 7, 7, 7, 7 },             // 3h
+    { 0, 0, 0, 0, 0, 0, 0, 0 },             // 0
+    { 1, 1, 1, 1, 1, 1, 1, 1 },             // 1
+    { 2, 2, 2, 2, 2, 2, 2, 2 },             // 2
+    { 3, 3, 3, 3, 3, 3, 3, 3 },             // 3
+    { 4, 4, 4, 4, 4, 4, 4, 4 },             // 4
+    { 5, 5, 5, 5, 5, 5, 5, 5 },             // 5
+    { 6, 6, 6, 6, 6, 6, 6, 6 },             // 6
+    { 7, 7, 7, 7, 7, 7, 7, 7 },             // 7
+};
+
+void rsp_init(RSP_INFO info)
+{
+#if LOG_INSTRUCTION_EXECUTION
+    exec_output = fopen("rsp_execute.txt", "wt");
+#endif
+
+    memset(&rsp, 0, sizeof(rsp));
+    rsp.ext = info;
+
+    sp_pc = 0; //0x4001000;
+    rsp.nextpc = ~0U;
+    //rsp_invalidate(0, 0x1000);
+    rsp.step_count=0;
+}
+
+void rsp_reset(void)
+{
+    rsp.nextpc = ~0U;
+}
+
+void handle_lwc2(UINT32 op)
+{
+    int i, end;
+    UINT32 ea;
+    int dest = (op >> 16) & 0x1f;
+    int base = (op >> 21) & 0x1f;
+    int index = (op >> 7) & 0xf;
+    int offset = (op & 0x7f);
+    if (offset & 0x40)
+        offset |= 0xffffffc0;
+
+    switch ((op >> 11) & 0x1f)
+    {
+    case 0x00:              /* LBV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 110010 | BBBBB | TTTTT | 00000 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Load 1 byte to vector byte index
+
+            ea = (base) ? rsp.r[base] + offset : offset;
+            VREG_B(dest, index) = READ8(ea);
+            break;
+        }
+    case 0x01:              /* LSV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 110010 | BBBBB | TTTTT | 00001 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Loads 2 bytes starting from vector byte index
+
+            ea = (base) ? rsp.r[base] + (offset * 2) : (offset * 2);
+
+            end = index + 2;
+
+            // VP need mask i and ea ?
+            for (i=index; i < end; i++)
+            {
+                VREG_B(dest, i) = READ8(ea);
+                ea++;
+            }
+            break;
+        }
+    case 0x02:              /* LLV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 110010 | BBBBB | TTTTT | 00010 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Loads 4 bytes starting from vector byte index
+
+            ea = (base) ? rsp.r[base] + (offset * 4) : (offset * 4);
+
+            end = index + 4;
+
+            // VP need mask i and ea ?
+            for (i=index; i < end; i++)
+            {
+                VREG_B(dest, i) = READ8(ea);
+                ea++;
+            }
+            break;
+        }
+    case 0x03:              /* LDV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 110010 | BBBBB | TTTTT | 00011 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Loads 8 bytes starting from vector byte index
+
+            ea = (base) ? rsp.r[base] + (offset * 8) : (offset * 8);
+
+            end = index + 8;
+
+            // VP need mask i and ea ?
+            for (i=index; i < end; i++)
+            {
+                VREG_B(dest, i) = READ8(ea);
+                ea++;
+            }
+            break;
+        }
+    case 0x04:              /* LQV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 110010 | BBBBB | TTTTT | 00100 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Loads up to 16 bytes starting from vector byte index
+
+            ea = (base) ? rsp.r[base] + (offset * 16) : (offset * 16);
+
+            end = index + (16 - (ea & 0xf));
+            if (end > 16) end = 16;
+            for (i=index; i < end; i++)
+            {
+                VREG_B(dest, i) = READ8(ea);
+                ea++;
+            }
+            break;
+        }
+    case 0x05:              /* LRV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 110010 | BBBBB | TTTTT | 00101 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Stores up to 16 bytes starting from right side until 16-byte boundary
+
+            ea = (base) ? rsp.r[base] + (offset * 16) : (offset * 16);
+
+            index = 16 - ((ea & 0xf) - index);
+            end = 16;
+            ea &= ~0xf;
+            //assert(index == 0);
+
+            for (i=index; i < end; i++)
+            {
+                VREG_B(dest, i) = READ8(ea);
+                ea++;
+            }
+            break;
+        }
+    case 0x06:              /* LPV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 110010 | BBBBB | TTTTT | 00110 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Loads a byte as the upper 8 bits of each element
+
+            ea = (base) ? rsp.r[base] + (offset * 8) : (offset * 8);
+
+            for (i=0; i < 8; i++)
+            {
+                VREG_S(dest, i) = READ8(ea + (((16-index) + i) & 0xf)) << 8;
+            }
+            break;
+        }
+    case 0x07:              /* LUV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 110010 | BBBBB | TTTTT | 00111 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Loads a byte as the bits 14-7 of each element
+
+            ea = (base) ? rsp.r[base] + (offset * 8) : (offset * 8);
+
+            for (i=0; i < 8; i++)
+            {
+                VREG_S(dest, i) = READ8(ea + (((16-index) + i) & 0xf)) << 7;
+            }
+            break;
+        }
+    case 0x08:              /* LHV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 110010 | BBBBB | TTTTT | 01000 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Loads a byte as the bits 14-7 of each element, with 2-byte stride
+
+            ea = (base) ? rsp.r[base] + (offset * 16) : (offset * 16);
+
+            for (i=0; i < 8; i++)
+            {
+                VREG_S(dest, i) = READ8(ea + (((16-index) + (i<<1)) & 0xf)) << 7;
+            }
+            break;
+        }
+    case 0x09:              /* LFV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 110010 | BBBBB | TTTTT | 01001 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Loads a byte as the bits 14-7 of upper or lower quad, with 4-byte stride
+
+            //                      fatalerror("RSP: LFV\n");
+
+            //if (index & 0x7)      fatalerror("RSP: LFV: index = %d at %08X\n", index, rsp.ppc);
+
+            ea = (base) ? rsp.r[base] + (offset * 16) : (offset * 16);
+
+            // not sure what happens if 16-byte boundary is crossed...
+            //if ((ea & 0xf) > 0)   fatalerror("RSP: LFV: 16-byte boundary crossing at %08X, recheck this!\n", rsp.ppc);
+
+            end = (index >> 1) + 4;
+
+            for (i=index >> 1; i < end; i++)
+            {
+                VREG_S(dest, i) = READ8(ea) << 7;
+                ea += 4;
+            }
+            break;
+        }
+    case 0x0a:              /* LWV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 110010 | BBBBB | TTTTT | 01010 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Loads the full 128-bit vector starting from vector byte index and wrapping to index 0
+            // after byte index 15
+
+            ea = (base) ? rsp.r[base] + (offset * 16) : (offset * 16);
+
+            // not sure what happens if 16-byte boundary is crossed...
+            //if ((ea & 0xf) > 0) fatalerror("RSP: LWV: 16-byte boundary crossing at %08X, recheck this!\n", rsp.ppc);
+
+            end = (16 - index) + 16;
+
+            for (i=(16 - index); i < end; i++)
+            {
+                VREG_B(dest, i & 0xf) = READ8(ea);
+                ea += 4;
+            }
+            break;
+        }
+    case 0x0b:              /* LTV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 110010 | BBBBB | TTTTT | 01011 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Loads one element to maximum of 8 vectors, while incrementing element index
+
+            // FIXME: has a small problem with odd indices
+
+            int element;
+            int vs = dest;
+            int ve = dest + 8;
+            if (ve > 32)
+                ve = 32;
+
+            element = 7 - (index >> 1);
+
+            //if (index & 1)        fatalerror("RSP: LTV: index = %d\n", index);
+
+            ea = (base) ? rsp.r[base] + (offset * 16) : (offset * 16);
+
+            ea = ((ea + 8) & ~0xf) + (index & 1);
+            for (i=vs; i < ve; i++)
+            {
+                element = ((8 - (index >> 1) + (i-vs)) << 1);
+                VREG_B(i, (element & 0xf)) = READ8(ea);
+                VREG_B(i, ((element+1) & 0xf)) = READ8(ea+1);
+
+                ea += 2;
+            }
+            break;
+        }
+
+    default:
+        {
+            unimplemented_opcode(op);
+            break;
+        }
+    }
+}
+
+void handle_swc2(UINT32 op)
+{
+    int i, end;
+    int eaoffset;
+    UINT32 ea;
+    int dest = (op >> 16) & 0x1f;
+    int base = (op >> 21) & 0x1f;
+    int index = (op >> 7) & 0xf;
+    int offset = (op & 0x7f);
+    if (offset & 0x40)
+        offset |= 0xffffffc0;
+
+    switch ((op >> 11) & 0x1f)
+    {
+    case 0x00:              /* SBV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 111010 | BBBBB | TTTTT | 00000 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Stores 1 byte from vector byte index
+
+            ea = (base) ? rsp.r[base] + offset : offset;
+            WRITE8(ea, VREG_B(dest, index));
+            break;
+        }
+    case 0x01:              /* SSV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 111010 | BBBBB | TTTTT | 00001 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Stores 2 bytes starting from vector byte index
+
+            ea = (base) ? rsp.r[base] + (offset * 2) : (offset * 2);
+
+            end = index + 2;
+
+            for (i=index; i < end; i++)
+            {
+                WRITE8(ea, VREG_B(dest, i));
+                ea++;
+            }
+            break;
+        }
+    case 0x02:              /* SLV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 111010 | BBBBB | TTTTT | 00010 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Stores 4 bytes starting from vector byte index
+
+            ea = (base) ? rsp.r[base] + (offset * 4) : (offset * 4);
+
+            end = index + 4;
+
+            for (i=index; i < end; i++)
+            {
+                WRITE8(ea, VREG_B(dest, i));
+                ea++;
+            }
+            break;
+        }
+    case 0x03:              /* SDV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 111010 | BBBBB | TTTTT | 00011 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Stores 8 bytes starting from vector byte index
+
+            ea = (base) ? rsp.r[base] + (offset * 8) : (offset * 8);
+
+            end = index + 8;
+
+            for (i=index; i < end; i++)
+            {
+                WRITE8(ea, VREG_B(dest, i));
+                ea++;
+            }
+            break;
+        }
+    case 0x04:              /* SQV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 111010 | BBBBB | TTTTT | 00100 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Stores up to 16 bytes starting from vector byte index until 16-byte boundary
+
+            ea = (base) ? rsp.r[base] + (offset * 16) : (offset * 16);
+
+            end = index + (16 - (ea & 0xf));
+            //       if (end != 16)
+            //         printf("SQV %d\n", end-index);
+            //assert(end == 16);
+
+            for (i=index; i < end; i++)
+            {
+                WRITE8(ea, VREG_B(dest, i & 0xf));
+                ea++;
+            }
+            break;
+        }
+    case 0x05:              /* SRV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 111010 | BBBBB | TTTTT | 00101 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Stores up to 16 bytes starting from right side until 16-byte boundary
+
+            int o;
+            ea = (base) ? rsp.r[base] + (offset * 16) : (offset * 16);
+
+            end = index + (ea & 0xf);
+            o = (16 - (ea & 0xf)) & 0xf;
+            ea &= ~0xf;
+            //       if (end != 16)
+            //         printf("SRV %d\n", end-index);
+            //assert(end == 16);
+
+            for (i=index; i < end; i++)
+            {
+                WRITE8(ea, VREG_B(dest, ((i + o) & 0xf)));
+                ea++;
+            }
+            break;
+        }
+    case 0x06:              /* SPV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 111010 | BBBBB | TTTTT | 00110 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Stores upper 8 bits of each element
+
+            ea = (base) ? rsp.r[base] + (offset * 8) : (offset * 8);
+            end = index + 8;
+
+            for (i=index; i < end; i++)
+            {
+                if ((i & 0xf) < 8)
+                {
+                    WRITE8(ea, VREG_B(dest, ((i & 0xf) << 1)));
+                }
+                else
+                {
+                    WRITE8(ea, VREG_S(dest, (i & 0x7)) >> 7);
+                }
+                ea++;
+            }
+            break;
+        }
+    case 0x07:              /* SUV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 111010 | BBBBB | TTTTT | 00111 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Stores bits 14-7 of each element
+
+            ea = (base) ? rsp.r[base] + (offset * 8) : (offset * 8);
+            end = index + 8;
+
+            for (i=index; i < end; i++)
+            {
+                if ((i & 0xf) < 8)
+                {
+                    WRITE8(ea, VREG_S(dest, (i & 0x7)) >> 7);
+                }
+                else
+                {
+                    WRITE8(ea, VREG_B(dest, ((i & 0x7) << 1)));
+                }
+                ea++;
+            }
+            break;
+        }
+    case 0x08:              /* SHV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 111010 | BBBBB | TTTTT | 01000 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Stores bits 14-7 of each element, with 2-byte stride
+
+            ea = (base) ? rsp.r[base] + (offset * 16) : (offset * 16);
+
+            for (i=0; i < 8; i++)
+            {
+                UINT8 d = ((VREG_B(dest, ((index + (i << 1) + 0) & 0xf))) << 1) |
+                    ((VREG_B(dest, ((index + (i << 1) + 1) & 0xf))) >> 7);
+
+                WRITE8(ea, d);
+                ea += 2;
+            }
+            break;
+        }
+    case 0x09:              /* SFV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 111010 | BBBBB | TTTTT | 01001 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Stores bits 14-7 of upper or lower quad, with 4-byte stride
+
+            // FIXME: only works for index 0 and index 8
+
+            if (index & 0x7)
+                log(M64MSG_WARNING, "SFV: index = %d at %08X\n", index, rsp.ppc);
+
+            ea = (base) ? rsp.r[base] + (offset * 16) : (offset * 16);
+
+            eaoffset = ea & 0xf;
+            ea &= ~0xf;
+
+            end = (index >> 1) + 4;
+
+            for (i=index >> 1; i < end; i++)
+            {
+                WRITE8(ea + (eaoffset & 0xf), VREG_S(dest, i) >> 7);
+                eaoffset += 4;
+            }
+            break;
+        }
+    case 0x0a:              /* SWV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 111010 | BBBBB | TTTTT | 01010 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Stores the full 128-bit vector starting from vector byte index and wrapping to index 0
+            // after byte index 15
+
+            ea = (base) ? rsp.r[base] + (offset * 16) : (offset * 16);
+
+            eaoffset = ea & 0xf;
+            ea &= ~0xf;
+
+            end = index + 16;
+
+            for (i=index; i < end; i++)
+            {
+                WRITE8(ea + (eaoffset & 0xf), VREG_B(dest, i & 0xf));
+                eaoffset++;
+            }
+            break;
+        }
+    case 0x0b:              /* STV */
+        {
+            // 31       25      20      15      10     6        0
+            // --------------------------------------------------
+            // | 111010 | BBBBB | TTTTT | 01011 | IIII | Offset |
+            // --------------------------------------------------
+            //
+            // Stores one element from maximum of 8 vectors, while incrementing element index
+
+            int element, eaoffset;
+            int vs = dest;
+            int ve = dest + 8;
+            if (ve > 32)
+                ve = 32;
+
+            element = 8 - (index >> 1);
+            //if (index & 0x1)      fatalerror("RSP: STV: index = %d at %08X\n", index, rsp.ppc);
+
+            ea = (base) ? rsp.r[base] + (offset * 16) : (offset * 16);
+
+            //if (ea & 0x1)         fatalerror("RSP: STV: ea = %08X at %08X\n", ea, rsp.ppc);
+
+            eaoffset = (ea & 0xf) + (element * 2);
+            ea &= ~0xf;
+
+            for (i=vs; i < ve; i++)
+            {
+                WRITE16(ea + (eaoffset & 0xf), VREG_S(i, element & 0x7));
+                eaoffset += 2;
+                element++;
+            }
+            break;
+        }
+
+    default:
+        {
+            unimplemented_opcode(op);
+            break;
+        }
+    }
+}
+
+#define U16MIN    0x0000
+#define U16MAX    0xffff
+
+#define S16MIN    0x8000
+#define S16MAX    0x7fff
+
+INLINE UINT16 SATURATE_ACCUM_U(int accum)
+{
+    if ((INT16)ACCUM_H(accum) < 0)
+    {
+        if ((UINT16)(ACCUM_H(accum)) != 0xffff)
+        {
+            return U16MIN;
+        }
+        else
+        {
+            if ((INT16)ACCUM_M(accum) >= 0)
+            {
+                return U16MIN;
+            }
+            else
+            {
+                return ACCUM_L(accum);
+            }
+        }
+    }
+    else
+    {
+        if ((UINT16)(ACCUM_H(accum)) != 0)
+        {
+            return U16MAX;
+        }
+        else
+        {
+            if ((INT16)ACCUM_M(accum) < 0)
+            {
+                return U16MAX;
+            }
+            else
+            {
+                return ACCUM_L(accum);
+            }
+        }
+    }
+
+    return 0;
+}
+
+INLINE UINT16 SATURATE_ACCUM_S(int accum)
+{
+    if ((INT16)ACCUM_H(accum) < 0)
+    {
+        if ((UINT16)(ACCUM_H(accum)) != 0xffff)
+            return S16MIN;
+        else
+        {
+            if ((INT16)ACCUM_M(accum) >= 0)
+                return S16MIN;
+            else
+                return ACCUM_M(accum);
+        }
+    }
+    else
+    {
+        if ((UINT16)(ACCUM_H(accum)) != 0)
+            return S16MAX;
+        else
+        {
+            if ((INT16)ACCUM_M(accum) < 0)
+                return S16MAX;
+            else
+                return ACCUM_M(accum);
+        }
+    }
+
+    return 0;
+}
+
+#define WRITEBACK_RESULT()                          \
+    do {                                            \
+    VREG_S(VDREG, 0) = vres[0];                     \
+    VREG_S(VDREG, 1) = vres[1];                     \
+    VREG_S(VDREG, 2) = vres[2];                     \
+    VREG_S(VDREG, 3) = vres[3];                     \
+    VREG_S(VDREG, 4) = vres[4];                     \
+    VREG_S(VDREG, 5) = vres[5];                     \
+    VREG_S(VDREG, 6) = vres[6];                     \
+    VREG_S(VDREG, 7) = vres[7];                     \
+    } while(0)
+
+
+void handle_vector_ops(UINT32 op)
+{
+    int i;
+    INT16 vres[8];
+
+    // Opcode legend:
+    //    E = VS2 element type
+    //    S = VS1, Source vector 1
+    //    T = VS2, Source vector 2
+    //    D = Destination vector
+
+    switch (op & 0x3f)
+    {
+    case 0x00:              /* VMULF */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000000 |
+            // ------------------------------------------------------
+            //
+            // Multiplies signed integer by signed integer * 2
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, del);
+                INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
+                if (s1 == -32768 && s2 == -32768)
+                {
+                    // overflow
+                    ACCUM_H(del) = 0;
+                    ACCUM_M(del) = -32768;
+                    ACCUM_L(del) = -32768;
+                    vres[del] = 0x7fff;
+                }
+                else
+                {
+                    INT64 r =  s1 * s2 * 2;
+                    r += 0x8000;    // rounding ?
+                    ACCUM_H(del) = (r < 0) ? 0xffff : 0;            // sign-extend to 48-bit
+                    ACCUM_M(del) = (INT16)(r >> 16);
+                    ACCUM_L(del) = (UINT16)(r);
+                    vres[del] = ACCUM_M(del);
+                }
+            }
+            WRITEBACK_RESULT();
+
+            break;
+        }
+
+    case 0x01:              /* VMULU */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000001 |
+            // ------------------------------------------------------
+            //
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, del);
+                INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
+                INT64 r = s1 * s2 * 2;
+                r += 0x8000;    // rounding ?
+
+                ACCUM_H(del) = (UINT16)(r >> 32);
+                ACCUM_M(del) = (UINT16)(r >> 16);
+                ACCUM_L(del) = (UINT16)(r);
+
+                if (r < 0)
+                {
+                    vres[del] = 0;
+                }
+                else if (((INT16)(ACCUM_H(del)) ^ (INT16)(ACCUM_M(del))) < 0)
+                {
+                    vres[del] = -1;
+                }
+                else
+                {
+                    vres[del] = ACCUM_M(del);
+                }
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x04:              /* VMUDL */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000100 |
+            // ------------------------------------------------------
+            //
+            // Multiplies unsigned fraction by unsigned fraction
+            // Stores the higher 16 bits of the 32-bit result to accumulator
+            // The low slice of accumulator is stored into destination element
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                UINT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, del);
+                UINT32 s2 = (UINT32)(UINT16)VREG_S(VS2REG, sel);
+                UINT32 r = s1 * s2;
+
+                ACCUM_H(del) = 0;
+                ACCUM_M(del) = 0;
+                ACCUM_L(del) = (UINT16)(r >> 16);
+
+                vres[del] = ACCUM_L(del);
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x05:              /* VMUDM */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000101 |
+            // ------------------------------------------------------
+            //
+            // Multiplies signed integer by unsigned fraction
+            // The result is stored into accumulator
+            // The middle slice of accumulator is stored into destination element
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, del);
+                INT32 s2 = (UINT16)VREG_S(VS2REG, sel); // not sign-extended
+                INT32 r =  s1 * s2;
+
+                ACCUM_H(del) = (r < 0) ? 0xffff : 0;            // sign-extend to 48-bit
+                ACCUM_M(del) = (INT16)(r >> 16);
+                ACCUM_L(del) = (UINT16)(r);
+
+                vres[del] = ACCUM_M(del);
+            }
+            WRITEBACK_RESULT();
+            break;
+
+        }
+
+    case 0x06:              /* VMUDN */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000110 |
+            // ------------------------------------------------------
+            //
+            // Multiplies unsigned fraction by signed integer
+            // The result is stored into accumulator
+            // The low slice of accumulator is stored into destination element
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT32 s1 = (UINT16)VREG_S(VS1REG, del);         // not sign-extended
+                INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
+                INT32 r = s1 * s2;
+
+                ACCUM_H(del) = (r < 0) ? 0xffff : 0;            // sign-extend to 48-bit
+                ACCUM_M(del) = (INT16)(r >> 16);
+                ACCUM_L(del) = (UINT16)(r);
+
+                vres[del] = ACCUM_L(del);
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x07:              /* VMUDH */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000111 |
+            // ------------------------------------------------------
+            //
+            // Multiplies signed integer by signed integer
+            // The result is stored into highest 32 bits of accumulator, the low slice is zero
+            // The highest 32 bits of accumulator is saturated into destination element
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, del);
+                INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
+                INT32 r = s1 * s2;
+
+                ACCUM_H(del) = (INT16)(r >> 16);
+                ACCUM_M(del) = (UINT16)(r);
+                ACCUM_L(del) = 0;
+
+                if (r < -32768) r = -32768;
+                if (r >  32767) r = 32767;
+                vres[del] = (INT16)(r);
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x08:              /* VMACF */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001000 |
+            // ------------------------------------------------------
+            //
+            // Multiplies signed integer by signed integer * 2
+            // The result is added to accumulator
+
+            for (i=0; i < 8; i++)
+            {
+                UINT16 res;
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, del);
+                INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
+                INT32 r = s1 * s2;
+
+                ACCUM(del) += (INT64)(r) << 17;
+                res = SATURATE_ACCUM_S(del);
+
+                vres[del] = res;
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x09:              /* VMACU */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001001 |
+            // ------------------------------------------------------
+            //
+
+            for (i=0; i < 8; i++)
+            {
+                UINT16 res;
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, del);
+                INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
+                INT32 r1 = s1 * s2;
+                UINT32 r2 = (UINT16)ACCUM_L(del) + ((UINT16)(r1) * 2);
+                UINT32 r3 = (UINT16)ACCUM_M(del) + (UINT16)((r1 >> 16) * 2) + (UINT16)(r2 >> 16);
+
+                ACCUM_L(del) = (UINT16)(r2);
+                ACCUM_M(del) = (UINT16)(r3);
+                ACCUM_H(del) += (UINT16)(r3 >> 16) + (UINT16)(r1 >> 31);
+
+                //res = SATURATE_ACCUM(del, 1, 0x0000, 0xffff);
+                if ((INT16)ACCUM_H(del) < 0)
+                {
+                    res = 0;
+                }
+                else
+                {
+                    if (ACCUM_H(del) != 0)
+                    {
+                        res = 0xffff;
+                    }
+                    else
+                    {
+                        if ((INT16)ACCUM_M(del) < 0)
+                        {
+                            res = 0xffff;
+                        }
+                        else
+                        {
+                            res = ACCUM_M(del);
+                        }
+                    }
+                }
+
+                vres[del] = res;
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x0c:              /* VMADL */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001100 |
+            // ------------------------------------------------------
+            //
+            // Multiplies unsigned fraction by unsigned fraction
+            // Adds the higher 16 bits of the 32-bit result to accumulator
+            // The low slice of accumulator is stored into destination element
+
+            for (i=0; i < 8; i++)
+            {
+                UINT16 res;
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                UINT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, del);
+                UINT32 s2 = (UINT32)(UINT16)VREG_S(VS2REG, sel);
+                UINT32 r1 = s1 * s2;
+                UINT32 r2 = (UINT16)ACCUM_L(del) + (r1 >> 16);
+                UINT32 r3 = (UINT16)ACCUM_M(del) + (r2 >> 16);
+
+                ACCUM_L(del) = (UINT16)(r2);
+                ACCUM_M(del) = (UINT16)(r3);
+                ACCUM_H(del) += (INT16)(r3 >> 16);
+
+                res = SATURATE_ACCUM_U(del);
+
+                vres[del] = res;
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x0d:              /* VMADM */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001101 |
+            // ------------------------------------------------------
+            //
+            // Multiplies signed integer by unsigned fraction
+            // The result is added into accumulator
+            // The middle slice of accumulator is stored into destination element
+
+            for (i=0; i < 8; i++)
+            {
+                UINT16 res;
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                UINT32 s1 = (INT32)(INT16)VREG_S(VS1REG, del);
+                UINT32 s2 = (UINT16)VREG_S(VS2REG, sel);        // not sign-extended
+                UINT32 r1 = s1 * s2;
+                UINT32 r2 = (UINT16)ACCUM_L(del) + (UINT16)(r1);
+                UINT32 r3 = (UINT16)ACCUM_M(del) + (r1 >> 16) + (r2 >> 16);
+
+                ACCUM_L(del) = (UINT16)(r2);
+                ACCUM_M(del) = (UINT16)(r3);
+                ACCUM_H(del) += (UINT16)(r3 >> 16);
+                if ((INT32)(r1) < 0)
+                    ACCUM_H(del) -= 1;
+
+                res = SATURATE_ACCUM_S(del);
+
+                vres[del] = res;
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x0e:              /* VMADN */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001110 |
+            // ------------------------------------------------------
+            //
+            // Multiplies unsigned fraction by signed integer
+            // The result is added into accumulator
+            // The low slice of accumulator is stored into destination element
+
+#if 1
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT32 s1 = (UINT16)VREG_S(VS1REG, del);         // not sign-extended
+                INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
+                ACCUM(del) += (INT64)(s1*s2)<<16;
+            }
+
+            for (i=0; i < 8; i++)
+            {
+                UINT16 res;
+                res = SATURATE_ACCUM_U(i);
+                //res = ACCUM_L(i);
+
+                VREG_S(VDREG, i) = res;
+            }
+#else
+            for (i=0; i < 8; i++)
+            {
+                UINT16 res;
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT32 s1 = (UINT16)VREG_S(VS1REG, del);         // not sign-extended
+                INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
+                UINT32 r1 = s1 * s2;
+                UINT32 r2 = (UINT16)ACCUM_L(del) + (UINT16)(r1);
+                UINT32 r3 = (UINT16)ACCUM_M(del) + (r1 >> 16) + (r2 >> 16);
+
+                ACCUM_L(del) = (UINT16)(r2);
+                ACCUM_M(del) = (UINT16)(r3);
+                ACCUM_H(del) += (UINT16)(r3 >> 16);
+                if ((INT32)(r1) < 0)
+                    ACCUM_H(del) -= 1;
+
+                res = SATURATE_ACCUM_U(del);
+
+                vres[del] = res;
+            }
+            WRITEBACK_RESULT();
+#endif
+            break;
+        }
+
+    case 0x0f:              /* VMADH */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001111 |
+            // ------------------------------------------------------
+            //
+            // Multiplies signed integer by signed integer
+            // The result is added into highest 32 bits of accumulator, the low slice is zero
+            // The highest 32 bits of accumulator is saturated into destination element
+
+#if 1
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, del);
+                INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
+
+                rsp.accum[del].l[1] += s1*s2;
+
+            }
+            for (i=0; i < 8; i++)
+            {
+                UINT16 res;
+                res = SATURATE_ACCUM_S(i);
+                //res = ACCUM_M(i);
+
+                VREG_S(VDREG, i) = res;
+            }
+#else      
+            for (i=0; i < 8; i++)
+            {
+                UINT16 res;
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, del);
+                INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
+                INT64 r = s1 * s2;
+
+                ACCUM(del) += (INT64)(r) << 32;
+
+                res = SATURATE_ACCUM_S(del);
+
+                vres[del] = res;
+            }
+            WRITEBACK_RESULT();
+#endif
+            break;
+        }
+
+    case 0x10:              /* VADD */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010000 |
+            // ------------------------------------------------------
+            //
+            // Adds two vector registers and carry flag, the result is saturated to 32767
+
+            // TODO: check VS2REG == VDREG
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, del);
+                INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
+                INT32 r = s1 + s2 + CARRY_FLAG(del);
+
+                ACCUM_L(del) = (INT16)(r);
+
+                if (r > 32767) r = 32767;
+                if (r < -32768) r = -32768;
+                vres[del] = (INT16)(r);
+            }
+            CLEAR_ZERO_FLAGS();
+            CLEAR_CARRY_FLAGS();
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x11:              /* VSUB */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010001 |
+            // ------------------------------------------------------
+            //
+            // Subtracts two vector registers and carry flag, the result is saturated to -32768
+
+            // TODO: check VS2REG == VDREG
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, del);
+                INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
+                INT32 r = s1 - s2 - CARRY_FLAG(del);
+
+                ACCUM_L(del) = (INT16)(r);
+
+                if (r > 32767) r = 32767;
+                if (r < -32768) r = -32768;
+
+                vres[del] = (INT16)(r);
+            }
+            CLEAR_ZERO_FLAGS();
+            CLEAR_CARRY_FLAGS();
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x13:              /* VABS */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010011 |
+            // ------------------------------------------------------
+            //
+            // Changes the sign of source register 2 if source register 1 is negative and stores
+            // the result to destination register
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT16 s1 = (INT16)VREG_S(VS1REG, del);
+                INT16 s2 = (INT16)VREG_S(VS2REG, sel);
+
+                if (s1 < 0)
+                {
+                    if (s2 == -32768)
+                    {
+                        vres[del] = 32767;
+                    }
+                    else
+                    {
+                        vres[del] = -s2;
+                    }
+                }
+                else if (s1 > 0)
+                {
+                    vres[del] = s2;
+                }
+                else
+                {
+                    vres[del] = 0;
+                }
+
+                ACCUM_L(del) = vres[del];
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x14:              /* VADDC */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010100 |
+            // ------------------------------------------------------
+            //
+            // Adds two vector registers, the carry out is stored into carry register
+
+            // TODO: check VS2REG = VDREG
+
+            CLEAR_ZERO_FLAGS();
+            CLEAR_CARRY_FLAGS();
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, del);
+                INT32 s2 = (UINT32)(UINT16)VREG_S(VS2REG, sel);
+                INT32 r = s1 + s2;
+
+                vres[del] = (INT16)(r);
+                ACCUM_L(del) = (INT16)(r);
+
+                if (r & 0xffff0000)
+                {
+                    SET_CARRY_FLAG(del);
+                }
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x15:              /* VSUBC */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010101 |
+            // ------------------------------------------------------
+            //
+            // Subtracts two vector registers, the carry out is stored into carry register
+
+            // TODO: check VS2REG = VDREG
+
+            CLEAR_ZERO_FLAGS();
+            CLEAR_CARRY_FLAGS();
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, del);
+                INT32 s2 = (UINT32)(UINT16)VREG_S(VS2REG, sel);
+                INT32 r = s1 - s2;
+
+                vres[del] = (INT16)(r);
+                ACCUM_L(del) = (UINT16)(r);
+
+                if ((UINT16)(r) != 0)
+                {
+                    SET_ZERO_FLAG(del);
+                }
+                if (r & 0xffff0000)
+                {
+                    SET_CARRY_FLAG(del);
+                }
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x1d:              /* VSAW */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 011101 |
+            // ------------------------------------------------------
+            //
+            // Stores high, middle or low slice of accumulator to destination vector
+
+            switch (EL)
+            {
+            case 0x08:              // VSAWH
+                {
+                    for (i=0; i < 8; i++)
+                    {
+                        VREG_S(VDREG, i) = ACCUM_H(i);
+                    }
+                    break;
+                }
+            case 0x09:              // VSAWM
+                {
+                    for (i=0; i < 8; i++)
+                    {
+                        VREG_S(VDREG, i) = ACCUM_M(i);
+                    }
+                    break;
+                }
+            case 0x0a:              // VSAWL
+                {
+                    for (i=0; i < 8; i++)
+                    {
+                        VREG_S(VDREG, i) = ACCUM_L(i);
+                    }
+                    break;
+                }
+            default:        log(M64MSG_ERROR, "RSP: VSAW: el = %d\n", EL);
+            }
+            break;
+        }
+
+    case 0x20:              /* VLT */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100000 |
+            // ------------------------------------------------------
+            //
+            // Sets compare flags if elements in VS1 are less than VS2
+            // Moves the element in VS2 to destination vector
+
+            rsp.flag[1] = 0;
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+
+                if (VREG_S(VS1REG, del) < VREG_S(VS2REG, sel))
+                {
+                    vres[del] = VREG_S(VS1REG, del);
+                    SET_COMPARE_FLAG(del);
+                }
+                else if (VREG_S(VS1REG, del) == VREG_S(VS2REG, sel))
+                {
+                    vres[del] = VREG_S(VS1REG, del);
+                    if (ZERO_FLAG(del) != 0 && CARRY_FLAG(del) != 0)
+                    {
+                        SET_COMPARE_FLAG(del);
+                    }
+                }
+                else
+                {
+                    vres[del] = VREG_S(VS2REG, sel);
+                }
+
+                ACCUM_L(del) = vres[del];
+            }
+
+            CLEAR_ZERO_FLAGS();
+            CLEAR_CARRY_FLAGS();
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x21:              /* VEQ */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100001 |
+            // ------------------------------------------------------
+            //
+            // Sets compare flags if elements in VS1 are equal with VS2
+            // Moves the element in VS2 to destination vector
+
+            rsp.flag[1] = 0;
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+
+                vres[del] = VREG_S(VS2REG, sel);
+                ACCUM_L(del) = vres[del];
+
+                if (VREG_S(VS1REG, del) == VREG_S(VS2REG, sel))
+                {
+                    if (ZERO_FLAG(del) == 0)
+                    {
+                        SET_COMPARE_FLAG(del);
+                    }
+                }
+            }
+
+            CLEAR_ZERO_FLAGS();
+            CLEAR_CARRY_FLAGS();
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x22:              /* VNE */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100010 |
+            // ------------------------------------------------------
+            //
+            // Sets compare flags if elements in VS1 are not equal with VS2
+            // Moves the element in VS2 to destination vector
+
+            rsp.flag[1] = 0;
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+
+                vres[del] = VREG_S(VS1REG, del);
+                ACCUM_L(del) = vres[del];
+
+                if (VREG_S(VS1REG, del) != VREG_S(VS2REG, sel))
+                {
+                    SET_COMPARE_FLAG(del);
+                }
+                else
+                {
+                    if (ZERO_FLAG(del) != 0)
+                    {
+                        SET_COMPARE_FLAG(del);
+                    }
+                }
+            }
+
+            CLEAR_ZERO_FLAGS();
+            CLEAR_CARRY_FLAGS();
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x23:              /* VGE */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100011 |
+            // ------------------------------------------------------
+            //
+            // Sets compare flags if elements in VS1 are greater or equal with VS2
+            // Moves the element in VS2 to destination vector
+
+            rsp.flag[1] = 0;
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+
+                if (VREG_S(VS1REG, del) == VREG_S(VS2REG, sel))
+                {
+                    if (ZERO_FLAG(del) == 0 || CARRY_FLAG(del) == 0)
+                    {
+                        SET_COMPARE_FLAG(del);
+                    }
+                }
+                else if (VREG_S(VS1REG, del) > VREG_S(VS2REG, sel))
+                {
+                    SET_COMPARE_FLAG(del);
+                }
+
+                if (COMPARE_FLAG(del) != 0)
+                {
+                    vres[del] = VREG_S(VS1REG, del);
+                }
+                else
+                {
+                    vres[del] = VREG_S(VS2REG, sel);
+                }
+
+                ACCUM_L(del) = vres[del];
+            }
+
+            CLEAR_ZERO_FLAGS();
+            CLEAR_CARRY_FLAGS();
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x24:              /* VCL */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100100 |
+            // ------------------------------------------------------
+            //
+            // Vector clip low
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT16 s1 = VREG_S(VS1REG, del);
+                INT16 s2 = VREG_S(VS2REG, sel);
+
+                if (CARRY_FLAG(del) != 0)
+                {
+                    if (ZERO_FLAG(del) != 0)
+                    {
+                        if (COMPARE_FLAG(del) != 0)
+                        {
+                            ACCUM_L(del) = -(UINT16)s2;
+                        }
+                        else
+                        {
+                            ACCUM_L(del) = s1;
+                        }
+                    }
+                    else
+                    {
+                        if (rsp.flag[2] & (1 << (del)))
+                        {
+                            if (((UINT32)(INT16)(s1) + (UINT32)(INT16)(s2)) > 0x10000)
+                            {
+                                ACCUM_L(del) = s1;
+                                CLEAR_COMPARE_FLAG(del);
+                            }
+                            else
+                            {
+                                ACCUM_L(del) = -((UINT16)s2);
+                                SET_COMPARE_FLAG(del);
+                            }
+                        }
+                        else
+                        {
+                            if (((UINT32)(INT16)(s1) + (UINT32)(INT16)(s2)) != 0)
+                            {
+                                ACCUM_L(del) = s1;
+                                CLEAR_COMPARE_FLAG(del);
+                            }
+                            else
+                            {
+                                ACCUM_L(del) = -((UINT16)s2);
+                                SET_COMPARE_FLAG(del);
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    if (ZERO_FLAG(del) != 0)
+                    {
+                        if (rsp.flag[1] & (1 << (8+del)))
+                        {
+                            ACCUM_L(del) = s2;
+                        }
+                        else
+                        {
+                            ACCUM_L(del) = s1;
+                        }
+                    }
+                    else
+                    {
+                        if (((INT32)(UINT16)s1 - (INT32)(UINT16)s2) >= 0)
+                        {
+                            ACCUM_L(del) = s2;
+                            rsp.flag[1] |= (1 << (8+del));
+                        }
+                        else
+                        {
+                            ACCUM_L(del) = s1;
+                            rsp.flag[1] &= ~(1 << (8+del));
+                        }
+                    }
+                }
+
+                vres[del] = ACCUM_L(del);
+            }
+            CLEAR_ZERO_FLAGS();
+            CLEAR_CARRY_FLAGS();
+            rsp.flag[2] = 0;
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x25:              /* VCH */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100101 |
+            // ------------------------------------------------------
+            //
+            // Vector clip high
+
+            CLEAR_ZERO_FLAGS();
+            CLEAR_CARRY_FLAGS();
+            rsp.flag[1] = 0;
+            rsp.flag[2] = 0;
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT16 s1 = VREG_S(VS1REG, del);
+                INT16 s2 = VREG_S(VS2REG, sel);
+
+                if ((s1 ^ s2) < 0)
+                {
+                    SET_CARRY_FLAG(del);
+                    if (s2 < 0)
+                    {
+                        rsp.flag[1] |= (1 << (8+del));
+                    }
+
+                    if (s1 + s2 <= 0)
+                    {
+                        if (s1 + s2 == -1)
+                        {
+                            rsp.flag[2] |= (1 << (del));
+                        }
+                        SET_COMPARE_FLAG(del);
+                        vres[del] = -((UINT16)s2);
+                    }
+                    else
+                    {
+                        vres[del] = s1;
+                    }
+
+                    if (s1 + s2 != 0)
+                    {
+                        if (s1 != ~s2)
+                        {
+                            SET_ZERO_FLAG(del);
+                        }
+                    }
+                }
+                else
+                {
+                    if (s2 < 0)
+                    {
+                        SET_COMPARE_FLAG(del);
+                    }
+                    if (s1 - s2 >= 0)
+                    {
+                        rsp.flag[1] |= (1 << (8+del));
+                        vres[del] = s2;
+                    }
+                    else
+                    {
+                        vres[del] = s1;
+                    }
+
+                    if ((s1 - s2) != 0)
+                    {
+                        if (s1 != ~s2)
+                        {
+                            SET_ZERO_FLAG(del);
+                        }
+                    }
+                }
+
+                ACCUM_L(del) = vres[del];
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x26:              /* VCR */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100110 |
+            // ------------------------------------------------------
+            //
+            // Vector clip reverse
+
+            rsp.flag[0] = 0;
+            rsp.flag[1] = 0;
+            rsp.flag[2] = 0;
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                INT16 s1 = VREG_S(VS1REG, del);
+                INT16 s2 = VREG_S(VS2REG, sel);
+
+                if ((INT16)(s1 ^ s2) < 0)
+                {
+                    if (s2 < 0)
+                    {
+                        rsp.flag[1] |= (1 << (8+del));
+                    }
+                    if ((s1 + s2) <= 0)
+                    {
+                        ACCUM_L(del) = ~((UINT16)s2);
+                        SET_COMPARE_FLAG(del);
+                    }
+                    else
+                    {
+                        ACCUM_L(del) = s1;
+                    }
+                }
+                else
+                {
+                    if (s2 < 0)
+                    {
+                        SET_COMPARE_FLAG(del);
+                    }
+                    if ((s1 - s2) >= 0)
+                    {
+                        ACCUM_L(del) = s2;
+                        rsp.flag[1] |= (1 << (8+del));
+                    }
+                    else
+                    {
+                        ACCUM_L(del) = s1;
+                    }
+                }
+
+                vres[del] = ACCUM_L(del);
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x27:              /* VMRG */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100111 |
+            // ------------------------------------------------------
+            //
+            // Merges two vectors according to compare flags
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                if (COMPARE_FLAG(del) != 0)
+                {
+                    vres[del] = VREG_S(VS1REG, del);
+                }
+                else
+                {
+                    vres[del] = VREG_S(VS2REG, VEC_EL_2(EL, sel));
+                }
+
+                ACCUM_L(del) = vres[del];
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+    case 0x28:              /* VAND */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101000 |
+            // ------------------------------------------------------
+            //
+            // Bitwise AND of two vector registers
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                vres[del] = VREG_S(VS1REG, del) & VREG_S(VS2REG, sel);
+                ACCUM_L(del) = vres[del];
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+    case 0x29:              /* VNAND */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101001 |
+            // ------------------------------------------------------
+            //
+            // Bitwise NOT AND of two vector registers
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                vres[del] = ~((VREG_S(VS1REG, del) & VREG_S(VS2REG, sel)));
+                ACCUM_L(del) = vres[del];
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+    case 0x2a:              /* VOR */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101010 |
+            // ------------------------------------------------------
+            //
+            // Bitwise OR of two vector registers
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                vres[del] = VREG_S(VS1REG, del) | VREG_S(VS2REG, sel);
+                ACCUM_L(del) = vres[del];
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+    case 0x2b:              /* VNOR */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101011 |
+            // ------------------------------------------------------
+            //
+            // Bitwise NOT OR of two vector registers
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                vres[del] = ~((VREG_S(VS1REG, del) | VREG_S(VS2REG, sel)));
+                ACCUM_L(del) = vres[del];
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+    case 0x2c:              /* VXOR */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101100 |
+            // ------------------------------------------------------
+            //
+            // Bitwise XOR of two vector registers
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                vres[del] = VREG_S(VS1REG, del) ^ VREG_S(VS2REG, sel);
+                ACCUM_L(del) = vres[del];
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+    case 0x2d:              /* VNXOR */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101101 |
+            // ------------------------------------------------------
+            //
+            // Bitwise NOT XOR of two vector registers
+
+            for (i=0; i < 8; i++)
+            {
+                int del = VEC_EL_1(EL, i);
+                int sel = VEC_EL_2(EL, del);
+                vres[del] = ~((VREG_S(VS1REG, del) ^ VREG_S(VS2REG, sel)));
+                ACCUM_L(del) = vres[del];
+            }
+            WRITEBACK_RESULT();
+            break;
+        }
+
+    case 0x30:              /* VRCP */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110000 |
+            // ------------------------------------------------------
+            //
+            // Calculates reciprocal
+            int del = (VS1REG & 7);
+            int sel = EL&7; //VEC_EL_2(EL, del);
+            INT32 rec;
+
+            rec = (INT16)(VREG_S(VS2REG, sel));
+
+            if (rec == 0)
+            {
+                // divide by zero -> overflow
+                rec = 0x7fffffff;
+            }
+            else
+            {
+                int negative = 0;
+                if (rec < 0)
+                {
+                    rec = ~rec+1;
+                    negative = 1;
+                }
+                for (i = 15; i > 0; i--)
+                {
+                    if (rec & (1 << i))
+                    {
+                        rec &= ((0xffc0) >> (15 - i));
+                        i = 0;
+                    }
+                }
+                rec = (INT32)(0x7fffffff / (double)rec);
+                for (i = 31; i > 0; i--)
+                {
+                    if (rec & (1 << i))
+                    {
+                        rec &= ((0xffff8000) >> (31 - i));
+                        i = 0;
+                    }
+                }
+                if (negative)
+                {
+                    rec = ~rec;
+                }
+            }
+
+            for (i=0; i < 8; i++)
+            {
+                int element = VEC_EL_2(EL, i);
+                ACCUM_L(i) = VREG_S(VS2REG, element);
+            }
+
+            rsp.reciprocal_res = rec;
+
+            VREG_S(VDREG, del) = (UINT16)(rsp.reciprocal_res);                      // store low part
+            break;
+        }
+
+    case 0x31:              /* VRCPL */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110001 |
+            // ------------------------------------------------------
+            //
+            // Calculates reciprocal low part
+
+            int del = (VS1REG & 7);
+            int sel = VEC_EL_2(EL, del);
+            INT32 rec;
+
+            rec = ((UINT16)(VREG_S(VS2REG, sel)) | ((UINT32)(rsp.reciprocal_high) << 16));
+
+            if (rec == 0)
+            {
+                // divide by zero -> overflow
+                rec = 0x7fffffff;
+            }
+            else
+            {
+                int negative = 0;
+                if (rec < 0)
+                {
+                    if (((UINT32)(rec & 0xffff0000) == 0xffff0000) && ((INT16)(rec & 0xffff) < 0))
+                    {
+                        rec = ~rec+1;
+                    }
+                    else
+                    {
+                        rec = ~rec;
+                    }
+                    negative = 1;
+                }
+                for (i = 31; i > 0; i--)
+                {
+                    if (rec & (1 << i))
+                    {
+                        rec &= ((0xffc00000) >> (31 - i));
+                        i = 0;
+                    }
+                }
+                rec = (0x7fffffff / rec);
+                for (i = 31; i > 0; i--)
+                {
+                    if (rec & (1 << i))
+                    {
+                        rec &= ((0xffff8000) >> (31 - i));
+                        i = 0;
+                    }
+                }
+                if (negative)
+                {
+                    rec = ~rec;
+                }
+            }
+
+            for (i=0; i < 8; i++)
+            {
+                int element = VEC_EL_2(EL, i);
+                ACCUM_L(i) = VREG_S(VS2REG, element);
+            }
+
+            rsp.reciprocal_res = rec;
+
+            VREG_S(VDREG, del) = (UINT16)(rsp.reciprocal_res);                      // store low part
+            break;
+        }
+
+    case 0x32:              /* VRCPH */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110010 |
+            // ------------------------------------------------------
+            //
+            // Calculates reciprocal high part
+
+            int del = (VS1REG & 7);
+            int sel = VEC_EL_2(EL, del);
+
+            rsp.reciprocal_high = VREG_S(VS2REG, sel);
+
+            for (i=0; i < 8; i++)
+            {
+                int element = VEC_EL_2(EL, i);
+                ACCUM_L(i) = VREG_S(VS2REG, element);           // perhaps accumulator is used to store the intermediate values ?
+            }
+
+            VREG_S(VDREG, del) = (INT16)(rsp.reciprocal_res >> 16); // store high part
+            break;
+        }
+
+    case 0x33:              /* VMOV */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110011 |
+            // ------------------------------------------------------
+            //
+            // Moves element from vector to destination vector
+
+            int element = VS1REG & 7;
+            VREG_S(VDREG, element) = VREG_S(VS2REG, VEC_EL_2(EL, 7-element));
+            break;
+        }
+
+    case 0x35:              /* VRSQL */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110101 |
+            // ------------------------------------------------------
+            //
+            // Calculates reciprocal square-root low part
+
+            int del = (VS1REG & 7);
+            int sel = VEC_EL_2(EL, del);
+            UINT32 sqr;
+
+            sqr = (UINT16)(VREG_S(VS2REG, sel)) | ((UINT32)(rsp.square_root_high) << 16);
+
+            if (sqr == 0)
+            {
+                // square root on 0 -> overflow
+                sqr = 0x7fffffff;
+            }
+            else if (sqr == 0xffff8000)
+            {
+                // overflow ?
+                sqr = 0xffff8000;
+            }
+            else
+            {
+                int negative = 0;
+                if (sqr > 0x7fffffff)
+                {
+                    if (((UINT32)(sqr & 0xffff0000) == 0xffff0000) && ((INT16)(sqr & 0xffff) < 0))
+                    {
+                        sqr = ~sqr+1;
+                    }
+                    else
+                    {
+                        sqr = ~sqr;
+                    }
+                    negative = 1;
+                }
+                for (i = 31; i > 0; i--)
+                {
+                    if (sqr & (1 << i))
+                    {
+                        sqr &= (0xff800000 >> (31 - i));
+                        i = 0;
+                    }
+                }
+                sqr = (INT32)(0x7fffffff / sqrt(sqr));
+                for (i = 31; i > 0; i--)
+                {
+                    if (sqr & (1 << i))
+                    {
+                        sqr &= (0xffff8000 >> (31 - i));
+                        i = 0;
+                    }
+                }
+                if (negative)
+                {
+                    sqr = ~sqr;
+                }
+            }
+
+            for (i=0; i < 8; i++)
+            {
+                int element = VEC_EL_2(EL, i);
+                ACCUM_L(i) = VREG_S(VS2REG, element);
+            }
+
+            rsp.square_root_res = sqr;
+
+            VREG_S(VDREG, del) = (UINT16)(rsp.square_root_res);                     // store low part
+            break;
+        }
+
+    case 0x36:              /* VRSQH */
+        {
+            // 31       25  24     20      15      10      5        0
+            // ------------------------------------------------------
+            // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110110 |
+            // ------------------------------------------------------
+            //
+            // Calculates reciprocal square-root high part
+
+            int del = (VS1REG & 7);
+            int sel = VEC_EL_2(EL, del);
+
+            rsp.square_root_high = VREG_S(VS2REG, sel);
+
+            for (i=0; i < 8; i++)
+            {
+                int element = VEC_EL_2(EL, i);
+                ACCUM_L(i) = VREG_S(VS2REG, element);           // perhaps accumulator is used to store the intermediate values ?
+            }
+
+            VREG_S(VDREG, del) = (INT16)(rsp.square_root_res >> 16);        // store high part
+            break;
+        }
+
+    default:        unimplemented_opcode(op); break;
+    }
+}
+
+int rsp_execute(int cycles)
+{
+    UINT32 op;
+
+    rsp_icount=1; //cycles;
+
+    UINT32 ExecutedCycles=0;
+    UINT32 BreakMarker=0;
+    UINT32 WDCHackFlag1=0;
+    UINT32 WDCHackFlag2=0;
+
+    sp_pc = /*0x4001000 | */(sp_pc & 0xfff);
+    if( rsp_sp_status & (SP_STATUS_HALT|SP_STATUS_BROKE))
+    {
+        log(M64MSG_WARNING, "Quit due to SP halt/broke on start");
+        rsp_icount = 0;
+    }
+
+
+    while (rsp_icount > 0)
+    {
+#ifdef RSPTIMING
+        uint64_t lasttime;
+        lasttime = RDTSC();
+#endif
+        rsp.ppc = sp_pc;
+
+
+        op = ROPCODE(sp_pc);
+#ifdef GENTRACE
+        char s[128];
+        rsp_dasm_one(s, sp_pc, op);
+        GENTRACE("%2x %3x\t%s\n", ((UINT8*)rsp_dmem)[0x1934], sp_pc, s);
+#endif
+
+        if (rsp.nextpc != ~0U)///DELAY SLOT USAGE
+        {
+            sp_pc = /*0x4001000 | */(rsp.nextpc & 0xfff); //rsp.nextpc;
+            rsp.nextpc = ~0U;
+        }
+        else
+        {
+            sp_pc = /*0x4001000 | */((sp_pc+4)&0xfff);
+        }
+
+        switch (op >> 26)
+        {
+        case 0x00:      /* SPECIAL */
+            {
+                switch (op & 0x3f)
+                {
+                case 0x00:      /* SLL */               if (RDREG) RDVAL = (UINT32)RTVAL << SHIFT; break;
+                case 0x02:      /* SRL */               if (RDREG) RDVAL = (UINT32)RTVAL >> SHIFT; break;
+                case 0x03:      /* SRA */               if (RDREG) RDVAL = (INT32)RTVAL >> SHIFT; break;
+                case 0x04:      /* SLLV */              if (RDREG) RDVAL = (UINT32)RTVAL << (RSVAL & 0x1f); break;
+                case 0x06:      /* SRLV */              if (RDREG) RDVAL = (UINT32)RTVAL >> (RSVAL & 0x1f); break;
+                case 0x07:      /* SRAV */              if (RDREG) RDVAL = (INT32)RTVAL >> (RSVAL & 0x1f); break;
+                case 0x08:      /* JR */                JUMP_PC(RSVAL); break;
+                case 0x09:      /* JALR */              JUMP_PC_L(RSVAL, RDREG); break;
+                case 0x0d:      /* BREAK */
+                    {
+                        *z64_rspinfo.SP_STATUS_REG |= (SP_STATUS_HALT | SP_STATUS_BROKE );
+                        if ((*z64_rspinfo.SP_STATUS_REG & SP_STATUS_INTR_BREAK) != 0 ) {
+                            *z64_rspinfo.MI_INTR_REG |= 1;
+                            z64_rspinfo.CheckInterrupts();
+                        }
+                        //sp_set_status(0x3);
+                        rsp_icount = 0;
+
+                        BreakMarker=1;
+
+#if LOG_INSTRUCTION_EXECUTION
+                        fprintf(exec_output, "\n---------- break ----------\n\n");
+#endif
+                        break;
+                    }
+                case 0x20:      /* ADD */               if (RDREG) RDVAL = (INT32)(RSVAL + RTVAL); break;
+                case 0x21:      /* ADDU */              if (RDREG) RDVAL = (INT32)(RSVAL + RTVAL); break;
+                case 0x22:      /* SUB */               if (RDREG) RDVAL = (INT32)(RSVAL - RTVAL); break;
+                case 0x23:      /* SUBU */              if (RDREG) RDVAL = (INT32)(RSVAL - RTVAL); break;
+                case 0x24:      /* AND */               if (RDREG) RDVAL = RSVAL & RTVAL; break;
+                case 0x25:      /* OR */                if (RDREG) RDVAL = RSVAL | RTVAL; break;
+                case 0x26:      /* XOR */               if (RDREG) RDVAL = RSVAL ^ RTVAL; break;
+                case 0x27:      /* NOR */               if (RDREG) RDVAL = ~(RSVAL | RTVAL); break;
+                case 0x2a:      /* SLT */               if (RDREG) RDVAL = (INT32)RSVAL < (INT32)RTVAL; break;
+                case 0x2b:      /* SLTU */              if (RDREG) RDVAL = (UINT32)RSVAL < (UINT32)RTVAL; break;
+                default:        unimplemented_opcode(op); break;
+                }
+                break;
+            }
+
+        case 0x01:      /* REGIMM */
+            {
+                switch (RTREG)
+                {
+                case 0x00:      /* BLTZ */              if ((INT32)(RSVAL) < 0) JUMP_REL(SIMM16); break;
+                case 0x01:      /* BGEZ */              if ((INT32)(RSVAL) >= 0) JUMP_REL(SIMM16); break;
+                    // VP according to the doc, link is performed even when condition fails,
+                    // this sound pretty stupid but let's try it that way
+                case 0x11:      /* BGEZAL */    LINK(31); if ((INT32)(RSVAL) >= 0) JUMP_REL(SIMM16); break;
+                    //case 0x11:  /* BGEZAL */    if ((INT32)(RSVAL) >= 0) JUMP_REL_L(SIMM16, 31); break;
+                default:        unimplemented_opcode(op); break;
+                }
+                break;
+            }
+
+        case 0x02:      /* J */                 JUMP_ABS(UIMM26); break;
+        case 0x03:      /* JAL */               JUMP_ABS_L(UIMM26, 31); break;
+        case 0x04:      /* BEQ */               if (RSVAL == RTVAL) JUMP_REL(SIMM16); break;
+        case 0x05:      /* BNE */               if (RSVAL != RTVAL) JUMP_REL(SIMM16); break;
+        case 0x06:      /* BLEZ */              if ((INT32)RSVAL <= 0) JUMP_REL(SIMM16); break;
+        case 0x07:      /* BGTZ */              if ((INT32)RSVAL > 0) JUMP_REL(SIMM16); break;
+        case 0x08:      /* ADDI */              if (RTREG) RTVAL = (INT32)(RSVAL + SIMM16); break;
+        case 0x09:      /* ADDIU */             if (RTREG) RTVAL = (INT32)(RSVAL + SIMM16); break;
+        case 0x0a:      /* SLTI */              if (RTREG) RTVAL = (INT32)(RSVAL) < ((INT32)SIMM16); break;
+        case 0x0b:      /* SLTIU */             if (RTREG) RTVAL = (UINT32)(RSVAL) < (UINT32)((INT32)SIMM16); break;
+        case 0x0c:      /* ANDI */              if (RTREG) RTVAL = RSVAL & UIMM16; break;
+        case 0x0d:      /* ORI */               if (RTREG) RTVAL = RSVAL | UIMM16; break;
+        case 0x0e:      /* XORI */              if (RTREG) RTVAL = RSVAL ^ UIMM16; break;
+        case 0x0f:      /* LUI */               if (RTREG) RTVAL = UIMM16 << 16; break;
+
+        case 0x10:      /* COP0 */
+            {
+                switch ((op >> 21) & 0x1f)
+                {
+                case 0x00:      /* MFC0 */              if (RTREG) RTVAL = get_cop0_reg(RDREG); break;
+                case 0x04:      /* MTC0 */              set_cop0_reg(RDREG, RTVAL); break;
+                default:
+                    log(M64MSG_WARNING, "unimplemented cop0 %x (%x)\n", (op >> 21) & 0x1f, op);
+                    break;
+                }
+                break;
+            }
+
+        case 0x12:      /* COP2 */
+            {
+                switch ((op >> 21) & 0x1f)
+                {
+                case 0x00:      /* MFC2 */
+                    {
+                        // 31       25      20      15      10     6         0
+                        // ---------------------------------------------------
+                        // | 010010 | 00000 | TTTTT | DDDDD | IIII | 0000000 |
+                        // ---------------------------------------------------
+                        //
+
+                        int el = (op >> 7) & 0xf;
+                        UINT16 b1 = VREG_B(VS1REG, (el+0) & 0xf);
+                        UINT16 b2 = VREG_B(VS1REG, (el+1) & 0xf);
+                        if (RTREG) RTVAL = (INT32)(INT16)((b1 << 8) | (b2));
+                        break;
+                    }
+                case 0x02:      /* CFC2 */
+                    {
+                        // 31       25      20      15      10            0
+                        // ------------------------------------------------
+                        // | 010010 | 00010 | TTTTT | DDDDD | 00000000000 |
+                        // ------------------------------------------------
+                        //
+
+                        if (RTREG)
+                        {
+                            if (RDREG == 2)
+                            {
+                                // Anciliary clipping flags
+                                RTVAL = rsp.flag[RDREG] & 0x00ff;
+                            }
+                            else
+                            {
+                                // All other flags are 16 bits but sign-extended at retrieval
+                                RTVAL = (UINT32)rsp.flag[RDREG] | ( ( rsp.flag[RDREG] & 0x8000 ) ? 0xffff0000 : 0 );
+                            }
+                        }
+                        break;
+
+                    }
+                case 0x04:      /* MTC2 */
+                    {
+                        // 31       25      20      15      10     6         0
+                        // ---------------------------------------------------
+                        // | 010010 | 00100 | TTTTT | DDDDD | IIII | 0000000 |
+                        // ---------------------------------------------------
+                        //
+
+                        int el = (op >> 7) & 0xf;
+                        VREG_B(VS1REG, (el+0) & 0xf) = (RTVAL >> 8) & 0xff;
+                        VREG_B(VS1REG, (el+1) & 0xf) = (RTVAL >> 0) & 0xff;
+                        break;
+                    }
+                case 0x06:      /* CTC2 */
+                    {
+                        // 31       25      20      15      10            0
+                        // ------------------------------------------------
+                        // | 010010 | 00110 | TTTTT | DDDDD | 00000000000 |
+                        // ------------------------------------------------
+                        //
+
+                        rsp.flag[RDREG] = RTVAL & 0xffff;
+                        break;
+                    }
+
+                case 0x10: case 0x11: case 0x12: case 0x13: case 0x14: case 0x15: case 0x16: case 0x17:
+                case 0x18: case 0x19: case 0x1a: case 0x1b: case 0x1c: case 0x1d: case 0x1e: case 0x1f:
+                    {
+                        handle_vector_ops(op);
+                        break;
+                    }
+
+                default:        unimplemented_opcode(op); break;
+                }
+                break;
+            }
+
+        case 0x20:      /* LB */                if (RTREG) RTVAL = (INT32)(INT8)READ8(RSVAL + SIMM16); break;
+        case 0x21:      /* LH */                if (RTREG) RTVAL = (INT32)(INT16)READ16(RSVAL + SIMM16); break;
+        case 0x23:      /* LW */                if (RTREG) RTVAL = READ32(RSVAL + SIMM16); break;
+        case 0x24:      /* LBU */               if (RTREG) RTVAL = (UINT8)READ8(RSVAL + SIMM16); break;
+        case 0x25:      /* LHU */               if (RTREG) RTVAL = (UINT16)READ16(RSVAL + SIMM16); break;
+        case 0x28:      /* SB */                WRITE8(RSVAL + SIMM16, RTVAL); break;
+        case 0x29:      /* SH */                WRITE16(RSVAL + SIMM16, RTVAL); break;
+        case 0x2b:      /* SW */                WRITE32(RSVAL + SIMM16, RTVAL); break;
+        case 0x32:      /* LWC2 */              handle_lwc2(op); break;
+        case 0x3a:      /* SWC2 */              handle_swc2(op); break;
+
+        default:
+            {
+                unimplemented_opcode(op);
+                break;
+            }
+        }
+
+#ifdef RSPTIMING
+        uint64_t time = lasttime;
+        lasttime = RDTSC();
+        rsp_opinfo_t info;
+        rsp_get_opinfo(op, &info);
+        rsptimings[info.op2] += lasttime - time;
+        rspcounts[info.op2]++;
+#endif
+
+#if LOG_INSTRUCTION_EXECUTION
+        {
+            int i, l;
+            static UINT32 prev_regs[32];
+            static VECTOR_REG prev_vecs[32];
+            char string[200];
+            rsp_dasm_one(string, rsp.ppc, op);
+
+            fprintf(exec_output, "%08X: %s", rsp.ppc, string);
+
+            l = strlen(string);
+            if (l < 36)
+            {
+                for (i=l; i < 36; i++)
+                {
+                    fprintf(exec_output, " ");
+                }
+            }
+
+            fprintf(exec_output, "| ");
+
+            for (i=0; i < 32; i++)
+            {
+                if (rsp.r[i] != prev_regs[i])
+                {
+                    fprintf(exec_output, "R%d: %08X ", i, rsp.r[i]);
+                }
+                prev_regs[i] = rsp.r[i];
+            }
+
+            for (i=0; i < 32; i++)
+            {
+                if (rsp.v[i].d[0] != prev_vecs[i].d[0] || rsp.v[i].d[1] != prev_vecs[i].d[1])
+                {
+                    fprintf(exec_output, "V%d: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X ", i,
+                        (UINT16)VREG_S(i,0), (UINT16)VREG_S(i,1), (UINT16)VREG_S(i,2), (UINT16)VREG_S(i,3), (UINT16)VREG_S(i,4), (UINT16)VREG_S(i,5), (UINT16)VREG_S(i,6), (UINT16)VREG_S(i,7));
+                }
+                prev_vecs[i].d[0] = rsp.v[i].d[0];
+                prev_vecs[i].d[1] = rsp.v[i].d[1];
+            }
+
+            fprintf(exec_output, "\n");
+
+        }
+#endif
+        //              --rsp_icount;
+
+        ExecutedCycles++;
+        if( rsp_sp_status & SP_STATUS_SSTEP )
+        {
+            if( rsp.step_count )
+            {
+                rsp.step_count--;
+            }
+            else
+            {
+                rsp_sp_status |= SP_STATUS_BROKE;
+            }
+        }
+
+        if( rsp_sp_status & (SP_STATUS_HALT|SP_STATUS_BROKE))
+        {
+            rsp_icount = 0;
+
+            if(BreakMarker==0)
+                log(M64MSG_WARNING, "Quit due to SP halt/broke set by MTC0\n");
+        }
+
+        ///WDC&SR64 hack:VERSION3:1.8x -2x FASTER & safer
+        if((WDCHackFlag1==0)&&(rsp.ppc>0x137)&&(rsp.ppc<0x14D))
+            WDCHackFlag1=ExecutedCycles;
+        if ((WDCHackFlag1!=0)&&((rsp.ppc<=0x137)||(rsp.ppc>=0x14D)))
+            WDCHackFlag1=0;
+        if ((WDCHackFlag1!=0)&&((ExecutedCycles-WDCHackFlag1)>=0x20)&&(rsp.ppc>0x137)&&(rsp.ppc<0x14D)) 
+        {
+            //      printf("WDC hack quit 1\n");
+            rsp_icount=0;//32 cycles should be enough
+        }
+        if((WDCHackFlag2==0)&&(rsp.ppc>0xFCB)&&(rsp.ppc<0xFD5))
+            WDCHackFlag2=ExecutedCycles;
+        if ((WDCHackFlag2!=0)&&((rsp.ppc<=0xFCB)||(rsp.ppc>=0xFD5)))
+            WDCHackFlag2=0;
+        if ((WDCHackFlag2!=0)&&((ExecutedCycles-WDCHackFlag2)>=0x20)&&(rsp.ppc>0xFCB)&&(rsp.ppc<0xFD5)) 
+        {
+            //      printf("WDC hack quit 2\n");
+            rsp_icount=0;//32 cycles should be enough
+        }
+
+
+    }
+    //sp_pc -= 4;
+
+    return ExecutedCycles;
+}
+
+/*****************************************************************************/
+
+
+enum sp_dma_direction
+{
+       SP_DMA_RDRAM_TO_IDMEM,
+       SP_DMA_IDMEM_TO_RDRAM
+};
+
+static void sp_dma(enum sp_dma_direction direction)
+{
+    UINT8 *src, *dst;
+    int i, j;
+    int length;
+    int count;
+    int skip;
+
+
+    UINT32 l = sp_dma_length;
+    length = ((l & 0xfff) | 7) + 1;
+    skip = (l >> 20) + length;
+    count = ((l >> 12) & 0xff) + 1;
+
+    if (direction == SP_DMA_RDRAM_TO_IDMEM) // RDRAM -> I/DMEM
+    {
+        //UINT32 src_address = sp_dram_addr & ~7;
+        //UINT32 dst_address = (sp_mem_addr & 0x1000) ? 0x4001000 : 0x4000000;
+        src = (UINT8*)&rdram[(sp_dram_addr&~7) / 4];
+        dst = (sp_mem_addr & 0x1000) ? (UINT8*)&rsp_imem[(sp_mem_addr & ~7 & 0xfff) / 4] : (UINT8*)&rsp_dmem[(sp_mem_addr & ~7 &0xfff) / 4];
+        ///cpuintrf_push_context(0);
+#define BYTE8_XOR_BE(a) ((a)^7)// JFG, Ocarina of Time
+
+        for (j=0; j < count; j++)
+        {
+            for (i=0; i < length; i++)
+            {
+                ///UINT8 b = program_read_byte_64be(src_address + i + (j*skip));
+                ///program_write_byte_64be(dst_address + (((sp_mem_addr & ~7) + i + (j*length)) & 0xfff), b);
+                dst[BYTE8_XOR_BE((i + j*length)&0xfff)] = src[BYTE8_XOR_BE(i + j*skip)];
+            }
+        }
+
+        ///cpuintrf_pop_context();
+        *z64_rspinfo.SP_DMA_BUSY_REG = 0;
+        *z64_rspinfo.SP_STATUS_REG  &= ~SP_STATUS_DMABUSY;
+    }
+    else if (direction == SP_DMA_IDMEM_TO_RDRAM) // I/DMEM -> RDRAM
+    {
+        //UINT32 dst_address = sp_dram_addr & ~7;
+        //UINT32 src_address = (sp_mem_addr & 0x1000) ? 0x4001000 : 0x4000000;
+
+        dst = (UINT8*)&rdram[(sp_dram_addr&~7) / 4];
+        src = (sp_mem_addr & 0x1000) ? (UINT8*)&rsp_imem[(sp_mem_addr & ~7 & 0xfff) / 4] : (UINT8*)&rsp_dmem[(sp_mem_addr & ~7 &0xfff) / 4];
+        ///cpuintrf_push_context(0);
+
+        for (j=0; j < count; j++)
+        {
+            for (i=0; i < length; i++)
+            {
+                ///UINT8 b = program_read_byte_64be(src_address + (((sp_mem_addr & ~7) + i + (j*length)) & 0xfff));
+                ///program_write_byte_64be(dst_address + i + (j*skip), b);
+                dst[BYTE8_XOR_BE(i + j*skip)] = src[BYTE8_XOR_BE((+i + j*length)&0xfff)];
+            }
+        }
+
+        ///cpuintrf_pop_context();
+        *z64_rspinfo.SP_DMA_BUSY_REG = 0;
+        *z64_rspinfo.SP_STATUS_REG  &= ~SP_STATUS_DMABUSY;
+    }
+
+
+}
+
+
+
+
+
+UINT32 n64_sp_reg_r(UINT32 offset, UINT32 dummy)
+{
+    switch (offset)
+    {
+    case 0x00/4:            // SP_MEM_ADDR_REG
+        return sp_mem_addr;
+
+    case 0x04/4:            // SP_DRAM_ADDR_REG
+        return sp_dram_addr;
+
+    case 0x08/4:            // SP_RD_LEN_REG
+        return sp_dma_rlength;
+
+    case 0x10/4:            // SP_STATUS_REG
+        return rsp_sp_status;
+
+    case 0x14/4:            // SP_DMA_FULL_REG
+        return 0;
+
+    case 0x18/4:            // SP_DMA_BUSY_REG
+        return 0;
+
+    case 0x1c/4:            // SP_SEMAPHORE_REG
+        return sp_semaphore;
+
+    default:
+        log(M64MSG_WARNING, "sp_reg_r: %08X\n", offset);
+        break;
+    }
+
+    return 0;
+}
+
+//UINT32 n64_sp_reg_w(RSP_REGS & rsp, UINT32 offset, UINT32 data, UINT32 dummy)
+void n64_sp_reg_w(UINT32 offset, UINT32 data, UINT32 dummy)
+{
+    UINT32 InterruptPending=0;
+    if ((offset & 0x10000) == 0)
+    {
+        switch (offset & 0xffff)
+        {
+        case 0x00/4:            // SP_MEM_ADDR_REG
+            sp_mem_addr = data;
+            break;
+
+        case 0x04/4:            // SP_DRAM_ADDR_REG
+            sp_dram_addr = data & 0xffffff;
+            break;
+
+        case 0x08/4:            // SP_RD_LEN_REG
+            //                              sp_dma_length = data & 0xfff;
+            //                              sp_dma_count = (data >> 12) & 0xff;
+            //                              sp_dma_skip = (data >> 20) & 0xfff;
+            sp_dma_length=data;
+            sp_dma(SP_DMA_RDRAM_TO_IDMEM);
+            break;
+
+        case 0x0c/4:            // SP_WR_LEN_REG
+            //                              sp_dma_length = data & 0xfff;
+            //                              sp_dma_count = (data >> 12) & 0xff;
+            //                              sp_dma_skip = (data >> 20) & 0xfff;
+            sp_dma_length=data;
+            sp_dma(SP_DMA_IDMEM_TO_RDRAM);
+            break;
+
+        case 0x10/4:            // SP_STATUS_REG
+            {
+                if((data&0x1)&&(data&0x2)) 
+                    log(M64MSG_ERROR, "Clear halt and set halt simultaneously\n");
+                if((data&0x8)&&(data&0x10))
+                    log(M64MSG_ERROR, "Clear int and set int simultaneously\n");
+                if((data&0x20)&&(data&0x40)) 
+                    log(M64MSG_ERROR, "Clear sstep and set sstep simultaneously\n");
+                if (data & 0x00000001)          // clear halt
+                {
+                    rsp_sp_status &= ~SP_STATUS_HALT;
+
+                    //                                      if (first_rsp)
+                    //                                      {
+                    //                                              cpu_spinuntil_trigger(6789);
+
+                    //                                              cpunum_set_input_line(1, INPUT_LINE_HALT, CLEAR_LINE);
+                    //                                              rsp_sp_status &= ~SP_STATUS_HALT;
+                    //                                      }
+                    //                                      else
+                    //                                      {
+                    //                                              first_rsp = 1;
+                    //                                      }
+                }
+                if (data & 0x00000002)          // set halt
+                {
+                    //                                      cpunum_set_input_line(1, INPUT_LINE_HALT, ASSERT_LINE);
+                    rsp_sp_status |= SP_STATUS_HALT;
+                }
+                if (data & 0x00000004) rsp_sp_status &= ~SP_STATUS_BROKE;               // clear broke
+                if (data & 0x00000008)          // clear interrupt
+                {
+                    *z64_rspinfo.MI_INTR_REG &= ~R4300i_SP_Intr;
+                    ///TEMPORARY COMMENTED FOR SPEED
+                    ///               printf("sp_reg_w clear interrupt");
+                    //clear_rcp_interrupt(SP_INTERRUPT);
+                }
+                if (data & 0x00000010)          // set interrupt
+                {
+                    //signal_rcp_interrupt(SP_INTERRUPT);
+                }
+                if (data & 0x00000020) rsp_sp_status &= ~SP_STATUS_SSTEP;               // clear single step
+                if (data & 0x00000040) {
+                    rsp_sp_status |= SP_STATUS_SSTEP;  // set single step
+                    log(M64MSG_STATUS, "RSP STATUS REG: SSTEP set\n");
+                }
+                if (data & 0x00000080) rsp_sp_status &= ~SP_STATUS_INTR_BREAK;  // clear interrupt on break
+                if (data & 0x00000100) rsp_sp_status |= SP_STATUS_INTR_BREAK;   // set interrupt on break
+                if (data & 0x00000200) rsp_sp_status &= ~SP_STATUS_SIGNAL0;     // clear signal 0
+                if (data & 0x00000400) rsp_sp_status |= SP_STATUS_SIGNAL0;      // set signal 0
+                if (data & 0x00000800) rsp_sp_status &= ~SP_STATUS_SIGNAL1;     // clear signal 1
+                if (data & 0x00001000) rsp_sp_status |= SP_STATUS_SIGNAL1;      // set signal 1
+                if (data & 0x00002000) rsp_sp_status &= ~SP_STATUS_SIGNAL2;     // clear signal 2
+                if (data & 0x00004000) rsp_sp_status |= SP_STATUS_SIGNAL2;      // set signal 2
+                if (data & 0x00008000) rsp_sp_status &= ~SP_STATUS_SIGNAL3;     // clear signal 3
+                if (data & 0x00010000) rsp_sp_status |= SP_STATUS_SIGNAL3;      // set signal 3
+                if (data & 0x00020000) rsp_sp_status &= ~SP_STATUS_SIGNAL4;     // clear signal 4
+                if (data & 0x00040000) rsp_sp_status |= SP_STATUS_SIGNAL4;      // set signal 4
+                if (data & 0x00080000) rsp_sp_status &= ~SP_STATUS_SIGNAL5;     // clear signal 5
+                if (data & 0x00100000) rsp_sp_status |= SP_STATUS_SIGNAL5;      // set signal 5
+                if (data & 0x00200000) rsp_sp_status &= ~SP_STATUS_SIGNAL6;     // clear signal 6
+                if (data & 0x00400000) rsp_sp_status |= SP_STATUS_SIGNAL6;      // set signal 6
+                if (data & 0x00800000) rsp_sp_status &= ~SP_STATUS_SIGNAL7;     // clear signal 7
+                if (data & 0x01000000) rsp_sp_status |= SP_STATUS_SIGNAL7;      // set signal 7
+
+                if(InterruptPending==1)
+                {
+                    *z64_rspinfo.MI_INTR_REG |= 1;
+                    z64_rspinfo.CheckInterrupts();
+                    InterruptPending=0;
+                }
+                break;
+            }
+
+        case 0x1c/4:            // SP_SEMAPHORE_REG
+            sp_semaphore = data;
+            //      mame_printf_debug("sp_semaphore = %08X\n", sp_semaphore);
+            break;
+
+        default:
+            log(M64MSG_WARNING, "sp_reg_w: %08X, %08X\n", data, offset);
+            break;
+        }
+    }
+    else
+    {
+        switch (offset & 0xffff)
+        {
+        case 0x00/4:            // SP_PC_REG
+            //cpunum_set_info_int(1, CPUINFO_INT_PC, 0x04001000 | (data & 0xfff));
+            //break;
+
+        default:
+            log(M64MSG_WARNING, "sp_reg_w: %08X, %08X\n", data, offset);
+            break;
+        }
+    }
+}
+
+UINT32 sp_read_reg(UINT32 reg)
+{
+    switch (reg)
+    {
+        //case 4:       return rsp_sp_status;
+    default:        return n64_sp_reg_r(reg, 0x00000000);
+    }
+}
+
+
+void sp_write_reg(UINT32 reg, UINT32 data)
+{
+    switch (reg)
+    {
+    default:        n64_sp_reg_w(reg, data, 0x00000000); break;
+    }
+}