git subrepo pull --force deps/lightrec
[pcsx_rearmed.git] / deps / lightrec / lightrec.c
index be4da10..13434b4 100644 (file)
@@ -36,20 +36,20 @@ static void lightrec_mtc2(struct lightrec_state *state, u8 reg, u32 data);
 static u32 lightrec_mfc2(struct lightrec_state *state, u8 reg);
 
 static void lightrec_default_sb(struct lightrec_state *state, u32 opcode,
-                               void *host, u32 addr, u8 data)
+                               void *host, u32 addr, u32 data)
 {
-       *(u8 *)host = data;
+       *(u8 *)host = (u8)data;
 
-       if (!state->invalidate_from_dma_only)
+       if (!(state->opt_flags & LIGHTREC_OPT_INV_DMA_ONLY))
                lightrec_invalidate(state, addr, 1);
 }
 
 static void lightrec_default_sh(struct lightrec_state *state, u32 opcode,
-                               void *host, u32 addr, u16 data)
+                               void *host, u32 addr, u32 data)
 {
-       *(u16 *)host = HTOLE16(data);
+       *(u16 *)host = HTOLE16((u16)data);
 
-       if (!state->invalidate_from_dma_only)
+       if (!(state->opt_flags & LIGHTREC_OPT_INV_DMA_ONLY))
                lightrec_invalidate(state, addr, 2);
 }
 
@@ -58,7 +58,7 @@ static void lightrec_default_sw(struct lightrec_state *state, u32 opcode,
 {
        *(u32 *)host = HTOLE32(data);
 
-       if (!state->invalidate_from_dma_only)
+       if (!(state->opt_flags & LIGHTREC_OPT_INV_DMA_ONLY))
                lightrec_invalidate(state, addr, 4);
 }
 
@@ -80,6 +80,27 @@ static u32 lightrec_default_lw(struct lightrec_state *state,
        return LE32TOH(*(u32 *)host);
 }
 
+static u32 lightrec_default_lwu(struct lightrec_state *state,
+                               u32 opcode, void *host, u32 addr)
+{
+       u32 val; /* raw 32-bit word read from host; state/opcode/addr unused */
+
+       memcpy(&val, host, 4); /* memcpy, so host needs no u32 alignment */
+
+       return LE32TOH(val); /* same conversion as lightrec_default_lw */
+}
+
+static void lightrec_default_swu(struct lightrec_state *state, u32 opcode,
+                                void *host, u32 addr, u32 data)
+{
+       data = HTOLE32(data); /* same conversion as lightrec_default_sw */
+
+       memcpy(host, &data, 4); /* memcpy, so host needs no u32 alignment */
+
+       if (!(state->opt_flags & LIGHTREC_OPT_INV_DMA_ONLY))
+               lightrec_invalidate(state, addr & ~0x3, 8); /* round down, 8 bytes: both words a 4-byte store can touch */
+}
+
 static const struct lightrec_mem_map_ops lightrec_default_ops = {
        .sb = lightrec_default_sb,
        .sh = lightrec_default_sh,
@@ -87,6 +108,8 @@ static const struct lightrec_mem_map_ops lightrec_default_ops = {
        .lb = lightrec_default_lb,
        .lh = lightrec_default_lh,
        .lw = lightrec_default_lw,
+       .lwu = lightrec_default_lwu,
+       .swu = lightrec_default_swu,
 };
 
 static void __segfault_cb(struct lightrec_state *state, u32 addr,
@@ -94,9 +117,9 @@ static void __segfault_cb(struct lightrec_state *state, u32 addr,
 {
        lightrec_set_exit_flags(state, LIGHTREC_EXIT_SEGFAULT);
        pr_err("Segmentation fault in recompiled code: invalid "
-              "load/store at address 0x%08x\n", addr);
+              "load/store at address "PC_FMT"\n", addr);
        if (block)
-               pr_err("Was executing block PC 0x%08x\n", block->pc);
+               pr_err("Was executing block "PC_FMT"\n", block->pc);
 }
 
 static void lightrec_swl(struct lightrec_state *state,
@@ -198,7 +221,7 @@ static void lightrec_invalidate_map(struct lightrec_state *state,
        }
 }
 
-enum psx_map
+static enum psx_map
 lightrec_get_map_idx(struct lightrec_state *state, u32 kaddr)
 {
        const struct lightrec_mem_map *map;
@@ -237,26 +260,43 @@ lightrec_get_map(struct lightrec_state *state, void **host, u32 kaddr)
        return map;
 }
 
-u32 lightrec_rw(struct lightrec_state *state, union code op,
-               u32 addr, u32 data, u32 *flags, struct block *block)
+u32 lightrec_rw(struct lightrec_state *state, union code op, u32 base,
+               u32 data, u32 *flags, struct block *block, u16 offset)
 {
        const struct lightrec_mem_map *map;
        const struct lightrec_mem_map_ops *ops;
        u32 opcode = op.opcode;
+       bool was_tagged = true;
+       u16 old_flags;
+       u32 addr;
        void *host;
 
-       addr += (s16) op.i.imm;
+       addr = kunseg(base + (s16) op.i.imm);
 
-       map = lightrec_get_map(state, &host, kunseg(addr));
+       map = lightrec_get_map(state, &host, addr);
        if (!map) {
                __segfault_cb(state, addr, block);
                return 0;
        }
 
+       if (flags)
+               was_tagged = LIGHTREC_FLAGS_GET_IO_MODE(*flags);
 
        if (likely(!map->ops)) {
-               if (flags && !LIGHTREC_FLAGS_GET_IO_MODE(*flags))
-                       *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_DIRECT);
+               if (flags && !LIGHTREC_FLAGS_GET_IO_MODE(*flags)) {
+                       /* Force parallel port accesses as HW accesses, because
+                        * the direct-I/O emitters can't differentiate them. */
+                       if (unlikely(map == &state->maps[PSX_MAP_PARALLEL_PORT]))
+                               *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_HW);
+                       /* If the base register is 0x0, be extra suspicious.
+                        * Some games (e.g. Sled Storm) actually do segmentation
+                        * faults by using uninitialized pointers, which are
+                        * later initialized to point to hardware registers. */
+                       else if (op.i.rs && base == 0x0)
+                               *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_HW);
+                       else
+                               *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_DIRECT);
+               }
 
                ops = &lightrec_default_ops;
        } else if (flags &&
@@ -269,12 +309,23 @@ u32 lightrec_rw(struct lightrec_state *state, union code op,
                ops = map->ops;
        }
 
+       if (!was_tagged) {
+               old_flags = block_set_flags(block, BLOCK_SHOULD_RECOMPILE);
+
+               if (!(old_flags & BLOCK_SHOULD_RECOMPILE)) {
+                       pr_debug("Opcode of block at "PC_FMT" has been tagged"
+                                " - flag for recompilation\n", block->pc);
+
+                       lut_write(state, lut_offset(block->pc), NULL);
+               }
+       }
+
        switch (op.i.op) {
        case OP_SB:
-               ops->sb(state, opcode, host, addr, (u8) data);
+               ops->sb(state, opcode, host, addr, data);
                return 0;
        case OP_SH:
-               ops->sh(state, opcode, host, addr, (u16) data);
+               ops->sh(state, opcode, host, addr, data);
                return 0;
        case OP_SWL:
                lightrec_swl(state, ops, opcode, host, addr, data);
@@ -303,6 +354,11 @@ u32 lightrec_rw(struct lightrec_state *state, union code op,
                return lightrec_lwl(state, ops, opcode, host, addr, data);
        case OP_LWR:
                return lightrec_lwr(state, ops, opcode, host, addr, data);
+       case OP_META_LWU:
+               return ops->lwu(state, opcode, host, addr);
+       case OP_META_SWU:
+               ops->swu(state, opcode, host, addr, data);
+               return 0;
        case OP_LW:
        default:
                return ops->lw(state, opcode, host, addr);
@@ -311,10 +367,10 @@ u32 lightrec_rw(struct lightrec_state *state, union code op,
 
 static void lightrec_rw_helper(struct lightrec_state *state,
                               union code op, u32 *flags,
-                              struct block *block)
+                              struct block *block, u16 offset)
 {
        u32 ret = lightrec_rw(state, op, state->regs.gpr[op.i.rs],
-                             state->regs.gpr[op.i.rt], flags, block);
+                             state->regs.gpr[op.i.rt], flags, block, offset);
 
        switch (op.i.op) {
        case OP_LB:
@@ -324,8 +380,13 @@ static void lightrec_rw_helper(struct lightrec_state *state,
        case OP_LWL:
        case OP_LWR:
        case OP_LW:
-               if (op.i.rt)
+       case OP_META_LWU:
+               if (OPT_HANDLE_LOAD_DELAYS && unlikely(!state->in_delay_slot_n)) {
+                       state->temp_reg = ret;
+                       state->in_delay_slot_n = 0xff;
+               } else if (op.i.rt) {
                        state->regs.gpr[op.i.rt] = ret;
+               }
                fallthrough;
        default:
                break;
@@ -334,41 +395,26 @@ static void lightrec_rw_helper(struct lightrec_state *state,
 
 static void lightrec_rw_cb(struct lightrec_state *state, u32 arg)
 {
-       lightrec_rw_helper(state, (union code) arg, NULL, NULL);
+       lightrec_rw_helper(state, (union code) arg, NULL, NULL, 0);
 }
 
 static void lightrec_rw_generic_cb(struct lightrec_state *state, u32 arg)
 {
        struct block *block;
        struct opcode *op;
-       bool was_tagged;
        u16 offset = (u16)arg;
-       u16 old_flags;
 
        block = lightrec_find_block_from_lut(state->block_cache,
-                                            arg >> 16, state->next_pc);
+                                            arg >> 16, state->curr_pc);
        if (unlikely(!block)) {
-               pr_err("rw_generic: No block found in LUT for PC 0x%x offset 0x%x\n",
-                        state->next_pc, offset);
+               pr_err("rw_generic: No block found in LUT for "PC_FMT" offset 0x%"PRIx16"\n",
+                        state->curr_pc, offset);
                lightrec_set_exit_flags(state, LIGHTREC_EXIT_SEGFAULT);
                return;
        }
 
        op = &block->opcode_list[offset];
-       was_tagged = LIGHTREC_FLAGS_GET_IO_MODE(op->flags);
-
-       lightrec_rw_helper(state, op->c, &op->flags, block);
-
-       if (!was_tagged) {
-               old_flags = block_set_flags(block, BLOCK_SHOULD_RECOMPILE);
-
-               if (!(old_flags & BLOCK_SHOULD_RECOMPILE)) {
-                       pr_debug("Opcode of block at PC 0x%08x has been tagged"
-                                " - flag for recompilation\n", block->pc);
-
-                       lut_write(state, lut_offset(block->pc), NULL);
-               }
-       }
+       lightrec_rw_helper(state, op->c, &op->flags, block, offset);
 }
 
 static u32 clamp_s32(s32 val, s32 min, s32 max)
@@ -428,7 +474,10 @@ u32 lightrec_mfc(struct lightrec_state *state, union code op)
 
        if (op.i.op == OP_CP0)
                return state->regs.cp0[op.r.rd];
-       else if (op.r.rs == OP_CP2_BASIC_MFC2)
+
+       if (op.i.op == OP_SWC2) {
+               val = lightrec_mfc2(state, op.i.rt);
+       } else if (op.r.rs == OP_CP2_BASIC_MFC2)
                val = lightrec_mfc2(state, op.r.rd);
        else {
                val = state->regs.cp2c[op.r.rd];
@@ -458,7 +507,9 @@ static void lightrec_mfc_cb(struct lightrec_state *state, union code op)
 {
        u32 rt = lightrec_mfc(state, op);
 
-       if (op.r.rt)
+       if (op.i.op == OP_SWC2)
+               state->temp_reg = rt;
+       else if (op.r.rt)
                state->regs.gpr[op.r.rt] = rt;
 }
 
@@ -502,7 +553,7 @@ static void lightrec_mtc0(struct lightrec_state *state, u8 reg, u32 data)
                status = state->regs.cp0[12];
 
                /* Handle software interrupts */
-               if (!!(status & cause & 0x300) & status)
+               if ((!!(status & cause & 0x300)) & status)
                        lightrec_set_exit_flags(state, LIGHTREC_EXIT_CHECK_INTERRUPT);
 
                /* Handle hardware interrupts */
@@ -576,15 +627,15 @@ static void lightrec_ctc2(struct lightrec_state *state, u8 reg, u32 data)
        }
 }
 
-void lightrec_mtc(struct lightrec_state *state, union code op, u32 data)
+void lightrec_mtc(struct lightrec_state *state, union code op, u8 reg, u32 data)
 {
        if (op.i.op == OP_CP0) {
-               lightrec_mtc0(state, op.r.rd, data);
+               lightrec_mtc0(state, reg, data);
        } else {
-               if (op.r.rs == OP_CP2_BASIC_CTC2)
-                       lightrec_ctc2(state, op.r.rd, data);
+               if (op.i.op == OP_LWC2 || op.r.rs != OP_CP2_BASIC_CTC2)
+                       lightrec_mtc2(state, reg, data);
                else
-                       lightrec_mtc2(state, op.r.rd, data);
+                       lightrec_ctc2(state, reg, data);
 
                if (state->ops.cop2_notify)
                        (*state->ops.cop2_notify)(state, op.opcode, data);
@@ -594,8 +645,18 @@ void lightrec_mtc(struct lightrec_state *state, union code op, u32 data)
 static void lightrec_mtc_cb(struct lightrec_state *state, u32 arg)
 {
        union code op = (union code) arg;
+       u32 data;
+       u8 reg;
 
-       lightrec_mtc(state, op, state->regs.gpr[op.r.rt]);
+       if (op.i.op == OP_LWC2) {
+               data = state->temp_reg;
+               reg = op.i.rt;
+       } else {
+               data = state->regs.gpr[op.r.rt];
+               reg = op.r.rd;
+       }
+
+       lightrec_mtc(state, op, reg, data);
 }
 
 void lightrec_rfe(struct lightrec_state *state)
@@ -633,7 +694,7 @@ static struct block * lightrec_get_block(struct lightrec_state *state, u32 pc)
        u8 old_flags;
 
        if (block && lightrec_block_is_outdated(state, block)) {
-               pr_debug("Block at PC 0x%08x is outdated!\n", block->pc);
+               pr_debug("Block at "PC_FMT" is outdated!\n", block->pc);
 
                old_flags = block_set_flags(block, BLOCK_IS_DEAD);
                if (!(old_flags & BLOCK_IS_DEAD)) {
@@ -653,7 +714,7 @@ static struct block * lightrec_get_block(struct lightrec_state *state, u32 pc)
        if (!block) {
                block = lightrec_precompile_block(state, pc);
                if (!block) {
-                       pr_err("Unable to recompile block at PC 0x%x\n", pc);
+                       pr_err("Unable to recompile block at "PC_FMT"\n", pc);
                        lightrec_set_exit_flags(state, LIGHTREC_EXIT_SEGFAULT);
                        return NULL;
                }
@@ -671,7 +732,7 @@ static void * get_next_block_func(struct lightrec_state *state, u32 pc)
        void *func;
        int err;
 
-       for (;;) {
+       do {
                func = lut_read(state, lut_offset(pc));
                if (func && func != state->get_next_block)
                        break;
@@ -688,10 +749,11 @@ static void * get_next_block_func(struct lightrec_state *state, u32 pc)
                }
 
                should_recompile = block_has_flag(block, BLOCK_SHOULD_RECOMPILE) &&
+                       !block_has_flag(block, BLOCK_NEVER_COMPILE) &&
                        !block_has_flag(block, BLOCK_IS_DEAD);
 
                if (unlikely(should_recompile)) {
-                       pr_debug("Block at PC 0x%08x should recompile\n", pc);
+                       pr_debug("Block at "PC_FMT" should recompile\n", pc);
 
                        if (ENABLE_THREADED_COMPILER) {
                                lightrec_recompiler_add(state->rec, block);
@@ -740,13 +802,10 @@ static void * get_next_block_func(struct lightrec_state *state, u32 pc)
                } else {
                        lightrec_recompiler_add(state->rec, block);
                }
+       } while (state->exit_flags == LIGHTREC_EXIT_NORMAL
+                && state->current_cycle < state->target_cycle);
 
-               if (state->exit_flags != LIGHTREC_EXIT_NORMAL ||
-                   state->current_cycle >= state->target_cycle)
-                       break;
-       }
-
-       state->next_pc = pc;
+       state->curr_pc = pc;
        return func;
 }
 
@@ -791,6 +850,8 @@ static void lightrec_free_code(struct lightrec_state *state, void *ptr)
                lightrec_code_alloc_unlock(state);
 }
 
+static char lightning_code_data[0x80000];
+
 static void * lightrec_emit_code(struct lightrec_state *state,
                                 const struct block *block,
                                 jit_state_t *_jit, unsigned int *size)
@@ -801,7 +862,9 @@ static void * lightrec_emit_code(struct lightrec_state *state,
 
        jit_realize();
 
-       if (!ENABLE_DISASSEMBLER)
+       if (ENABLE_DISASSEMBLER)
+               jit_set_data(lightning_code_data, sizeof(lightning_code_data), 0);
+       else
                jit_set_data(NULL, 0, JIT_DISABLE_DATA | JIT_DISABLE_NOTE);
 
        if (has_code_buffer) {
@@ -847,6 +910,9 @@ static void * lightrec_emit_code(struct lightrec_state *state,
 
        *size = (unsigned int) new_code_size;
 
+       if (state->ops.code_inv)
+               state->ops.code_inv(code, new_code_size);
+
        return code;
 }
 
@@ -857,6 +923,15 @@ static struct block * generate_wrapper(struct lightrec_state *state)
        unsigned int i;
        jit_node_t *addr[C_WRAPPERS_COUNT - 1];
        jit_node_t *to_end[C_WRAPPERS_COUNT - 1];
+       u8 tmp = JIT_R1;
+
+#ifdef __sh__
+       /* On SH, GBR-relative loads target the r0 register.
+        * Use it as the temporary register to factorize the move to
+        * JIT_R1. */
+       if (LIGHTREC_REG_STATE == _GBR)
+               tmp = _R0;
+#endif
 
        block = lightrec_malloc(state, MEM_FOR_IR, sizeof(*block));
        if (!block)
@@ -875,17 +950,18 @@ static struct block * generate_wrapper(struct lightrec_state *state)
 
        /* Add entry points */
        for (i = C_WRAPPERS_COUNT - 1; i > 0; i--) {
-               jit_ldxi(JIT_R1, LIGHTREC_REG_STATE,
+               jit_ldxi(tmp, LIGHTREC_REG_STATE,
                         offsetof(struct lightrec_state, c_wrappers[i]));
                to_end[i - 1] = jit_b();
                addr[i - 1] = jit_indirect();
        }
 
-       jit_ldxi(JIT_R1, LIGHTREC_REG_STATE,
+       jit_ldxi(tmp, LIGHTREC_REG_STATE,
                 offsetof(struct lightrec_state, c_wrappers[0]));
 
        for (i = 0; i < C_WRAPPERS_COUNT - 1; i++)
                jit_patch(to_end[i]);
+       jit_movr(JIT_R1, tmp);
 
        jit_epilog();
        jit_prolog();
@@ -971,27 +1047,78 @@ static u32 lightrec_memset(struct lightrec_state *state)
        u32 length = state->regs.gpr[5] * 4;
 
        if (!map) {
-               pr_err("Unable to find memory map for memset target address "
-                      "0x%x\n", kunseg_pc);
+               pr_err("Unable to find memory map for memset target address "PC_FMT"\n",
+                      kunseg_pc);
                return 0;
        }
 
-       pr_debug("Calling host memset, PC 0x%x (host address 0x%" PRIxPTR ") for %u bytes\n",
+       pr_debug("Calling host memset, "PC_FMT" (host address 0x%"PRIxPTR") for %u bytes\n",
                 kunseg_pc, (uintptr_t)host, length);
        memset(host, 0, length);
 
-       if (!state->invalidate_from_dma_only)
+       if (!(state->opt_flags & LIGHTREC_OPT_INV_DMA_ONLY))
                lightrec_invalidate_map(state, map, kunseg_pc, length);
 
        /* Rough estimation of the number of cycles consumed */
        return 8 + 5 * (length  + 3 / 4);
 }
 
+static u32 lightrec_check_load_delay(struct lightrec_state *state, u32 pc, u8 reg)
+{
+       struct block *block;
+       union code first_op;
+
+       first_op = lightrec_read_opcode(state, pc); /* opcode located at pc */
+
+       if (likely(!opcode_reads_register(first_op, reg))) {
+               state->regs.gpr[reg] = state->temp_reg; /* reg not read: commit temp_reg */
+       } else {
+               block = lightrec_get_block(state, pc);
+               if (unlikely(!block)) {
+                       pr_err("Unable to get block at "PC_FMT"\n", pc);
+                       lightrec_set_exit_flags(state, LIGHTREC_EXIT_SEGFAULT);
+                       pc = 0; /* no block: return PC 0 with the segfault flag set */
+               } else {
+                       pc = lightrec_handle_load_delay(state, block, pc, reg); /* its result is the PC returned */
+               }
+       }
+
+       return pc; /* the dispatcher reads this back into JIT_V0 */
+}
+
+static void update_cycle_counter_before_c(jit_state_t *_jit)
+{
+       /* current_cycle = target_cycle - LIGHTREC_REG_CYCLE; clobbers JIT_R1/R2 */
+       jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE,
+                  offsetof(struct lightrec_state, target_cycle));
+       jit_subr(JIT_R1, JIT_R2, LIGHTREC_REG_CYCLE);
+       jit_stxi_i(offsetof(struct lightrec_state, current_cycle),
+                  LIGHTREC_REG_STATE, JIT_R1);
+}
+
+static void update_cycle_counter_after_c(jit_state_t *_jit)
+{
+       /* LIGHTREC_REG_CYCLE = target_cycle - current_cycle; clobbers JIT_R1/R2 */
+       jit_ldxi_i(JIT_R1, LIGHTREC_REG_STATE,
+                  offsetof(struct lightrec_state, current_cycle));
+       jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE,
+                  offsetof(struct lightrec_state, target_cycle));
+       jit_subr(LIGHTREC_REG_CYCLE, JIT_R2, JIT_R1);
+}
+
+static void sync_next_pc(jit_state_t *_jit)
+{
+       if (lightrec_store_next_pc()) { /* otherwise nothing is emitted */
+               jit_ldxi_ui(JIT_V0, LIGHTREC_REG_STATE,
+                           offsetof(struct lightrec_state, next_pc)); /* JIT_V0 = state->next_pc */
+       }
+}
+
 static struct block * generate_dispatcher(struct lightrec_state *state)
 {
        struct block *block;
        jit_state_t *_jit;
-       jit_node_t *to_end, *loop, *addr, *addr2, *addr3;
+       jit_node_t *to_end, *loop, *addr, *addr2, *addr3, *addr4, *addr5, *jmp, *jmp2;
        unsigned int i;
        u32 offset;
 
@@ -1009,6 +1136,8 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
        jit_prolog();
        jit_frame(256);
 
+       jit_getarg(LIGHTREC_REG_STATE, jit_arg());
+       jit_getarg(JIT_V0, jit_arg());
        jit_getarg(JIT_V1, jit_arg());
        jit_getarg_i(LIGHTREC_REG_CYCLE, jit_arg());
 
@@ -1016,10 +1145,6 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
        for (i = 0; i < NUM_REGS; i++)
                jit_movr(JIT_V(i + FIRST_REG), JIT_V(i + FIRST_REG));
 
-       /* Pass lightrec_state structure to blocks, using the last callee-saved
-        * register that Lightning provides */
-       jit_movi(LIGHTREC_REG_STATE, (intptr_t) state);
-
        loop = jit_label();
 
        /* Call the block's code */
@@ -1034,21 +1159,82 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
 
                jit_prepare();
                jit_pushargr(LIGHTREC_REG_STATE);
+
                jit_finishi(lightrec_memset);
+               jit_retval(LIGHTREC_REG_CYCLE);
 
                jit_ldxi_ui(JIT_V0, LIGHTREC_REG_STATE,
                            offsetof(struct lightrec_state, regs.gpr[31]));
-
-               jit_retval(LIGHTREC_REG_CYCLE);
                jit_subr(LIGHTREC_REG_CYCLE, JIT_V1, LIGHTREC_REG_CYCLE);
+
+               if (OPT_DETECT_IMPOSSIBLE_BRANCHES || OPT_HANDLE_LOAD_DELAYS)
+                       jmp = jit_b();
+       }
+
+       if (OPT_DETECT_IMPOSSIBLE_BRANCHES) {
+               /* Blocks will jump here when they reach a branch that should
+                * be executed with the interpreter, passing the branch's PC
+                * in JIT_V0 and the address of the block in JIT_V1. */
+               addr4 = jit_indirect();
+
+               sync_next_pc(_jit);
+               update_cycle_counter_before_c(_jit);
+
+               jit_prepare();
+               jit_pushargr(LIGHTREC_REG_STATE);
+               jit_pushargr(JIT_V1);
+               jit_pushargr(JIT_V0);
+               jit_finishi(lightrec_emulate_block);
+
+               jit_retval(JIT_V0);
+
+               update_cycle_counter_after_c(_jit);
+
+               if (OPT_HANDLE_LOAD_DELAYS)
+                       jmp2 = jit_b();
+
+       }
+
+       if (OPT_HANDLE_LOAD_DELAYS) {
+               /* Blocks will jump here when they reach a branch with a load
+                * opcode in its delay slot. The delay slot has already been
+                * executed; the load value is in (state->temp_reg), and the
+                * register number is in JIT_V1.
+                * Jump to a C function which will evaluate the branch target's
+                * first opcode, to make sure that it does not read the register
+                * in question; and if it does, handle it accordingly. */
+               addr5 = jit_indirect();
+
+               sync_next_pc(_jit);
+               update_cycle_counter_before_c(_jit);
+
+               jit_prepare();
+               jit_pushargr(LIGHTREC_REG_STATE);
+               jit_pushargr(JIT_V0);
+               jit_pushargr(JIT_V1);
+               jit_finishi(lightrec_check_load_delay);
+
+               jit_retval(JIT_V0);
+
+               update_cycle_counter_after_c(_jit);
        }
 
        /* The block will jump here, with the number of cycles remaining in
         * LIGHTREC_REG_CYCLE */
        addr2 = jit_indirect();
 
-       /* Store back the next_pc to the lightrec_state structure */
-       offset = offsetof(struct lightrec_state, next_pc);
+       sync_next_pc(_jit);
+
+       if (OPT_HANDLE_LOAD_DELAYS && OPT_DETECT_IMPOSSIBLE_BRANCHES)
+             jit_patch(jmp2);
+
+       if (OPT_REPLACE_MEMSET
+           && (OPT_DETECT_IMPOSSIBLE_BRANCHES || OPT_HANDLE_LOAD_DELAYS)) {
+               jit_patch(jmp);
+       }
+
+       /* Store back the next PC to the lightrec_state structure */
+       offset = offsetof(struct lightrec_state, curr_pc);
        jit_stxi_i(offset, LIGHTREC_REG_STATE, JIT_V0);
 
        /* Jump to end if state->target_cycle < state->current_cycle */
@@ -1064,7 +1250,7 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
        /* If possible, use the code LUT */
        if (!lut_is_32bit(state))
                jit_lshi(JIT_V1, JIT_V1, 1);
-       jit_addr(JIT_V1, JIT_V1, LIGHTREC_REG_STATE);
+       jit_add_state(JIT_V1, JIT_V1);
 
        offset = offsetof(struct lightrec_state, code_lut);
        if (lut_is_32bit(state))
@@ -1084,11 +1270,7 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
 
        if (ENABLE_FIRST_PASS || OPT_DETECT_IMPOSSIBLE_BRANCHES) {
                /* We may call the interpreter - update state->current_cycle */
-               jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE,
-                          offsetof(struct lightrec_state, target_cycle));
-               jit_subr(JIT_V1, JIT_R2, LIGHTREC_REG_CYCLE);
-               jit_stxi_i(offsetof(struct lightrec_state, current_cycle),
-                          LIGHTREC_REG_STATE, JIT_V1);
+               update_cycle_counter_before_c(_jit);
        }
 
        jit_prepare();
@@ -1106,15 +1288,15 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
        if (ENABLE_FIRST_PASS || OPT_DETECT_IMPOSSIBLE_BRANCHES) {
                /* The interpreter may have updated state->current_cycle and
                 * state->target_cycle - recalc the delta */
-               jit_ldxi_i(JIT_R1, LIGHTREC_REG_STATE,
-                          offsetof(struct lightrec_state, current_cycle));
-               jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE,
-                          offsetof(struct lightrec_state, target_cycle));
-               jit_subr(LIGHTREC_REG_CYCLE, JIT_R2, JIT_R1);
+               update_cycle_counter_after_c(_jit);
        } else {
                jit_movr(LIGHTREC_REG_CYCLE, JIT_V0);
        }
 
+       /* Reset JIT_V0 to the next PC */
+       jit_ldxi_ui(JIT_V0, LIGHTREC_REG_STATE,
+                   offsetof(struct lightrec_state, curr_pc));
+
        /* If we get non-NULL, loop */
        jit_patch_at(jit_bnei(JIT_V1, 0), loop);
 
@@ -1136,6 +1318,10 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
                goto err_free_block;
 
        state->eob_wrapper_func = jit_address(addr2);
+       if (OPT_DETECT_IMPOSSIBLE_BRANCHES)
+               state->interpreter_func = jit_address(addr4);
+       if (OPT_HANDLE_LOAD_DELAYS)
+               state->ds_check_func = jit_address(addr5);
        if (OPT_REPLACE_MEMSET)
                state->memset_func = jit_address(addr3);
        state->get_next_block = jit_address(addr);
@@ -1166,9 +1352,10 @@ union code lightrec_read_opcode(struct lightrec_state *state, u32 pc)
        return (union code) LE32TOH(*code);
 }
 
-unsigned int lightrec_cycles_of_opcode(union code code)
+unsigned int lightrec_cycles_of_opcode(const struct lightrec_state *state,
+                                      union code code)
 {
-       return 2;
+       return state->cycles_per_op;
 }
 
 void lightrec_free_opcode_list(struct lightrec_state *state, struct opcode *ops)
@@ -1274,11 +1461,6 @@ static struct block * lightrec_precompile_block(struct lightrec_state *state,
 
        pr_debug("Block size: %hu opcodes\n", block->nb_ops);
 
-       /* If the first opcode is an 'impossible' branch, never compile the
-        * block */
-       if (should_emulate(block->opcode_list))
-               block_flags |= BLOCK_NEVER_COMPILE;
-
        fully_tagged = lightrec_block_is_fully_tagged(block);
        if (fully_tagged)
                block_flags |= BLOCK_FULLY_TAGGED;
@@ -1294,7 +1476,7 @@ static struct block * lightrec_precompile_block(struct lightrec_state *state,
                addr = state->get_next_block;
        lut_write(state, lut_offset(pc), addr);
 
-       pr_debug("Recompile count: %u\n", state->nb_precompile++);
+       pr_debug("Blocks created: %u\n", ++state->nb_precompile);
 
        return block;
 }
@@ -1307,8 +1489,12 @@ static bool lightrec_block_is_fully_tagged(const struct block *block)
        for (i = 0; i < block->nb_ops; i++) {
                op = &block->opcode_list[i];
 
-               /* Verify that all load/stores of the opcode list
-                * Check all loads/stores of the opcode list and mark the
+               /* If we have one branch that must be emulated, we cannot trash
+                * the opcode list. */
+               if (should_emulate(op))
+                       return false;
+
+               /* Check all loads/stores of the opcode list and mark the
                 * block as fully compiled if they all have been tagged. */
                switch (op->c.i.op) {
                case OP_LB:
@@ -1325,6 +1511,8 @@ static bool lightrec_block_is_fully_tagged(const struct block *block)
                case OP_SWR:
                case OP_LWC2:
                case OP_SWC2:
+               case OP_META_LWU:
+               case OP_META_SWU:
                        if (!LIGHTREC_FLAGS_GET_IO_MODE(op->flags))
                                return false;
                        fallthrough;
@@ -1340,7 +1528,7 @@ static void lightrec_reap_block(struct lightrec_state *state, void *data)
 {
        struct block *block = data;
 
-       pr_debug("Reap dead block at PC 0x%08x\n", block->pc);
+       pr_debug("Reap dead block at "PC_FMT"\n", block->pc);
        lightrec_unregister_block(state->block_cache, block);
        lightrec_free_block(state, block);
 }
@@ -1399,9 +1587,14 @@ int lightrec_compile_block(struct lightrec_cstate *cstate,
        block->_jit = _jit;
 
        lightrec_regcache_reset(cstate->reg_cache);
+
+       if (OPT_PRELOAD_PC && (block->flags & BLOCK_PRELOAD_PC))
+               lightrec_preload_pc(cstate->reg_cache, _jit);
+
        cstate->cycles = 0;
        cstate->nb_local_branches = 0;
        cstate->nb_targets = 0;
+       cstate->no_load_delay = false;
 
        jit_prolog();
        jit_tramp(256);
@@ -1420,7 +1613,7 @@ int lightrec_compile_block(struct lightrec_cstate *cstate,
                        pr_debug("Branch at offset 0x%x will be emulated\n",
                                 i << 2);
 
-                       lightrec_emit_eob(cstate, block, i);
+                       lightrec_emit_jump_to_interpreter(cstate, block, i);
                        skip_next = !op_flag_no_ds(elm->flags);
                } else {
                        lightrec_rec_opcode(cstate, block, i);
@@ -1434,7 +1627,7 @@ int lightrec_compile_block(struct lightrec_cstate *cstate,
 #endif
                }
 
-               cstate->cycles += lightrec_cycles_of_opcode(elm->c);
+               cstate->cycles += lightrec_cycles_of_opcode(state, elm->c);
        }
 
        for (i = 0; i < cstate->nb_local_branches; i++) {
@@ -1555,7 +1748,7 @@ int lightrec_compile_block(struct lightrec_cstate *cstate,
                old_flags = block_set_flags(block, BLOCK_NO_OPCODE_LIST);
 
        if (fully_tagged && !(old_flags & BLOCK_NO_OPCODE_LIST)) {
-               pr_debug("Block PC 0x%08x is fully tagged"
+               pr_debug("Block "PC_FMT" is fully tagged"
                         " - free opcode list\n", block->pc);
 
                if (ENABLE_THREADED_COMPILER) {
@@ -1584,6 +1777,8 @@ int lightrec_compile_block(struct lightrec_cstate *cstate,
                lightrec_unregister(MEM_FOR_CODE, old_code_size);
        }
 
+       pr_debug("Blocks compiled: %u\n", ++state->nb_compile);
+
        return 0;
 }
 
@@ -1603,7 +1798,7 @@ static void lightrec_print_info(struct lightrec_state *state)
 
 u32 lightrec_execute(struct lightrec_state *state, u32 pc, u32 target_cycle)
 {
-       s32 (*func)(void *, s32) = (void *)state->dispatcher->function;
+       s32 (*func)(struct lightrec_state *, u32, void *, s32) = (void *)state->dispatcher->function;
        void *block_trace;
        s32 cycles_delta;
 
@@ -1614,13 +1809,14 @@ u32 lightrec_execute(struct lightrec_state *state, u32 pc, u32 target_cycle)
                target_cycle = UINT_MAX;
 
        state->target_cycle = target_cycle;
-       state->next_pc = pc;
+       state->curr_pc = pc;
 
        block_trace = get_next_block_func(state, pc);
        if (block_trace) {
                cycles_delta = state->target_cycle - state->current_cycle;
 
-               cycles_delta = (*func)(block_trace, cycles_delta);
+               cycles_delta = (*func)(state, state->curr_pc,
+                                      block_trace, cycles_delta);
 
                state->current_cycle = state->target_cycle - cycles_delta;
        }
@@ -1631,7 +1827,7 @@ u32 lightrec_execute(struct lightrec_state *state, u32 pc, u32 target_cycle)
        if (LOG_LEVEL >= INFO_L)
                lightrec_print_info(state);
 
-       return state->next_pc;
+       return state->curr_pc;
 }
 
 u32 lightrec_run_interpreter(struct lightrec_state *state, u32 pc,
@@ -1745,7 +1941,7 @@ struct lightrec_state * lightrec_init(char *argv0,
        else
                lut_size = CODE_LUT_SIZE * sizeof(void *);
 
-       init_jit(argv0);
+       init_jit_with_debug(argv0, stdout);
 
        state = calloc(1, sizeof(*state) + lut_size);
        if (!state)
@@ -1755,6 +1951,8 @@ struct lightrec_state * lightrec_init(char *argv0,
 
        state->tlsf = tlsf;
        state->with_32bit_lut = with_32bit_lut;
+       state->in_delay_slot_n = 0xff;
+       state->cycles_per_op = 2;
 
        state->block_cache = lightrec_blockcache_init(state);
        if (!state->block_cache)
@@ -1901,12 +2099,12 @@ void lightrec_invalidate_all(struct lightrec_state *state)
        memset(state->code_lut, 0, lut_elm_size(state) * CODE_LUT_SIZE);
 }
 
-void lightrec_set_invalidate_mode(struct lightrec_state *state, bool dma_only)
+void lightrec_set_unsafe_opt_flags(struct lightrec_state *state, u32 flags)
 {
-       if (state->invalidate_from_dma_only != dma_only)
+       if ((flags ^ state->opt_flags) & LIGHTREC_OPT_INV_DMA_ONLY)
                lightrec_invalidate_all(state);
 
-       state->invalidate_from_dma_only = dma_only;
+       state->opt_flags = flags;
 }
 
 void lightrec_set_exit_flags(struct lightrec_state *state, u32 flags)
@@ -1949,3 +2147,8 @@ struct lightrec_registers * lightrec_get_registers(struct lightrec_state *state)
 {
        return &state->regs;
 }
+
+void lightrec_set_cycles_per_opcode(struct lightrec_state *state, u32 cycles)
+{
+       state->cycles_per_op = cycles; /* NOTE(review): compiled blocks keep old count? */
+}