git subrepo pull --force deps/lightrec
[pcsx_rearmed.git] / deps / lightrec / lightrec.c
index d172a30..be4da10 100644 (file)
 #include <stdbool.h>
 #include <stddef.h>
 #include <string.h>
-#if ENABLE_TINYMM
-#include <tinymm.h>
-#endif
-
-#define GENMASK(h, l) \
-       (((uintptr_t)-1 << (l)) & ((uintptr_t)-1 >> (__WORDSIZE - 1 - (h))))
 
 static struct block * lightrec_precompile_block(struct lightrec_state *state,
                                                u32 pc);
@@ -110,7 +104,7 @@ static void lightrec_swl(struct lightrec_state *state,
                         u32 opcode, void *host, u32 addr, u32 data)
 {
        unsigned int shift = addr & 0x3;
-       unsigned int mask = GENMASK(31, (shift + 1) * 8);
+       unsigned int mask = shift < 3 ? GENMASK(31, (shift + 1) * 8) : 0;
        u32 old_data;
 
        /* Align to 32 bits */
@@ -174,7 +168,7 @@ static u32 lightrec_lwr(struct lightrec_state *state,
                        u32 opcode, void *host, u32 addr, u32 data)
 {
        unsigned int shift = addr & 0x3;
-       unsigned int mask = GENMASK(31, 32 - shift * 8);
+       unsigned int mask = shift ? GENMASK(31, 32 - shift * 8) : 0;
        u32 old_data;
 
        /* Align to 32 bits */
@@ -244,7 +238,7 @@ lightrec_get_map(struct lightrec_state *state, void **host, u32 kaddr)
 }
 
 u32 lightrec_rw(struct lightrec_state *state, union code op,
-               u32 addr, u32 data, u16 *flags, struct block *block)
+               u32 addr, u32 data, u32 *flags, struct block *block)
 {
        const struct lightrec_mem_map *map;
        const struct lightrec_mem_map_ops *ops;
@@ -259,16 +253,20 @@ u32 lightrec_rw(struct lightrec_state *state, union code op,
                return 0;
        }
 
-       if (unlikely(map->ops)) {
-               if (flags && !LIGHTREC_FLAGS_GET_IO_MODE(*flags))
-                       *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_HW);
 
-               ops = map->ops;
-       } else {
+       if (likely(!map->ops)) {
                if (flags && !LIGHTREC_FLAGS_GET_IO_MODE(*flags))
                        *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_DIRECT);
 
                ops = &lightrec_default_ops;
+       } else if (flags &&
+                  LIGHTREC_FLAGS_GET_IO_MODE(*flags) == LIGHTREC_IO_DIRECT_HW) {
+               ops = &lightrec_default_ops;
+       } else {
+               if (flags && !LIGHTREC_FLAGS_GET_IO_MODE(*flags))
+                       *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_HW);
+
+               ops = map->ops;
        }
 
        switch (op.i.op) {
@@ -312,7 +310,7 @@ u32 lightrec_rw(struct lightrec_state *state, union code op,
 }
 
 static void lightrec_rw_helper(struct lightrec_state *state,
-                              union code op, u16 *flags,
+                              union code op, u32 *flags,
                               struct block *block)
 {
        u32 ret = lightrec_rw(state, op, state->regs.gpr[op.i.rs],
@@ -328,29 +326,31 @@ static void lightrec_rw_helper(struct lightrec_state *state,
        case OP_LW:
                if (op.i.rt)
                        state->regs.gpr[op.i.rt] = ret;
-       default: /* fall-through */
+               fallthrough;
+       default:
                break;
        }
 }
 
-static void lightrec_rw_cb(struct lightrec_state *state)
+static void lightrec_rw_cb(struct lightrec_state *state, u32 arg) /* 'arg' replaces the former state->c_wrapper_arg read */
 {
-       lightrec_rw_helper(state, (union code)state->c_wrapper_arg, NULL, NULL);
+       lightrec_rw_helper(state, (union code) arg, NULL, NULL); /* 'arg' reinterpreted as the opcode; no flags pointer, no block */
 }
 
-static void lightrec_rw_generic_cb(struct lightrec_state *state)
+static void lightrec_rw_generic_cb(struct lightrec_state *state, u32 arg)
 {
        struct block *block;
        struct opcode *op;
        bool was_tagged;
-       u32 arg = state->c_wrapper_arg;
        u16 offset = (u16)arg;
+       u16 old_flags;
 
        block = lightrec_find_block_from_lut(state->block_cache,
                                             arg >> 16, state->next_pc);
        if (unlikely(!block)) {
                pr_err("rw_generic: No block found in LUT for PC 0x%x offset 0x%x\n",
                         state->next_pc, offset);
+               lightrec_set_exit_flags(state, LIGHTREC_EXIT_SEGFAULT);
                return;
        }
 
@@ -360,10 +360,14 @@ static void lightrec_rw_generic_cb(struct lightrec_state *state)
        lightrec_rw_helper(state, op->c, &op->flags, block);
 
        if (!was_tagged) {
-               pr_debug("Opcode of block at PC 0x%08x has been tagged - flag "
-                        "for recompilation\n", block->pc);
+               old_flags = block_set_flags(block, BLOCK_SHOULD_RECOMPILE);
+
+               if (!(old_flags & BLOCK_SHOULD_RECOMPILE)) {
+                       pr_debug("Opcode of block at PC 0x%08x has been tagged"
+                                " - flag for recompilation\n", block->pc);
 
-               block->flags |= BLOCK_SHOULD_RECOMPILE;
+                       lut_write(state, lut_offset(block->pc), NULL);
+               }
        }
 }
 
@@ -372,6 +376,16 @@ static u32 clamp_s32(s32 val, s32 min, s32 max)
        return val < min ? min : val > max ? max : val;
 }
 
+static u16 load_u16(u32 *ptr) /* Read the 'l' member of the u32 at 'ptr', viewed as a struct u16x2 */
+{
+       return ((struct u16x2 *) ptr)->l; /* NOTE(review): 'l' is presumably the low half-word - confirm in the struct u16x2 definition */
+}
+
+static void store_u16(u32 *ptr, u16 value) /* Write 'value' to the 'l' member of the u32 at 'ptr', viewed as a struct u16x2 */
+{
+       ((struct u16x2 *) ptr)->l = value; /* only the 'l' member is assigned by this statement */
+}
+
 static u32 lightrec_mfc2(struct lightrec_state *state, u8 reg)
 {
        s16 gteir1, gteir2, gteir3;
@@ -384,37 +398,68 @@ static u32 lightrec_mfc2(struct lightrec_state *state, u8 reg)
        case 9:
        case 10:
        case 11:
-               return (s32)(s16) state->regs.cp2d[reg];
+               return (s32)(s16) load_u16(&state->regs.cp2d[reg]);
        case 7:
        case 16:
        case 17:
        case 18:
        case 19:
-               return (u16) state->regs.cp2d[reg];
+               return load_u16(&state->regs.cp2d[reg]);
        case 28:
        case 29:
-               gteir1 = (s16) state->regs.cp2d[9];
-               gteir2 = (s16) state->regs.cp2d[10];
-               gteir3 = (s16) state->regs.cp2d[11];
+               gteir1 = (s16) load_u16(&state->regs.cp2d[9]);
+               gteir2 = (s16) load_u16(&state->regs.cp2d[10]);
+               gteir3 = (s16) load_u16(&state->regs.cp2d[11]);
 
                return clamp_s32(gteir1 >> 7, 0, 0x1f) << 0 |
                        clamp_s32(gteir2 >> 7, 0, 0x1f) << 5 |
                        clamp_s32(gteir3 >> 7, 0, 0x1f) << 10;
        case 15:
                reg = 14;
-       default: /* fall-through */
+               fallthrough;
+       default:
                return state->regs.cp2d[reg];
        }
 }
 
 u32 lightrec_mfc(struct lightrec_state *state, union code op)
 {
+       u32 val; /* value to return and to hand to the cop2_notify hook */
+
        if (op.i.op == OP_CP0)
                return state->regs.cp0[op.r.rd];
        else if (op.r.rs == OP_CP2_BASIC_MFC2)
-               return lightrec_mfc2(state, op.r.rd);
-       else
-               return state->regs.cp2c[op.r.rd];
+               val = lightrec_mfc2(state, op.r.rd);
+       else { /* neither CP0 nor MFC2: read from the cp2c register file */
+               val = state->regs.cp2c[op.r.rd];
+
+               switch (op.r.rd) {
+               case 4:
+               case 12:
+               case 20:
+               case 26:
+               case 27:
+               case 29:
+               case 30:
+                       val = (u32)(s16)val; /* keep the low 16 bits, sign-extended to 32 */
+                       fallthrough;
+               default:
+                       break;
+               }
+       }
+
+       if (state->ops.cop2_notify) /* optional hook; CP0 reads returned above and never reach it */
+               (*state->ops.cop2_notify)(state, op.opcode, val);
+
+       return val;
+}
+
+static void lightrec_mfc_cb(struct lightrec_state *state, union code op) /* Run lightrec_mfc() and store its result in GPR op.r.rt */
+{
+       u32 rt = lightrec_mfc(state, op); /* despite its name, 'rt' holds the value read, not a register index */
+
+       if (op.r.rt) /* register index 0 is never written - presumably the hardwired zero register */
+               state->regs.gpr[op.r.rt] = rt;
 }
 
 static void lightrec_mtc0(struct lightrec_state *state, u8 reg, u32 data)
@@ -503,7 +548,8 @@ static void lightrec_mtc2(struct lightrec_state *state, u8 reg, u32 data)
                return;
        case 30:
                state->regs.cp2d[31] = count_leading_bits((s32) data);
-       default: /* fall-through */
+               fallthrough;
+       default:
                state->regs.cp2d[reg] = data;
                break;
        }
@@ -519,30 +565,35 @@ static void lightrec_ctc2(struct lightrec_state *state, u8 reg, u32 data)
        case 27:
        case 29:
        case 30:
-               data = (s32)(s16) data;
+               store_u16(&state->regs.cp2c[reg], data);
                break;
        case 31:
                data = (data & 0x7ffff000) | !!(data & 0x7f87e000) << 31;
-       default: /* fall-through */
+               fallthrough;
+       default:
+               state->regs.cp2c[reg] = data;
                break;
        }
-
-       state->regs.cp2c[reg] = data;
 }
 
 void lightrec_mtc(struct lightrec_state *state, union code op, u32 data)
 {
-       if (op.i.op == OP_CP0)
+       if (op.i.op == OP_CP0) { /* CP0 writes do not trigger the cop2_notify hook */
                lightrec_mtc0(state, op.r.rd, data);
-       else if (op.r.rs == OP_CP2_BASIC_CTC2)
-               lightrec_ctc2(state, op.r.rd, data);
-       else
-               lightrec_mtc2(state, op.r.rd, data);
+       } else { /* not CP0: dispatch to the CTC2 or MTC2 handler, then notify */
+               if (op.r.rs == OP_CP2_BASIC_CTC2)
+                       lightrec_ctc2(state, op.r.rd, data);
+               else
+                       lightrec_mtc2(state, op.r.rd, data);
+
+               if (state->ops.cop2_notify) /* optional hook; gets op.opcode and the caller-supplied 'data' */
+                       (*state->ops.cop2_notify)(state, op.opcode, data);
+       }
 }
 
-static void lightrec_mtc_cb(struct lightrec_state *state)
+static void lightrec_mtc_cb(struct lightrec_state *state, u32 arg) /* 'arg' replaces the former state->c_wrapper_arg read */
 {
-       union code op = (union code) state->c_wrapper_arg;
+       union code op = (union code) arg; /* reinterpret the 32-bit argument as an opcode */
 
        lightrec_mtc(state, op, state->regs.gpr[op.r.rt]);
 }
@@ -571,36 +622,31 @@ void lightrec_cp(struct lightrec_state *state, union code op)
        (*state->ops.cop2_op)(state, op.opcode);
 }
 
-static void lightrec_cp_cb(struct lightrec_state *state)
-{
-       lightrec_cp(state, (union code) state->c_wrapper_arg);
-}
-
-static void lightrec_syscall_cb(struct lightrec_state *state)
-{
-       lightrec_set_exit_flags(state, LIGHTREC_EXIT_SYSCALL);
-}
-
-static void lightrec_break_cb(struct lightrec_state *state)
+static void lightrec_cp_cb(struct lightrec_state *state, u32 arg) /* 'arg' replaces the former state->c_wrapper_arg read */
 {
-       lightrec_set_exit_flags(state, LIGHTREC_EXIT_BREAK);
+       lightrec_cp(state, (union code) arg); /* 'arg' reinterpreted as the opcode passed to lightrec_cp() */
 }
 
-struct block * lightrec_get_block(struct lightrec_state *state, u32 pc)
+static struct block * lightrec_get_block(struct lightrec_state *state, u32 pc)
 {
        struct block *block = lightrec_find_block(state->block_cache, pc);
+       u8 old_flags;
 
        if (block && lightrec_block_is_outdated(state, block)) {
                pr_debug("Block at PC 0x%08x is outdated!\n", block->pc);
 
-               /* Make sure the recompiler isn't processing the block we'll
-                * destroy */
-               if (ENABLE_THREADED_COMPILER)
-                       lightrec_recompiler_remove(state->rec, block);
+               old_flags = block_set_flags(block, BLOCK_IS_DEAD);
+               if (!(old_flags & BLOCK_IS_DEAD)) {
+                       /* Make sure the recompiler isn't processing the block
+                        * we'll destroy */
+                       if (ENABLE_THREADED_COMPILER)
+                               lightrec_recompiler_remove(state->rec, block);
+
+                       lightrec_unregister_block(state->block_cache, block);
+                       remove_from_code_lut(state->block_cache, block);
+                       lightrec_free_block(state, block);
+               }
 
-               lightrec_unregister_block(state->block_cache, block);
-               remove_from_code_lut(state->block_cache, block);
-               lightrec_free_block(state, block);
                block = NULL;
        }
 
@@ -623,9 +669,10 @@ static void * get_next_block_func(struct lightrec_state *state, u32 pc)
        struct block *block;
        bool should_recompile;
        void *func;
+       int err;
 
        for (;;) {
-               func = lut_read(state, pc);
+               func = lut_read(state, lut_offset(pc));
                if (func && func != state->get_next_block)
                        break;
 
@@ -634,23 +681,27 @@ static void * get_next_block_func(struct lightrec_state *state, u32 pc)
                if (unlikely(!block))
                        break;
 
-               if (OPT_REPLACE_MEMSET && (block->flags & BLOCK_IS_MEMSET)) {
+               if (OPT_REPLACE_MEMSET &&
+                   block_has_flag(block, BLOCK_IS_MEMSET)) {
                        func = state->memset_func;
                        break;
                }
 
-               should_recompile = block->flags & BLOCK_SHOULD_RECOMPILE &&
-                       !(block->flags & BLOCK_IS_DEAD);
+               should_recompile = block_has_flag(block, BLOCK_SHOULD_RECOMPILE) &&
+                       !block_has_flag(block, BLOCK_IS_DEAD);
 
                if (unlikely(should_recompile)) {
                        pr_debug("Block at PC 0x%08x should recompile\n", pc);
 
-                       lightrec_unregister(MEM_FOR_CODE, block->code_size);
-
-                       if (ENABLE_THREADED_COMPILER)
+                       if (ENABLE_THREADED_COMPILER) {
                                lightrec_recompiler_add(state->rec, block);
-                       else
-                               lightrec_compile_block(state->cstate, block);
+                       } else {
+                               err = lightrec_compile_block(state->cstate, block);
+                               if (err) {
+                                       state->exit_flags = LIGHTREC_EXIT_NOMEM;
+                                       return NULL;
+                               }
+                       }
                }
 
                if (ENABLE_THREADED_COMPILER && likely(!should_recompile))
@@ -661,18 +712,31 @@ static void * get_next_block_func(struct lightrec_state *state, u32 pc)
                if (likely(func))
                        break;
 
-               if (unlikely(block->flags & BLOCK_NEVER_COMPILE)) {
+               if (unlikely(block_has_flag(block, BLOCK_NEVER_COMPILE))) {
                        pc = lightrec_emulate_block(state, block, pc);
 
                } else if (!ENABLE_THREADED_COMPILER) {
                        /* Block wasn't compiled yet - run the interpreter */
-                       if (block->flags & BLOCK_FULLY_TAGGED)
+                       if (block_has_flag(block, BLOCK_FULLY_TAGGED))
                                pr_debug("Block fully tagged, skipping first pass\n");
                        else if (ENABLE_FIRST_PASS && likely(!should_recompile))
                                pc = lightrec_emulate_block(state, block, pc);
 
                        /* Then compile it using the profiled data */
-                       lightrec_compile_block(state->cstate, block);
+                       err = lightrec_compile_block(state->cstate, block);
+                       if (err) {
+                               state->exit_flags = LIGHTREC_EXIT_NOMEM;
+                               return NULL;
+                       }
+               } else if (unlikely(block_has_flag(block, BLOCK_IS_DEAD))) {
+                       /*
+                        * If the block is dead but has never been compiled,
+                        * then its function pointer is NULL and we cannot
+                        * execute the block. In that case, reap all the dead
+                        * blocks now, and in the next loop we will create a
+                        * new block.
+                        */
+                       lightrec_reaper_reap(state->reaper);
                } else {
                        lightrec_recompiler_add(state->rec, block);
                }
@@ -686,17 +750,49 @@ static void * get_next_block_func(struct lightrec_state *state, u32 pc)
        return func;
 }
 
-static s32 c_function_wrapper(struct lightrec_state *state, s32 cycles_delta,
-                             void (*f)(struct lightrec_state *))
+static void * lightrec_alloc_code(struct lightrec_state *state, size_t size) /* Allocate 'size' bytes from state->tlsf; returns tlsf_malloc()'s result */
 {
-       state->current_cycle = state->target_cycle - cycles_delta;
+       void *code;
+
+       if (ENABLE_THREADED_COMPILER) /* the alloc lock is only taken when the threaded compiler is enabled */
+               lightrec_code_alloc_lock(state);
 
-       (*f)(state);
+       code = tlsf_malloc(state->tlsf, size);
 
-       return state->target_cycle - state->current_cycle;
+       if (ENABLE_THREADED_COMPILER)
+               lightrec_code_alloc_unlock(state);
+
+       return code; /* the caller in lightrec_emit_code() tests this for NULL */
+}
+
+static void lightrec_realloc_code(struct lightrec_state *state,
+                                 void *ptr, size_t size) /* Resize the state->tlsf allocation at 'ptr' to 'size' bytes */
+{
+       /* NOTE: 'size' MUST be smaller than the size specified during
+        * the allocation. */
+
+       if (ENABLE_THREADED_COMPILER) /* the alloc lock is only taken when the threaded compiler is enabled */
+               lightrec_code_alloc_lock(state);
+
+       tlsf_realloc(state->tlsf, ptr, size); /* result is discarded - presumably safe because the block only shrinks (see NOTE) */
+
+       if (ENABLE_THREADED_COMPILER)
+               lightrec_code_alloc_unlock(state);
+}
+
+static void lightrec_free_code(struct lightrec_state *state, void *ptr) /* Return 'ptr' to the state->tlsf pool */
+{
+       if (ENABLE_THREADED_COMPILER) /* the alloc lock is only taken when the threaded compiler is enabled */
+               lightrec_code_alloc_lock(state);
+
+       tlsf_free(state->tlsf, ptr);
+
+       if (ENABLE_THREADED_COMPILER)
+               lightrec_code_alloc_unlock(state);
 }
 
 static void * lightrec_emit_code(struct lightrec_state *state,
+                                const struct block *block,
                                 jit_state_t *_jit, unsigned int *size)
 {
        bool has_code_buffer = ENABLE_CODE_BUFFER && state->tlsf;
@@ -710,9 +806,28 @@ static void * lightrec_emit_code(struct lightrec_state *state,
 
        if (has_code_buffer) {
                jit_get_code(&code_size);
-               code = tlsf_malloc(state->tlsf, (size_t) code_size);
-               if (!code)
-                       return NULL;
+               code = lightrec_alloc_code(state, (size_t) code_size);
+
+               if (!code) {
+                       if (ENABLE_THREADED_COMPILER) {
+                               /* If we're using the threaded compiler, return
+                                * an allocation error here. The threaded
+                                * compiler will then empty its job queue and
+                                * request a code flush using the reaper. */
+                               return NULL;
+                       }
+
+                       /* Remove outdated blocks, and try again */
+                       lightrec_remove_outdated_blocks(state->block_cache, block);
+
+                       pr_debug("Re-try to alloc %zu bytes...\n", code_size);
+
+                       code = lightrec_alloc_code(state, code_size);
+                       if (!code) {
+                               pr_err("Could not alloc even after removing old blocks!\n");
+                               return NULL;
+                       }
+               }
 
                jit_set_code(code, code_size);
        }
@@ -723,7 +838,7 @@ static void * lightrec_emit_code(struct lightrec_state *state,
        lightrec_register(MEM_FOR_CODE, new_code_size);
 
        if (has_code_buffer) {
-               tlsf_realloc(state->tlsf, code, new_code_size);
+               lightrec_realloc_code(state, code, (size_t) new_code_size);
 
                pr_debug("Creating code block at address 0x%" PRIxPTR ", "
                         "code size: %" PRIuPTR " new: %" PRIuPTR "\n",
@@ -740,9 +855,8 @@ static struct block * generate_wrapper(struct lightrec_state *state)
        struct block *block;
        jit_state_t *_jit;
        unsigned int i;
-       int stack_ptr;
-       jit_node_t *to_tramp, *to_fn_epilog;
        jit_node_t *addr[C_WRAPPERS_COUNT - 1];
+       jit_node_t *to_end[C_WRAPPERS_COUNT - 1];
 
        block = lightrec_malloc(state, MEM_FOR_IR, sizeof(*block));
        if (!block)
@@ -759,67 +873,72 @@ static struct block * generate_wrapper(struct lightrec_state *state)
        jit_prolog();
        jit_tramp(256);
 
-       /* Add entry points; separate them by opcodes that increment
-        * LIGHTREC_REG_STATE (since we cannot touch other registers).
-        * The difference will then tell us which C function to call. */
+       /* Add entry points */
        for (i = C_WRAPPERS_COUNT - 1; i > 0; i--) {
-               jit_addi(LIGHTREC_REG_STATE, LIGHTREC_REG_STATE, __WORDSIZE / 8);
+               jit_ldxi(JIT_R1, LIGHTREC_REG_STATE,
+                        offsetof(struct lightrec_state, c_wrappers[i]));
+               to_end[i - 1] = jit_b();
                addr[i - 1] = jit_indirect();
        }
 
+       jit_ldxi(JIT_R1, LIGHTREC_REG_STATE,
+                offsetof(struct lightrec_state, c_wrappers[0]));
+
+       for (i = 0; i < C_WRAPPERS_COUNT - 1; i++)
+               jit_patch(to_end[i]);
+
        jit_epilog();
        jit_prolog();
 
-       stack_ptr = jit_allocai(sizeof(uintptr_t) * NUM_TEMPS);
-
        /* Save all temporaries on stack */
-       for (i = 0; i < NUM_TEMPS; i++)
-               jit_stxi(stack_ptr + i * sizeof(uintptr_t), JIT_FP, JIT_R(i));
-
-       /* Jump to the trampoline */
-       to_tramp = jit_jmpi();
+       for (i = 0; i < NUM_TEMPS; i++) {
+               if (i + FIRST_TEMP != 1) {
+                       jit_stxi(offsetof(struct lightrec_state, wrapper_regs[i]),
+                                LIGHTREC_REG_STATE, JIT_R(i + FIRST_TEMP));
+               }
+       }
 
-       /* The trampoline will jump back here */
-       to_fn_epilog = jit_label();
+       jit_getarg(JIT_R2, jit_arg());
 
-       /* Restore temporaries from stack */
-       for (i = 0; i < NUM_TEMPS; i++)
-               jit_ldxi(JIT_R(i), JIT_FP, stack_ptr + i * sizeof(uintptr_t));
+       jit_prepare();
+       jit_pushargr(LIGHTREC_REG_STATE);
+       jit_pushargr(JIT_R2);
 
-       jit_ret();
-       jit_epilog();
+       jit_ldxi_ui(JIT_R2, LIGHTREC_REG_STATE,
+                   offsetof(struct lightrec_state, target_cycle));
 
-       /* Trampoline entry point.
-        * The sole purpose of the trampoline is to cheese Lightning not to
-        * save/restore the callee-saved register LIGHTREC_REG_CYCLE, since we
-        * do want to return to the caller with this register modified. */
-       jit_prolog();
-       jit_tramp(256);
-       jit_patch(to_tramp);
+       /* state->current_cycle = state->target_cycle - delta; */
+       jit_subr(LIGHTREC_REG_CYCLE, JIT_R2, LIGHTREC_REG_CYCLE);
+       jit_stxi_i(offsetof(struct lightrec_state, current_cycle),
+                  LIGHTREC_REG_STATE, LIGHTREC_REG_CYCLE);
 
-       /* Retrieve the wrapper function */
-       jit_ldxi(JIT_R0, LIGHTREC_REG_STATE,
-                offsetof(struct lightrec_state, c_wrappers));
+       /* Call the wrapper function */
+       jit_finishr(JIT_R1);
 
-       /* Restore LIGHTREC_REG_STATE to its correct value */
-       jit_movi(LIGHTREC_REG_STATE, (uintptr_t) state);
+       /* delta = state->target_cycle - state->current_cycle */;
+       jit_ldxi_ui(LIGHTREC_REG_CYCLE, LIGHTREC_REG_STATE,
+                   offsetof(struct lightrec_state, current_cycle));
+       jit_ldxi_ui(JIT_R1, LIGHTREC_REG_STATE,
+                   offsetof(struct lightrec_state, target_cycle));
+       jit_subr(LIGHTREC_REG_CYCLE, JIT_R1, LIGHTREC_REG_CYCLE);
 
-       jit_prepare();
-       jit_pushargr(LIGHTREC_REG_STATE);
-       jit_pushargr(LIGHTREC_REG_CYCLE);
-       jit_pushargr(JIT_R0);
-       jit_finishi(c_function_wrapper);
-       jit_retval_i(LIGHTREC_REG_CYCLE);
+       /* Restore temporaries from state->wrapper_regs */
+       for (i = 0; i < NUM_TEMPS; i++) {
+               if (i + FIRST_TEMP != 1) {
+                       jit_ldxi(JIT_R(i + FIRST_TEMP), LIGHTREC_REG_STATE,
+                                offsetof(struct lightrec_state, wrapper_regs[i]));
+               }
+       }
 
-       jit_patch_at(jit_jmpi(), to_fn_epilog);
+       jit_ret();
        jit_epilog();
 
        block->_jit = _jit;
        block->opcode_list = NULL;
-       block->flags = 0;
+       block->flags = BLOCK_NO_OPCODE_LIST;
        block->nb_ops = 0;
 
-       block->function = lightrec_emit_code(state, _jit,
+       block->function = lightrec_emit_code(state, block, _jit,
                                             &block->code_size);
        if (!block->function)
                goto err_free_block;
@@ -890,12 +1009,12 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
        jit_prolog();
        jit_frame(256);
 
-       jit_getarg(JIT_R0, jit_arg());
+       jit_getarg(JIT_V1, jit_arg());
        jit_getarg_i(LIGHTREC_REG_CYCLE, jit_arg());
 
        /* Force all callee-saved registers to be pushed on the stack */
        for (i = 0; i < NUM_REGS; i++)
-               jit_movr(JIT_V(i), JIT_V(i));
+               jit_movr(JIT_V(i + FIRST_REG), JIT_V(i + FIRST_REG));
 
        /* Pass lightrec_state structure to blocks, using the last callee-saved
         * register that Lightning provides */
@@ -904,13 +1023,15 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
        loop = jit_label();
 
        /* Call the block's code */
-       jit_jmpr(JIT_R0);
+       jit_jmpr(JIT_V1);
 
        if (OPT_REPLACE_MEMSET) {
                /* Blocks will jump here when they need to call
                 * lightrec_memset() */
                addr3 = jit_indirect();
 
+               jit_movr(JIT_V1, LIGHTREC_REG_CYCLE);
+
                jit_prepare();
                jit_pushargr(LIGHTREC_REG_STATE);
                jit_finishi(lightrec_memset);
@@ -918,8 +1039,8 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
                jit_ldxi_ui(JIT_V0, LIGHTREC_REG_STATE,
                            offsetof(struct lightrec_state, regs.gpr[31]));
 
-               jit_retval(JIT_R0);
-               jit_subr(LIGHTREC_REG_CYCLE, LIGHTREC_REG_CYCLE, JIT_R0);
+               jit_retval(LIGHTREC_REG_CYCLE);
+               jit_subr(LIGHTREC_REG_CYCLE, JIT_V1, LIGHTREC_REG_CYCLE);
        }
 
        /* The block will jump here, with the number of cycles remaining in
@@ -934,25 +1055,30 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
        to_end = jit_blei(LIGHTREC_REG_CYCLE, 0);
 
        /* Convert next PC to KUNSEG and avoid mirrors */
-       jit_andi(JIT_R0, JIT_V0, 0x10000000 | (RAM_SIZE - 1));
-       jit_rshi_u(JIT_R1, JIT_R0, 28);
+       jit_andi(JIT_V1, JIT_V0, 0x10000000 | (RAM_SIZE - 1));
+       jit_rshi_u(JIT_R1, JIT_V1, 28);
        jit_andi(JIT_R2, JIT_V0, BIOS_SIZE - 1);
        jit_addi(JIT_R2, JIT_R2, RAM_SIZE);
-       jit_movnr(JIT_R0, JIT_R2, JIT_R1);
+       jit_movnr(JIT_V1, JIT_R2, JIT_R1);
 
        /* If possible, use the code LUT */
        if (!lut_is_32bit(state))
-               jit_lshi(JIT_R0, JIT_R0, 1);
-       jit_addr(JIT_R0, JIT_R0, LIGHTREC_REG_STATE);
+               jit_lshi(JIT_V1, JIT_V1, 1);
+       jit_addr(JIT_V1, JIT_V1, LIGHTREC_REG_STATE);
 
        offset = offsetof(struct lightrec_state, code_lut);
        if (lut_is_32bit(state))
-               jit_ldxi_ui(JIT_R0, JIT_R0, offset);
+               jit_ldxi_ui(JIT_V1, JIT_V1, offset);
        else
-               jit_ldxi(JIT_R0, JIT_R0, offset);
+               jit_ldxi(JIT_V1, JIT_V1, offset);
 
        /* If we get non-NULL, loop */
-       jit_patch_at(jit_bnei(JIT_R0, 0), loop);
+       jit_patch_at(jit_bnei(JIT_V1, 0), loop);
+
+       /* The code LUT will be set to this address when the block at the target
+        * PC has been preprocessed but not yet compiled by the threaded
+        * recompiler */
+       addr = jit_indirect();
 
        /* Slow path: call C function get_next_block_func() */
 
@@ -960,22 +1086,22 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
                /* We may call the interpreter - update state->current_cycle */
                jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE,
                           offsetof(struct lightrec_state, target_cycle));
-               jit_subr(JIT_R1, JIT_R2, LIGHTREC_REG_CYCLE);
+               jit_subr(JIT_V1, JIT_R2, LIGHTREC_REG_CYCLE);
                jit_stxi_i(offsetof(struct lightrec_state, current_cycle),
-                          LIGHTREC_REG_STATE, JIT_R1);
+                          LIGHTREC_REG_STATE, JIT_V1);
        }
 
-       /* The code LUT will be set to this address when the block at the target
-        * PC has been preprocessed but not yet compiled by the threaded
-        * recompiler */
-       addr = jit_indirect();
-
-       /* Get the next block */
        jit_prepare();
        jit_pushargr(LIGHTREC_REG_STATE);
        jit_pushargr(JIT_V0);
+
+       /* Save the cycles register if needed */
+       if (!(ENABLE_FIRST_PASS || OPT_DETECT_IMPOSSIBLE_BRANCHES))
+               jit_movr(JIT_V0, LIGHTREC_REG_CYCLE);
+
+       /* Get the next block */
        jit_finishi(&get_next_block_func);
-       jit_retval(JIT_R0);
+       jit_retval(JIT_V1);
 
        if (ENABLE_FIRST_PASS || OPT_DETECT_IMPOSSIBLE_BRANCHES) {
                /* The interpreter may have updated state->current_cycle and
@@ -985,10 +1111,12 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
                jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE,
                           offsetof(struct lightrec_state, target_cycle));
                jit_subr(LIGHTREC_REG_CYCLE, JIT_R2, JIT_R1);
+       } else {
+               jit_movr(LIGHTREC_REG_CYCLE, JIT_V0);
        }
 
        /* If we get non-NULL, loop */
-       jit_patch_at(jit_bnei(JIT_R0, 0), loop);
+       jit_patch_at(jit_bnei(JIT_V1, 0), loop);
 
        /* When exiting, the recompiled code will jump to that address */
        jit_note(__FILE__, __LINE__);
@@ -999,10 +1127,10 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
 
        block->_jit = _jit;
        block->opcode_list = NULL;
-       block->flags = 0;
+       block->flags = BLOCK_NO_OPCODE_LIST;
        block->nb_ops = 0;
 
-       block->function = lightrec_emit_code(state, _jit,
+       block->function = lightrec_emit_code(state, block, _jit,
                                             &block->code_size);
        if (!block->function)
                goto err_free_block;
@@ -1043,11 +1171,13 @@ unsigned int lightrec_cycles_of_opcode(union code code)
        return 2;
 }
 
-void lightrec_free_opcode_list(struct lightrec_state *state, struct block *block)
+void lightrec_free_opcode_list(struct lightrec_state *state, struct opcode *ops) /* now takes the opcode array itself rather than the owning block */
 {
+       struct opcode_list *list = container_of(ops, struct opcode_list, ops); /* 'ops' must be the ops[] member of an opcode_list, as returned by lightrec_disassemble() */
+
        lightrec_free(state, MEM_FOR_IR,
-                     sizeof(*block->opcode_list) * block->nb_ops,
-                     block->opcode_list);
+                     sizeof(*list) + list->nb_ops * sizeof(struct opcode), /* header plus nb_ops entries, the same formula used at allocation */
+                     list);
 }
 
 static unsigned int lightrec_get_mips_block_len(const u32 *src)
@@ -1069,25 +1199,28 @@ static unsigned int lightrec_get_mips_block_len(const u32 *src)
 static struct opcode * lightrec_disassemble(struct lightrec_state *state,
                                             const u32 *src, unsigned int *len)
 {
-       struct opcode *list;
+       struct opcode_list *list; /* header allocated together with its trailing ops[] array */
        unsigned int i, length;
 
        length = lightrec_get_mips_block_len(src);
 
-       list = lightrec_malloc(state, MEM_FOR_IR, sizeof(*list) * length);
+       list = lightrec_malloc(state, MEM_FOR_IR,
+                              sizeof(*list) + sizeof(struct opcode) * length); /* one allocation: header plus 'length' opcodes */
        if (!list) {
                pr_err("Unable to allocate memory\n");
                return NULL;
        }
 
+       list->nb_ops = (u16) length; /* count kept in the header; lightrec_free_opcode_list() reads it to size the free */
+
        for (i = 0; i < length; i++) {
-               list[i].opcode = LE32TOH(src[i]);
-               list[i].flags = 0;
+               list->ops[i].opcode = LE32TOH(src[i]); /* LE32TOH: presumably little-endian to host byte order */
+               list->ops[i].flags = 0;
        }
 
        *len = length * sizeof(u32);
 
-       return list;
+       return list->ops; /* callers receive the array, not the enclosing opcode_list */
 }
 
 static struct block * lightrec_precompile_block(struct lightrec_state *state,
@@ -1095,11 +1228,12 @@ static struct block * lightrec_precompile_block(struct lightrec_state *state,
 {
        struct opcode *list;
        struct block *block;
-       void *host;
+       void *host, *addr;
        const struct lightrec_mem_map *map = lightrec_get_map(state, &host, kunseg(pc));
        const u32 *code = (u32 *) host;
        unsigned int length;
        bool fully_tagged;
+       u8 block_flags = 0;
 
        if (!map)
                return NULL;
@@ -1124,9 +1258,7 @@ static struct block * lightrec_precompile_block(struct lightrec_state *state,
        block->next = NULL;
        block->flags = 0;
        block->code_size = 0;
-#if ENABLE_THREADED_COMPILER
-       block->op_list_freed = (atomic_flag)ATOMIC_FLAG_INIT;
-#endif
+       block->precompile_date = state->current_cycle;
        block->nb_ops = length / sizeof(u32);
 
        lightrec_optimize(state, block);
@@ -1145,17 +1277,23 @@ static struct block * lightrec_precompile_block(struct lightrec_state *state,
        /* If the first opcode is an 'impossible' branch, never compile the
         * block */
        if (should_emulate(block->opcode_list))
-               block->flags |= BLOCK_NEVER_COMPILE;
+               block_flags |= BLOCK_NEVER_COMPILE;
 
        fully_tagged = lightrec_block_is_fully_tagged(block);
        if (fully_tagged)
-               block->flags |= BLOCK_FULLY_TAGGED;
+               block_flags |= BLOCK_FULLY_TAGGED;
 
-       if (OPT_REPLACE_MEMSET && (block->flags & BLOCK_IS_MEMSET))
-               lut_write(state, lut_offset(pc), state->memset_func);
+       if (block_flags)
+               block_set_flags(block, block_flags);
 
        block->hash = lightrec_calculate_block_hash(block);
 
+       if (OPT_REPLACE_MEMSET && block_has_flag(block, BLOCK_IS_MEMSET))
+               addr = state->memset_func;
+       else
+               addr = state->get_next_block;
+       lut_write(state, lut_offset(pc), addr);
+
        pr_debug("Recompile count: %u\n", state->nb_precompile++);
 
        return block;
@@ -1189,7 +1327,8 @@ static bool lightrec_block_is_fully_tagged(const struct block *block)
                case OP_SWC2:
                        if (!LIGHTREC_FLAGS_GET_IO_MODE(op->flags))
                                return false;
-               default: /* fall-through */
+                       fallthrough;
+               default:
                        continue;
                }
        }
@@ -1215,7 +1354,7 @@ static void lightrec_free_function(struct lightrec_state *state, void *fn)
 {
        if (ENABLE_CODE_BUFFER && state->tlsf) {
                pr_debug("Freeing code block at 0x%" PRIxPTR "\n", (uintptr_t) fn);
-               tlsf_free(state->tlsf, fn);
+               lightrec_free_code(state, fn);
        }
 }
 
@@ -1224,24 +1363,31 @@ static void lightrec_reap_function(struct lightrec_state *state, void *data)
        lightrec_free_function(state, data);
 }
 
+static void lightrec_reap_opcode_list(struct lightrec_state *state, void *data)
+{
+       lightrec_free_opcode_list(state, data);
+}
+
 int lightrec_compile_block(struct lightrec_cstate *cstate,
                           struct block *block)
 {
        struct lightrec_state *state = cstate->state;
        struct lightrec_branch_target *target;
-       bool op_list_freed = false, fully_tagged = false;
+       bool fully_tagged = false;
        struct block *block2;
        struct opcode *elm;
        jit_state_t *_jit, *oldjit;
        jit_node_t *start_of_block;
        bool skip_next = false;
-       void *old_fn;
+       void *old_fn, *new_fn;
+       size_t old_code_size;
        unsigned int i, j;
+       u8 old_flags;
        u32 offset;
 
        fully_tagged = lightrec_block_is_fully_tagged(block);
        if (fully_tagged)
-               block->flags |= BLOCK_FULLY_TAGGED;
+               block_set_flags(block, BLOCK_FULLY_TAGGED);
 
        _jit = jit_new_state();
        if (!_jit)
@@ -1249,11 +1395,11 @@ int lightrec_compile_block(struct lightrec_cstate *cstate,
 
        oldjit = block->_jit;
        old_fn = block->function;
+       old_code_size = block->code_size;
        block->_jit = _jit;
 
        lightrec_regcache_reset(cstate->reg_cache);
        cstate->cycles = 0;
-       cstate->nb_branches = 0;
        cstate->nb_local_branches = 0;
        cstate->nb_targets = 0;
 
@@ -1270,18 +1416,15 @@ int lightrec_compile_block(struct lightrec_cstate *cstate,
                        continue;
                }
 
-               cstate->cycles += lightrec_cycles_of_opcode(elm->c);
-
                if (should_emulate(elm)) {
                        pr_debug("Branch at offset 0x%x will be emulated\n",
                                 i << 2);
 
-                       lightrec_emit_eob(cstate, block, i, false);
-                       skip_next = !(elm->flags & LIGHTREC_NO_DS);
+                       lightrec_emit_eob(cstate, block, i);
+                       skip_next = !op_flag_no_ds(elm->flags);
                } else {
                        lightrec_rec_opcode(cstate, block, i);
-                       skip_next = has_delay_slot(elm->c) &&
-                               !(elm->flags & LIGHTREC_NO_DS);
+                       skip_next = !op_flag_no_ds(elm->flags) && has_delay_slot(elm->c);
 #if _WIN32
                        /* FIXME: GNU Lightning on Windows seems to use our
                         * mapped registers as temporaries. Until the actual bug
@@ -1290,10 +1433,9 @@ int lightrec_compile_block(struct lightrec_cstate *cstate,
                        lightrec_regcache_mark_live(cstate->reg_cache, _jit);
 #endif
                }
-       }
 
-       for (i = 0; i < cstate->nb_branches; i++)
-               jit_patch(cstate->branches[i]);
+               cstate->cycles += lightrec_cycles_of_opcode(elm->c);
+       }
 
        for (i = 0; i < cstate->nb_local_branches; i++) {
                struct lightrec_branch *branch = &cstate->local_branches[i];
@@ -1318,31 +1460,32 @@ int lightrec_compile_block(struct lightrec_cstate *cstate,
                        pr_err("Unable to find branch target\n");
        }
 
-       jit_ldxi(JIT_R0, LIGHTREC_REG_STATE,
-                offsetof(struct lightrec_state, eob_wrapper_func));
-
-       jit_jmpr(JIT_R0);
-
        jit_ret();
        jit_epilog();
 
-       block->function = lightrec_emit_code(state, _jit,
-                                            &block->code_size);
-       if (!block->function) {
-               pr_err("Unable to compile block!\n");
+       new_fn = lightrec_emit_code(state, block, _jit, &block->code_size);
+       if (!new_fn) {
+               if (!ENABLE_THREADED_COMPILER)
+                       pr_err("Unable to compile block!\n");
+               block->_jit = oldjit;
+               jit_clear_state();
+               _jit_destroy_state(_jit);
+               return -ENOMEM;
        }
 
-       block->flags &= ~BLOCK_SHOULD_RECOMPILE;
+       /* Pause the reaper, because lightrec_reset_lut_offset() may try to set
+        * the old block->function pointer to the code LUT. */
+       if (ENABLE_THREADED_COMPILER)
+               lightrec_reaper_pause(state->reaper);
+
+       block->function = new_fn;
+       block_clear_flags(block, BLOCK_SHOULD_RECOMPILE);
 
        /* Add compiled function to the LUT */
        lut_write(state, lut_offset(block->pc), block->function);
 
-       if (ENABLE_THREADED_COMPILER) {
-               /* Since we might try to reap the same block multiple times,
-                * we need the reaper to wait until everything has been
-                * submitted, so that the duplicate entries can be dropped. */
-               lightrec_reaper_pause(state->reaper);
-       }
+       if (ENABLE_THREADED_COMPILER)
+               lightrec_reaper_continue(state->reaper);
 
        /* Detect old blocks that have been covered by the new one */
        for (i = 0; i < cstate->nb_targets; i++) {
@@ -1352,6 +1495,13 @@ int lightrec_compile_block(struct lightrec_cstate *cstate,
                        continue;
 
                offset = block->pc + target->offset * sizeof(u32);
+
+               /* Pause the reaper while we search for the block until we set
+                * the BLOCK_IS_DEAD flag, otherwise the block may be removed
+                * under our feet. */
+               if (ENABLE_THREADED_COMPILER)
+                       lightrec_reaper_pause(state->reaper);
+
                block2 = lightrec_find_block(state->block_cache, offset);
                if (block2) {
                        /* No need to check if block2 is compilable - it must
@@ -1359,12 +1509,16 @@ int lightrec_compile_block(struct lightrec_cstate *cstate,
 
                        /* Set the "block dead" flag to prevent the dynarec from
                         * recompiling this block */
-                       block2->flags |= BLOCK_IS_DEAD;
+                       old_flags = block_set_flags(block2, BLOCK_IS_DEAD);
+               }
+
+               if (ENABLE_THREADED_COMPILER) {
+                       lightrec_reaper_continue(state->reaper);
 
                        /* If block2 was pending for compilation, cancel it.
                         * If it's being compiled right now, wait until it
                         * finishes. */
-                       if (ENABLE_THREADED_COMPILER)
+                       if (block2)
                                lightrec_recompiler_remove(state->rec, block2);
                }
 
@@ -1379,20 +1533,17 @@ int lightrec_compile_block(struct lightrec_cstate *cstate,
                                 "0x%08x\n", block2->pc, block->pc);
 
                        /* Finally, reap the block. */
-                       if (ENABLE_THREADED_COMPILER) {
+                       if (!ENABLE_THREADED_COMPILER) {
+                               lightrec_unregister_block(state->block_cache, block2);
+                               lightrec_free_block(state, block2);
+                       } else if (!(old_flags & BLOCK_IS_DEAD)) {
                                lightrec_reaper_add(state->reaper,
                                                    lightrec_reap_block,
                                                    block2);
-                       } else {
-                               lightrec_unregister_block(state->block_cache, block2);
-                               lightrec_free_block(state, block2);
                        }
                }
        }
 
-       if (ENABLE_THREADED_COMPILER)
-               lightrec_reaper_continue(state->reaper);
-
        if (ENABLE_DISASSEMBLER) {
                pr_debug("Compiling block at PC: 0x%08x\n", block->pc);
                jit_disassemble();
@@ -1400,15 +1551,20 @@ int lightrec_compile_block(struct lightrec_cstate *cstate,
 
        jit_clear_state();
 
-#if ENABLE_THREADED_COMPILER
        if (fully_tagged)
-               op_list_freed = atomic_flag_test_and_set(&block->op_list_freed);
-#endif
-       if (fully_tagged && !op_list_freed) {
+               old_flags = block_set_flags(block, BLOCK_NO_OPCODE_LIST);
+
+       if (fully_tagged && !(old_flags & BLOCK_NO_OPCODE_LIST)) {
                pr_debug("Block PC 0x%08x is fully tagged"
                         " - free opcode list\n", block->pc);
-               lightrec_free_opcode_list(state, block);
-               block->opcode_list = NULL;
+
+               if (ENABLE_THREADED_COMPILER) {
+                       lightrec_reaper_add(state->reaper,
+                                           lightrec_reap_opcode_list,
+                                           block->opcode_list);
+               } else {
+                       lightrec_free_opcode_list(state, block->opcode_list);
+               }
        }
 
        if (oldjit) {
@@ -1424,6 +1580,8 @@ int lightrec_compile_block(struct lightrec_cstate *cstate,
                        _jit_destroy_state(oldjit);
                        lightrec_free_function(state, old_fn);
                }
+
+               lightrec_unregister(MEM_FOR_CODE, old_code_size);
        }
 
        return 0;
@@ -1476,20 +1634,24 @@ u32 lightrec_execute(struct lightrec_state *state, u32 pc, u32 target_cycle)
        return state->next_pc;
 }
 
-u32 lightrec_execute_one(struct lightrec_state *state, u32 pc)
+u32 lightrec_run_interpreter(struct lightrec_state *state, u32 pc,
+                            u32 target_cycle)
 {
-       return lightrec_execute(state, pc, state->current_cycle);
-}
-
-u32 lightrec_run_interpreter(struct lightrec_state *state, u32 pc)
-{
-       struct block *block = lightrec_get_block(state, pc);
-       if (!block)
-               return 0;
+       struct block *block;
 
        state->exit_flags = LIGHTREC_EXIT_NORMAL;
+       state->target_cycle = target_cycle;
+
+       do {
+               block = lightrec_get_block(state, pc);
+               if (!block)
+                       break;
+
+               pc = lightrec_emulate_block(state, block, pc);
 
-       pc = lightrec_emulate_block(state, block, pc);
+               if (ENABLE_THREADED_COMPILER)
+                       lightrec_reaper_reap(state->reaper);
+       } while (state->current_cycle < state->target_cycle);
 
        if (LOG_LEVEL >= INFO_L)
                lightrec_print_info(state);
@@ -1499,13 +1661,19 @@ u32 lightrec_run_interpreter(struct lightrec_state *state, u32 pc)
 
 void lightrec_free_block(struct lightrec_state *state, struct block *block)
 {
+       u8 old_flags;
+
        lightrec_unregister(MEM_FOR_MIPS_CODE, block->nb_ops * sizeof(u32));
-       if (block->opcode_list)
-               lightrec_free_opcode_list(state, block);
+       old_flags = block_set_flags(block, BLOCK_NO_OPCODE_LIST);
+
+       if (!(old_flags & BLOCK_NO_OPCODE_LIST))
+               lightrec_free_opcode_list(state, block->opcode_list);
        if (block->_jit)
                _jit_destroy_state(block->_jit);
-       lightrec_free_function(state, block->function);
-       lightrec_unregister(MEM_FOR_CODE, block->code_size);
+       if (block->function) {
+               lightrec_free_function(state, block->function);
+               lightrec_unregister(MEM_FOR_CODE, block->code_size);
+       }
        lightrec_free(state, MEM_FOR_IR, sizeof(*block), block);
 }
 
@@ -1539,7 +1707,7 @@ struct lightrec_state * lightrec_init(char *argv0,
                                      size_t nb,
                                      const struct lightrec_ops *ops)
 {
-       const struct lightrec_mem_map *codebuf_map;
+       const struct lightrec_mem_map *codebuf_map = &map[PSX_MAP_CODE_BUFFER];
        struct lightrec_state *state;
        uintptr_t addr;
        void *tlsf = NULL;
@@ -1552,9 +1720,13 @@ struct lightrec_state * lightrec_init(char *argv0,
                return NULL;
        }
 
-       if (ENABLE_CODE_BUFFER && nb > PSX_MAP_CODE_BUFFER) {
-               codebuf_map = &map[PSX_MAP_CODE_BUFFER];
+       if (ops->cop2_notify)
+               pr_debug("Optional cop2_notify callback in lightrec_ops\n");
+       else
+               pr_debug("No optional cop2_notify callback in lightrec_ops\n");
 
+       if (ENABLE_CODE_BUFFER && nb > PSX_MAP_CODE_BUFFER
+           && codebuf_map->address) {
                tlsf = tlsf_create_with_pool(codebuf_map->address,
                                             codebuf_map->length);
                if (!tlsf) {
@@ -1584,15 +1756,9 @@ struct lightrec_state * lightrec_init(char *argv0,
        state->tlsf = tlsf;
        state->with_32bit_lut = with_32bit_lut;
 
-#if ENABLE_TINYMM
-       state->tinymm = tinymm_init(malloc, free, 4096);
-       if (!state->tinymm)
-               goto err_free_state;
-#endif
-
        state->block_cache = lightrec_blockcache_init(state);
        if (!state->block_cache)
-               goto err_free_tinymm;
+               goto err_free_state;
 
        if (ENABLE_THREADED_COMPILER) {
                state->rec = lightrec_recompiler_init(state);
@@ -1623,10 +1789,9 @@ struct lightrec_state * lightrec_init(char *argv0,
 
        state->c_wrappers[C_WRAPPER_RW] = lightrec_rw_cb;
        state->c_wrappers[C_WRAPPER_RW_GENERIC] = lightrec_rw_generic_cb;
+       state->c_wrappers[C_WRAPPER_MFC] = lightrec_mfc_cb;
        state->c_wrappers[C_WRAPPER_MTC] = lightrec_mtc_cb;
        state->c_wrappers[C_WRAPPER_CP] = lightrec_cp_cb;
-       state->c_wrappers[C_WRAPPER_SYSCALL] = lightrec_syscall_cb;
-       state->c_wrappers[C_WRAPPER_BREAK] = lightrec_break_cb;
 
        map = &state->maps[PSX_MAP_BIOS];
        state->offset_bios = (uintptr_t)map->address - map->pc;
@@ -1634,6 +1799,9 @@ struct lightrec_state * lightrec_init(char *argv0,
        map = &state->maps[PSX_MAP_SCRATCH_PAD];
        state->offset_scratch = (uintptr_t)map->address - map->pc;
 
+       map = &state->maps[PSX_MAP_HW_REGISTERS];
+       state->offset_io = (uintptr_t)map->address - map->pc;
+
        map = &state->maps[PSX_MAP_KERNEL_USER_RAM];
        state->offset_ram = (uintptr_t)map->address - map->pc;
 
@@ -1645,6 +1813,7 @@ struct lightrec_state * lightrec_init(char *argv0,
        if (state->offset_bios == 0 &&
            state->offset_scratch == 0 &&
            state->offset_ram == 0 &&
+           state->offset_io == 0 &&
            state->mirrors_mapped) {
                pr_info("Memory map is perfect. Emitted code will be best.\n");
        } else {
@@ -1668,11 +1837,7 @@ err_free_recompiler:
                lightrec_free_cstate(state->cstate);
 err_free_block_cache:
        lightrec_free_block_cache(state->block_cache);
-err_free_tinymm:
-#if ENABLE_TINYMM
-       tinymm_shutdown(state->tinymm);
 err_free_state:
-#endif
        lightrec_unregister(MEM_FOR_LIGHTREC, sizeof(*state) +
                            lut_elm_size(state) * CODE_LUT_SIZE);
        free(state);
@@ -1689,6 +1854,10 @@ void lightrec_destroy(struct lightrec_state *state)
        state->current_cycle = ~state->current_cycle;
        lightrec_print_info(state);
 
+       lightrec_free_block_cache(state->block_cache);
+       lightrec_free_block(state, state->dispatcher);
+       lightrec_free_block(state, state->c_wrapper_block);
+
        if (ENABLE_THREADED_COMPILER) {
                lightrec_free_recompiler(state->rec);
                lightrec_reaper_destroy(state->reaper);
@@ -1696,16 +1865,10 @@ void lightrec_destroy(struct lightrec_state *state)
                lightrec_free_cstate(state->cstate);
        }
 
-       lightrec_free_block_cache(state->block_cache);
-       lightrec_free_block(state, state->dispatcher);
-       lightrec_free_block(state, state->c_wrapper_block);
        finish_jit();
        if (ENABLE_CODE_BUFFER && state->tlsf)
                tlsf_destroy(state->tlsf);
 
-#if ENABLE_TINYMM
-       tinymm_shutdown(state->tinymm);
-#endif
        lightrec_unregister(MEM_FOR_LIGHTREC, sizeof(*state) +
                            lut_elm_size(state) * CODE_LUT_SIZE);
        free(state);
@@ -1714,17 +1877,23 @@ void lightrec_destroy(struct lightrec_state *state)
 void lightrec_invalidate(struct lightrec_state *state, u32 addr, u32 len)
 {
        u32 kaddr = kunseg(addr & ~0x3);
-       const struct lightrec_mem_map *map = lightrec_get_map(state, NULL, kaddr);
-
-       if (map) {
-               if (map != &state->maps[PSX_MAP_KERNEL_USER_RAM])
-                       return;
+       enum psx_map idx = lightrec_get_map_idx(state, kaddr);
 
+       switch (idx) {
+       case PSX_MAP_MIRROR1:
+       case PSX_MAP_MIRROR2:
+       case PSX_MAP_MIRROR3:
                /* Handle mirrors */
-               kaddr &= (state->maps[PSX_MAP_KERNEL_USER_RAM].length - 1);
-
-               lightrec_invalidate_map(state, map, kaddr, len);
+               kaddr &= RAM_SIZE - 1;
+               fallthrough;
+       case PSX_MAP_KERNEL_USER_RAM:
+               break;
+       default:
+               return;
        }
+
+       memset(lut_address(state, lut_offset(kaddr)), 0,
+              ((len + 3) / 4) * lut_elm_size(state));
 }
 
 void lightrec_invalidate_all(struct lightrec_state *state)