git subrepo pull --force deps/lightrec
author Paul Cercueil <paul@crapouillou.net>
Sun, 20 Feb 2022 00:16:48 +0000 (00:16 +0000)
committer Paul Cercueil <paul@crapouillou.net>
Mon, 21 Feb 2022 23:26:31 +0000 (23:26 +0000)
subrepo:
  subdir:   "deps/lightrec"
  merged:   "d90de684"
upstream:
  origin:   "https://github.com/pcercuei/lightrec.git"
  branch:   "master"
  commit:   "d90de684"
git-subrepo:
  version:  "0.4.3"
  origin:   "https://github.com/ingydotnet/git-subrepo.git"
  commit:   "2f68596"

29 files changed:
deps/lightrec/.gitrepo
deps/lightrec/CMakeLists.txt
deps/lightrec/blockcache.c
deps/lightrec/blockcache.h
deps/lightrec/config.h [deleted file]
deps/lightrec/config.h.cmakein [deleted file]
deps/lightrec/debug.h
deps/lightrec/disassembler.c
deps/lightrec/disassembler.h
deps/lightrec/emitter.c
deps/lightrec/emitter.h
deps/lightrec/interpreter.c
deps/lightrec/interpreter.h
deps/lightrec/lightning-wrapper.h [new file with mode: 0644]
deps/lightrec/lightrec-config.h.cmakein [new file with mode: 0644]
deps/lightrec/lightrec-private.h
deps/lightrec/lightrec.c
deps/lightrec/lightrec.h
deps/lightrec/memmanager.c
deps/lightrec/memmanager.h
deps/lightrec/optimizer.c
deps/lightrec/optimizer.h
deps/lightrec/reaper.c
deps/lightrec/reaper.h
deps/lightrec/recompiler.c
deps/lightrec/recompiler.h
deps/lightrec/regcache.c
deps/lightrec/regcache.h
deps/lightrec/slist.h

diff --git a/deps/lightrec/.gitrepo b/deps/lightrec/.gitrepo
index 4ebb7d2..770ee66 100644 (file)
@@ -6,7 +6,7 @@
 [subrepo]
        remote = https://github.com/pcercuei/lightrec.git
        branch = master
-       commit = 2cca097e538876d219b8af9663abe0ca74f68bb2
-       parent = 5c00ea32a0eab812299b08acd14c25bf6ba4ca7a
+       commit = d90de68429bf9c2d67c5f5051d495d1e3131e636
+       parent = a9725dc07f40b39a5533d546b59e45377d1f9b66
        method = merge
-       cmdver = 0.4.1
+       cmdver = 0.4.3
diff --git a/deps/lightrec/CMakeLists.txt b/deps/lightrec/CMakeLists.txt
index c58dac5..6a139f4 100644 (file)
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.0)
-project(lightrec LANGUAGES C VERSION 0.3)
+project(lightrec LANGUAGES C VERSION 0.4)
 
 set(BUILD_SHARED_LIBS ON CACHE BOOL "Build shared libraries")
 if (NOT BUILD_SHARED_LIBS)
@@ -25,9 +25,10 @@ if (CMAKE_COMPILER_IS_GNUCC)
        add_compile_options(-fvisibility=hidden)
 endif()
 
+set(HAS_DEFAULT_ELM ${CMAKE_COMPILER_IS_GNUCC})
+
 list(APPEND LIGHTREC_SOURCES
        blockcache.c
-       disassembler.c
        emitter.c
        interpreter.c
        lightrec.c
@@ -60,6 +61,17 @@ if (ENABLE_THREADED_COMPILER)
        endif (NOT ENABLE_FIRST_PASS)
 endif (ENABLE_THREADED_COMPILER)
 
+option(OPT_REMOVE_DIV_BY_ZERO_SEQ "(optimization) Remove div-by-zero check sequence" ON)
+option(OPT_REPLACE_MEMSET "(optimization) Detect and replace memset with host variant" ON)
+option(OPT_DETECT_IMPOSSIBLE_BRANCHES "(optimization) Detect impossible branches" ON)
+option(OPT_TRANSFORM_OPS "(optimization) Transform opcodes" ON)
+option(OPT_LOCAL_BRANCHES "(optimization) Detect local branches" ON)
+option(OPT_SWITCH_DELAY_SLOTS "(optimization) Switch delay slots" ON)
+option(OPT_FLAG_STORES "(optimization) Flag stores that don't require invalidation" ON)
+option(OPT_FLAG_IO "(optimization) Flag I/O opcodes whose target is known" ON)
+option(OPT_FLAG_MULT_DIV "(optimization) Flag MULT/DIV that only use one of HI/LO" ON)
+option(OPT_EARLY_UNLOAD "(optimization) Unload registers early" ON)
+
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
 
 add_library(${PROJECT_NAME} ${LIGHTREC_SOURCES} ${LIGHTREC_HEADERS})
@@ -72,6 +84,13 @@ set_target_properties(${PROJECT_NAME} PROPERTIES
        C_EXTENSIONS OFF
 )
 
+if (CMAKE_C_COMPILER_ID MATCHES "GNU|Clang")
+       target_compile_options(${PROJECT_NAME} PRIVATE -Wall -Wno-parentheses)
+endif()
+if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+       target_compile_options(${PROJECT_NAME} PRIVATE -Wno-initializer-overrides)
+endif()
+
 option(ENABLE_TINYMM "Enable optional libtinymm dependency" OFF)
 if (ENABLE_TINYMM)
        find_library(TINYMM_LIBRARIES tinymm REQUIRED)
@@ -96,19 +115,11 @@ include_directories(${LIBLIGHTNING_INCLUDE_DIR})
 target_link_libraries(${PROJECT_NAME} PRIVATE ${LIBLIGHTNING})
 
 if (LOG_LEVEL STREQUAL Debug)
-       find_library(LIBOPCODES NAMES opcodes-multiarch opcodes)
-       find_path(LIBOPCODES_INCLUDE_DIR dis-asm.h)
-
-       if (NOT LIBOPCODES OR NOT LIBOPCODES_INCLUDE_DIR)
-               message(SEND_ERROR "Debug log level requires libopcodes (from binutils) to be installed.")
-       endif ()
-
        set(ENABLE_DISASSEMBLER ON)
-       include_directories(${LIBOPCODES_INCLUDE_DIR})
-       target_link_libraries(${PROJECT_NAME} PRIVATE ${LIBOPCODES})
+       target_sources(${PROJECT_NAME} PRIVATE disassembler.c)
 endif()
 
-configure_file(config.h.cmakein config.h @ONLY)
+configure_file(lightrec-config.h.cmakein lightrec-config.h @ONLY)
 
 include(GNUInstallDirs)
 install(TARGETS ${PROJECT_NAME}
diff --git a/deps/lightrec/blockcache.c b/deps/lightrec/blockcache.c
index 4263431..4512392 100644 (file)
@@ -1,15 +1,6 @@
+// SPDX-License-Identifier: LGPL-2.1-or-later
 /*
- * Copyright (C) 2015-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2015-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #include "blockcache.h"
@@ -19,6 +10,7 @@
 
 #include <stdbool.h>
 #include <stdlib.h>
+#include <string.h>
 
 /* Must be power of two */
 #define LUT_SIZE 0x4000
@@ -28,6 +20,11 @@ struct blockcache {
        struct block * lut[LUT_SIZE];
 };
 
+u16 lightrec_get_lut_entry(const struct block *block)
+{
+       return (kunseg(block->pc) >> 2) & (LUT_SIZE - 1);
+}
+
 struct block * lightrec_find_block(struct blockcache *cache, u32 pc)
 {
        struct block *block;
@@ -42,22 +39,33 @@ struct block * lightrec_find_block(struct blockcache *cache, u32 pc)
        return NULL;
 }
 
-void remove_from_code_lut(struct blockcache *cache, struct block *block)
+struct block * lightrec_find_block_from_lut(struct blockcache *cache,
+                                           u16 lut_entry, u32 addr_in_block)
 {
-       struct lightrec_state *state = block->state;
-       const struct opcode *op;
-       u32 offset = lut_offset(block->pc);
+       struct block *block;
+       u32 pc;
 
-       /* Use state->get_next_block in the code LUT, which basically
-        * calls back get_next_block_func(), until the compiler
-        * overrides this. This is required, as a NULL value in the code
-        * LUT means an outdated block. */
-       state->code_lut[offset] = state->get_next_block;
+       addr_in_block = kunseg(addr_in_block);
 
-       for (op = block->opcode_list; op; op = op->next)
-               if (op->c.i.op == OP_META_SYNC)
-                       state->code_lut[offset + op->offset] = NULL;
+       for (block = cache->lut[lut_entry]; block; block = block->next) {
+               pc = kunseg(block->pc);
+               if (addr_in_block >= pc &&
+                   addr_in_block < pc + (block->nb_ops << 2))
+                       return block;
+       }
 
+       return NULL;
+}
+
+void remove_from_code_lut(struct blockcache *cache, struct block *block)
+{
+       struct lightrec_state *state = cache->state;
+       u32 offset = lut_offset(block->pc);
+
+       if (block->function) {
+               memset(&state->code_lut[offset], 0,
+                      block->nb_ops * sizeof(*state->code_lut));
+       }
 }
 
 void lightrec_register_block(struct blockcache *cache, struct block *block)
@@ -102,7 +110,7 @@ void lightrec_free_block_cache(struct blockcache *cache)
        for (i = 0; i < LUT_SIZE; i++) {
                for (block = cache->lut[i]; block; block = next) {
                        next = block->next;
-                       lightrec_free_block(block);
+                       lightrec_free_block(cache->state, block);
                }
        }
 
@@ -124,18 +132,10 @@ struct blockcache * lightrec_blockcache_init(struct lightrec_state *state)
 
 u32 lightrec_calculate_block_hash(const struct block *block)
 {
-       const struct lightrec_mem_map *map = block->map;
-       u32 pc, hash = 0xffffffff;
-       const u32 *code;
+       const u32 *code = block->code;
+       u32 hash = 0xffffffff;
        unsigned int i;
 
-       pc = kunseg(block->pc) - map->pc;
-
-       while (map->mirror_of)
-               map = map->mirror_of;
-
-       code = map->address + pc;
-
        /* Jenkins one-at-a-time hash algorithm */
        for (i = 0; i < block->nb_ops; i++) {
                hash += *code++;
@@ -150,9 +150,9 @@ u32 lightrec_calculate_block_hash(const struct block *block)
        return hash;
 }
 
-bool lightrec_block_is_outdated(struct block *block)
+bool lightrec_block_is_outdated(struct lightrec_state *state, struct block *block)
 {
-       void **lut_entry = &block->state->code_lut[lut_offset(block->pc)];
+       void **lut_entry = &state->code_lut[lut_offset(block->pc)];
        bool outdated;
 
        if (*lut_entry)
@@ -165,7 +165,7 @@ bool lightrec_block_is_outdated(struct block *block)
                if (block->function)
                        *lut_entry = block->function;
                else
-                       *lut_entry = block->state->get_next_block;
+                       *lut_entry = state->get_next_block;
        }
 
        return outdated;
diff --git a/deps/lightrec/blockcache.h b/deps/lightrec/blockcache.h
index ff63651..3b782f4 100644 (file)
@@ -1,15 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
 /*
- * Copyright (C) 2014-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2014-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #ifndef __BLOCKCACHE_H__
 struct blockcache;
 
 struct block * lightrec_find_block(struct blockcache *cache, u32 pc);
+struct block * lightrec_find_block_from_lut(struct blockcache *cache,
+                                           u16 lut_entry, u32 addr_in_block);
+u16 lightrec_get_lut_entry(const struct block *block);
+
 void lightrec_register_block(struct blockcache *cache, struct block *block);
 void lightrec_unregister_block(struct blockcache *cache, struct block *block);
 
@@ -27,6 +22,6 @@ struct blockcache * lightrec_blockcache_init(struct lightrec_state *state);
 void lightrec_free_block_cache(struct blockcache *cache);
 
 u32 lightrec_calculate_block_hash(const struct block *block);
-_Bool lightrec_block_is_outdated(struct block *block);
+_Bool lightrec_block_is_outdated(struct lightrec_state *state, struct block *block);
 
 #endif /* __BLOCKCACHE_H__ */
diff --git a/deps/lightrec/config.h b/deps/lightrec/config.h
deleted file mode 100644 (file)
index b72ae10..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (C) 2019 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- */
-
-#ifndef __LIGHTREC_CONFIG_H__
-#define __LIGHTREC_CONFIG_H__
-
-#define ENABLE_THREADED_COMPILER 1
-#define ENABLE_FIRST_PASS 1
-#define ENABLE_DISASSEMBLER 0
-#define ENABLE_TINYMM 0
-
-#endif /* __LIGHTREC_CONFIG_H__ */
diff --git a/deps/lightrec/config.h.cmakein b/deps/lightrec/config.h.cmakein
deleted file mode 100644 (file)
index 1eac007..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (C) 2019 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- */
-
-#ifndef __LIGHTREC_CONFIG_H__
-#define __LIGHTREC_CONFIG_H__
-
-#cmakedefine01 ENABLE_THREADED_COMPILER
-#cmakedefine01 ENABLE_FIRST_PASS
-#cmakedefine01 ENABLE_DISASSEMBLER
-#cmakedefine01 ENABLE_TINYMM
-
-#endif /* __LIGHTREC_CONFIG_H__ */
-
diff --git a/deps/lightrec/debug.h b/deps/lightrec/debug.h
index 4facc22..273f1e5 100644 (file)
@@ -1,15 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
 /*
- * Copyright (C) 2014-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2014-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #ifndef DEBUG_H
diff --git a/deps/lightrec/disassembler.c b/deps/lightrec/disassembler.c
index 06fcec9..c357a30 100644 (file)
+// SPDX-License-Identifier: LGPL-2.1-or-later
 /*
- * Copyright (C) 2014-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2014-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
-#include "config.h"
-
-#if ENABLE_DISASSEMBLER
-#include <dis-asm.h>
-#endif
 #include <stdbool.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include "debug.h"
-#include "disassembler.h"
 #include "lightrec-private.h"
-#include "memmanager.h"
+#include "regcache.h"
 
-static bool is_unconditional_jump(const struct opcode *op)
-{
-       switch (op->i.op) {
-       case OP_SPECIAL:
-               return op->r.op == OP_SPECIAL_JR || op->r.op == OP_SPECIAL_JALR;
-       case OP_J:
-       case OP_JAL:
-               return true;
-       case OP_BEQ:
-       case OP_BLEZ:
-               return op->i.rs == op->i.rt;
-       case OP_REGIMM:
-               return (op->r.rt == OP_REGIMM_BGEZ ||
-                       op->r.rt == OP_REGIMM_BGEZAL) && op->i.rs == 0;
-       default:
-               return false;
-       }
-}
+static const char *std_opcodes[] = {
+       [OP_J]                  = "j       ",
+       [OP_JAL]                = "jal     ",
+       [OP_BEQ]                = "beq     ",
+       [OP_BNE]                = "bne     ",
+       [OP_BLEZ]               = "blez    ",
+       [OP_BGTZ]               = "bgtz    ",
+       [OP_ADDI]               = "addi    ",
+       [OP_ADDIU]              = "addiu   ",
+       [OP_SLTI]               = "slti    ",
+       [OP_SLTIU]              = "sltiu   ",
+       [OP_ANDI]               = "andi    ",
+       [OP_ORI]                = "ori     ",
+       [OP_XORI]               = "xori    ",
+       [OP_LUI]                = "lui     ",
+       [OP_LB]                 = "lb      ",
+       [OP_LH]                 = "lh      ",
+       [OP_LWL]                = "lwl     ",
+       [OP_LW]                 = "lw      ",
+       [OP_LBU]                = "lbu     ",
+       [OP_LHU]                = "lhu     ",
+       [OP_LWR]                = "lwr     ",
+       [OP_SB]                 = "sb      ",
+       [OP_SH]                 = "sh      ",
+       [OP_SWL]                = "swl     ",
+       [OP_SW]                 = "sw      ",
+       [OP_SWR]                = "swr     ",
+       [OP_LWC2]               = "lwc2    ",
+       [OP_SWC2]               = "swc2    ",
+};
 
-static bool is_syscall(const struct opcode *op)
-{
-       return (op->i.op == OP_SPECIAL && (op->r.op == OP_SPECIAL_SYSCALL ||
-                                          op->r.op == OP_SPECIAL_BREAK)) ||
-               (op->i.op == OP_CP0 && (op->r.rs == OP_CP0_MTC0 ||
-                                       op->r.rs == OP_CP0_CTC0) &&
-                (op->r.rd == 12 || op->r.rd == 13));
-}
+static const char *special_opcodes[] = {
+       [OP_SPECIAL_SLL]        = "sll     ",
+       [OP_SPECIAL_SRL]        = "srl     ",
+       [OP_SPECIAL_SRA]        = "sra     ",
+       [OP_SPECIAL_SLLV]       = "sllv    ",
+       [OP_SPECIAL_SRLV]       = "srlv    ",
+       [OP_SPECIAL_SRAV]       = "srav    ",
+       [OP_SPECIAL_JR]         = "jr      ",
+       [OP_SPECIAL_JALR]       = "jalr    ",
+       [OP_SPECIAL_SYSCALL]    = "syscall ",
+       [OP_SPECIAL_BREAK]      = "break   ",
+       [OP_SPECIAL_MFHI]       = "mfhi    ",
+       [OP_SPECIAL_MTHI]       = "mthi    ",
+       [OP_SPECIAL_MFLO]       = "mflo    ",
+       [OP_SPECIAL_MTLO]       = "mtlo    ",
+       [OP_SPECIAL_MULT]       = "mult    ",
+       [OP_SPECIAL_MULTU]      = "multu   ",
+       [OP_SPECIAL_DIV]        = "div     ",
+       [OP_SPECIAL_DIVU]       = "divu    ",
+       [OP_SPECIAL_ADD]        = "add     ",
+       [OP_SPECIAL_ADDU]       = "addu    ",
+       [OP_SPECIAL_SUB]        = "sub     ",
+       [OP_SPECIAL_SUBU]       = "subu    ",
+       [OP_SPECIAL_AND]        = "and     ",
+       [OP_SPECIAL_OR]         = "or      ",
+       [OP_SPECIAL_XOR]        = "xor     ",
+       [OP_SPECIAL_NOR]        = "nor     ",
+       [OP_SPECIAL_SLT]        = "slt     ",
+       [OP_SPECIAL_SLTU]       = "sltu    ",
+};
 
-void lightrec_free_opcode_list(struct lightrec_state *state, struct opcode *list)
-{
-       struct opcode *next;
+static const char *regimm_opcodes[] = {
+       [OP_REGIMM_BLTZ]        = "bltz    ",
+       [OP_REGIMM_BGEZ]        = "bgez    ",
+       [OP_REGIMM_BLTZAL]      = "bltzal  ",
+       [OP_REGIMM_BGEZAL]      = "bgezal  ",
+};
 
-       while (list) {
-               next = list->next;
-               lightrec_free(state, MEM_FOR_IR, sizeof(*list), list);
-               list = next;
-       }
-}
+static const char *cp0_opcodes[] = {
+       [OP_CP0_MFC0]           = "mfc0    ",
+       [OP_CP0_CFC0]           = "cfc0    ",
+       [OP_CP0_MTC0]           = "mtc0    ",
+       [OP_CP0_CTC0]           = "ctc0    ",
+       [OP_CP0_RFE]            = "rfe",
+};
+
+static const char *cp2_opcodes[] = {
+       [OP_CP2_BASIC_MFC2]     = "mfc2    ",
+       [OP_CP2_BASIC_CFC2]     = "cfc2    ",
+       [OP_CP2_BASIC_MTC2]     = "mtc2    ",
+       [OP_CP2_BASIC_CTC2]     = "ctc2    ",
+};
+
+static const char *opcode_flags[] = {
+       "switched branch/DS",
+       "unload Rs",
+       "unload Rt",
+       "unload Rd",
+       "sync point",
+};
+
+static const char *opcode_io_flags[] = {
+       "memory I/O",
+       "hardware I/O",
+       "self-modifying code",
+       "no invalidation",
+};
 
-struct opcode * lightrec_disassemble(struct lightrec_state *state,
-                                    const u32 *src, unsigned int *len)
+static const char *opcode_branch_flags[] = {
+       "emulate branch",
+       "local branch",
+};
+
+static const char *opcode_multdiv_flags[] = {
+       "No LO",
+       "No HI",
+       "No div check",
+};
+
+static int print_flags(char *buf, size_t len, u16 flags,
+                      const char **array, size_t array_size)
 {
-       struct opcode *head = NULL;
-       bool stop_next = false;
-       struct opcode *curr, *last;
+       const char *flag_name;
        unsigned int i;
+       size_t count = 0, bytes;
+       bool first = true;
 
-       for (i = 0, last = NULL; ; i++, last = curr) {
-               curr = lightrec_calloc(state, MEM_FOR_IR, sizeof(*curr));
-               if (!curr) {
-                       pr_err("Unable to allocate memory\n");
-                       lightrec_free_opcode_list(state, head);
-                       return NULL;
-               }
+       for (i = 0; i < array_size + ARRAY_SIZE(opcode_flags); i++) {
+               if (!(flags & BIT(i)))
+                       continue;
 
-               if (!last)
-                       head = curr;
+               if (i < ARRAY_SIZE(opcode_flags))
+                       flag_name = opcode_flags[i];
                else
-                       last->next = curr;
-
-               /* TODO: Take care of endianness */
-               curr->opcode = LE32TOH(*src++);
-               curr->offset = i;
-
-               /* NOTE: The block disassembly ends after the opcode that
-                * follows an unconditional jump (delay slot) */
-               if (stop_next || is_syscall(curr))
-                       break;
-               else if (is_unconditional_jump(curr))
-                       stop_next = true;
+                       flag_name = array[i - ARRAY_SIZE(opcode_flags)];
+
+               if (first)
+                       bytes = snprintf(buf, len, "(%s", flag_name);
+               else
+                       bytes = snprintf(buf, len, ", %s", flag_name);
+
+               first = false;
+               buf += bytes;
+               len -= bytes;
+               count += bytes;
        }
 
-       if (len)
-               *len = (i + 1) * sizeof(u32);
+       if (!first)
+               count += snprintf(buf, len, ")");
+       else
+               *buf = '\0';
 
-       return head;
+       return count;
 }
 
-unsigned int lightrec_cycles_of_opcode(union code code)
+static int print_op_special(union code c, char *buf, size_t len,
+                           const char ***flags_ptr, size_t *nb_flags)
 {
-       switch (code.i.op) {
-       case OP_META_REG_UNLOAD:
-       case OP_META_SYNC:
-               return 0;
+       switch (c.r.op) {
+       case OP_SPECIAL_SLL:
+       case OP_SPECIAL_SRL:
+       case OP_SPECIAL_SRA:
+               return snprintf(buf, len, "%s%s,%s,%u",
+                               special_opcodes[c.r.op],
+                               lightrec_reg_name(c.r.rd),
+                               lightrec_reg_name(c.r.rt),
+                               c.r.imm);
+       case OP_SPECIAL_SLLV:
+       case OP_SPECIAL_SRLV:
+       case OP_SPECIAL_SRAV:
+       case OP_SPECIAL_ADD:
+       case OP_SPECIAL_ADDU:
+       case OP_SPECIAL_SUB:
+       case OP_SPECIAL_SUBU:
+       case OP_SPECIAL_AND:
+       case OP_SPECIAL_OR:
+       case OP_SPECIAL_XOR:
+       case OP_SPECIAL_NOR:
+       case OP_SPECIAL_SLT:
+       case OP_SPECIAL_SLTU:
+               return snprintf(buf, len, "%s%s,%s,%s",
+                               special_opcodes[c.r.op],
+                               lightrec_reg_name(c.r.rd),
+                               lightrec_reg_name(c.r.rt),
+                               lightrec_reg_name(c.r.rs));
+       case OP_SPECIAL_JR:
+       case OP_SPECIAL_MTHI:
+       case OP_SPECIAL_MTLO:
+               return snprintf(buf, len, "%s%s",
+                               special_opcodes[c.r.op],
+                               lightrec_reg_name(c.r.rs));
+       case OP_SPECIAL_JALR:
+               return snprintf(buf, len, "%s%s,%s",
+                               special_opcodes[c.r.op],
+                               lightrec_reg_name(c.r.rd),
+                               lightrec_reg_name(c.r.rt));
+       case OP_SPECIAL_SYSCALL:
+       case OP_SPECIAL_BREAK:
+               return snprintf(buf, len, "%s", special_opcodes[c.r.op]);
+       case OP_SPECIAL_MFHI:
+       case OP_SPECIAL_MFLO:
+               return snprintf(buf, len, "%s%s",
+                               special_opcodes[c.r.op],
+                               lightrec_reg_name(c.r.rd));
+       case OP_SPECIAL_MULT:
+       case OP_SPECIAL_MULTU:
+       case OP_SPECIAL_DIV:
+       case OP_SPECIAL_DIVU:
+               *flags_ptr = opcode_multdiv_flags;
+               *nb_flags = ARRAY_SIZE(opcode_multdiv_flags);
+               return snprintf(buf, len, "%s%s,%s,%s,%s",
+                               special_opcodes[c.r.op],
+                               lightrec_reg_name(get_mult_div_hi(c)),
+                               lightrec_reg_name(get_mult_div_lo(c)),
+                               lightrec_reg_name(c.r.rs),
+                               lightrec_reg_name(c.r.rt));
        default:
-               return 2;
+               return snprintf(buf, len, "unknown (0x%08x)", c.opcode);
        }
 }
 
-#if ENABLE_DISASSEMBLER
-void lightrec_print_disassembly(const struct block *block,
-                               const u32 *code, unsigned int length)
+static int print_op_cp(union code c, char *buf, size_t len, unsigned int cp)
 {
-       struct disassemble_info info;
+       if (cp == 2) {
+               switch (c.i.rs) {
+               case OP_CP0_MFC0:
+               case OP_CP0_CFC0:
+               case OP_CP0_MTC0:
+               case OP_CP0_CTC0:
+                       return snprintf(buf, len, "%s%s,%u",
+                                       cp2_opcodes[c.i.rs],
+                                       lightrec_reg_name(c.i.rt),
+                                       c.r.rd);
+               default:
+                       return snprintf(buf, len, "cp2     (0x%08x)", c.opcode);
+               }
+       } else {
+               switch (c.i.rs) {
+               case OP_CP0_MFC0:
+               case OP_CP0_CFC0:
+               case OP_CP0_MTC0:
+               case OP_CP0_CTC0:
+                       return snprintf(buf, len, "%s%s,%u",
+                                       cp0_opcodes[c.i.rs],
+                                       lightrec_reg_name(c.i.rt),
+                                       c.r.rd);
+               case OP_CP0_RFE:
+                       return snprintf(buf, len, "rfe     ");
+               default:
+                       return snprintf(buf, len, "unknown (0x%08x)", c.opcode);
+               }
+       }
+}
+
+static int print_op(union code c, u32 pc, char *buf, size_t len,
+                   const char ***flags_ptr, size_t *nb_flags)
+{
+       if (c.opcode == 0)
+               return snprintf(buf, len, "nop     ");
+
+       switch (c.i.op) {
+       case OP_SPECIAL:
+               return print_op_special(c, buf, len, flags_ptr, nb_flags);
+       case OP_REGIMM:
+               *flags_ptr = opcode_branch_flags;
+               *nb_flags = ARRAY_SIZE(opcode_branch_flags);
+               return snprintf(buf, len, "%s%s,0x%x",
+                               regimm_opcodes[c.i.rt],
+                               lightrec_reg_name(c.i.rs),
+                               pc + 4 + ((s16)c.i.imm << 2));
+       case OP_J:
+       case OP_JAL:
+               return snprintf(buf, len, "%s0x%x",
+                               std_opcodes[c.i.op],
+                               (pc & 0xf0000000) | (c.j.imm << 2));
+       case OP_BEQ:
+       case OP_BNE:
+       case OP_BLEZ:
+       case OP_BGTZ:
+               *flags_ptr = opcode_branch_flags;
+               *nb_flags = ARRAY_SIZE(opcode_branch_flags);
+               return snprintf(buf, len, "%s%s,%s,0x%x",
+                               std_opcodes[c.i.op],
+                               lightrec_reg_name(c.i.rs),
+                               lightrec_reg_name(c.i.rt),
+                               pc + 4 + ((s16)c.i.imm << 2));
+       case OP_ADDI:
+       case OP_ADDIU:
+       case OP_SLTI:
+       case OP_SLTIU:
+       case OP_ANDI:
+       case OP_ORI:
+       case OP_XORI:
+               return snprintf(buf, len, "%s%s,%s,0x%04hx",
+                               std_opcodes[c.i.op],
+                               lightrec_reg_name(c.i.rt),
+                               lightrec_reg_name(c.i.rs),
+                               (u16)c.i.imm);
+
+       case OP_LUI:
+               return snprintf(buf, len, "%s%s,0x%04hx",
+                               std_opcodes[c.i.op],
+                               lightrec_reg_name(c.i.rt),
+                               (u16)c.i.imm);
+       case OP_CP0:
+               return print_op_cp(c, buf, len, 0);
+       case OP_CP2:
+               return print_op_cp(c, buf, len, 2);
+       case OP_LB:
+       case OP_LH:
+       case OP_LWL:
+       case OP_LW:
+       case OP_LBU:
+       case OP_LHU:
+       case OP_LWR:
+       case OP_SB:
+       case OP_SH:
+       case OP_SWL:
+       case OP_SW:
+       case OP_SWR:
+               *flags_ptr = opcode_io_flags;
+               *nb_flags = ARRAY_SIZE(opcode_io_flags);
+               return snprintf(buf, len, "%s%s,%hd(%s)",
+                               std_opcodes[c.i.op],
+                               lightrec_reg_name(c.i.rt),
+                               (s16)c.i.imm,
+                               lightrec_reg_name(c.i.rs));
+       case OP_LWC2:
+       case OP_SWC2:
+               *flags_ptr = opcode_io_flags;
+               *nb_flags = ARRAY_SIZE(opcode_io_flags);
+               return snprintf(buf, len, "%s%s,%hd(%s)",
+                               std_opcodes[c.i.op],
+                               lightrec_reg_name(c.i.rt),
+                               (s16)c.i.imm,
+                               lightrec_reg_name(c.i.rs));
+       case OP_META_MOV:
+               return snprintf(buf, len, "move    %s,%s",
+                               lightrec_reg_name(c.r.rd),
+                               lightrec_reg_name(c.r.rs));
+       case OP_META_EXTC:
+               return snprintf(buf, len, "extc    %s,%s",
+                               lightrec_reg_name(c.i.rt),
+                               lightrec_reg_name(c.i.rs));
+       case OP_META_EXTS:
+               return snprintf(buf, len, "exts    %s,%s",
+                               lightrec_reg_name(c.i.rt),
+                               lightrec_reg_name(c.i.rs));
+       default:
+               return snprintf(buf, len, "unknown (0x%08x)", c.opcode);
+       }
+}
+
+void lightrec_print_disassembly(const struct block *block, const u32 *code)
+{
+       const struct opcode *op;
+       const char **flags_ptr;
+       size_t nb_flags, count, count2;
+       char buf[256], buf2[256], buf3[256];
        unsigned int i;
+       u32 pc, branch_pc;
+
+       for (i = 0; i < block->nb_ops; i++) {
+               op = &block->opcode_list[i];
+               branch_pc = get_branch_pc(block, i, 0);
+               pc = block->pc + (i << 2);
+
+               count = print_op((union code)code[i], pc, buf, sizeof(buf),
+                                &flags_ptr, &nb_flags);
+
+               flags_ptr = NULL;
+               nb_flags = 0;
+               count2 = print_op(op->c, branch_pc, buf2, sizeof(buf2),
+                                 &flags_ptr, &nb_flags);
+
+               if (code[i] == op->c.opcode) {
+                       *buf2 = '\0';
+                       count2 = 0;
+               }
+
+               print_flags(buf3, sizeof(buf3), op->flags, flags_ptr, nb_flags);
 
-       memset(&info, 0, sizeof(info));
-       init_disassemble_info(&info, stdout, (fprintf_ftype) fprintf);
-
-       info.buffer = (bfd_byte *) code;
-       info.buffer_vma = (bfd_vma)(uintptr_t) code;
-       info.buffer_length = length;
-       info.flavour = bfd_target_unknown_flavour;
-       info.arch = bfd_arch_mips;
-       info.mach = bfd_mach_mips3000;
-       disassemble_init_for_target(&info);
-
-       for (i = 0; i < length; i += 4) {
-               void print_insn_little_mips(bfd_vma, struct disassemble_info *);
-               putc('\t', stdout);
-               print_insn_little_mips((bfd_vma)(uintptr_t) code++, &info);
-               putc('\n', stdout);
+               printf("0x%08x (0x%x)\t%s%*c%s%*c%s\n", pc, i << 2,
+                      buf, 30 - (int)count, ' ', buf2, 30 - (int)count2, ' ', buf3);
        }
 }
-#endif
diff --git a/deps/lightrec/disassembler.h b/deps/lightrec/disassembler.h
index 249d094..ae2af7e 100644 (file)
@@ -1,15 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
 /*
- * Copyright (C) 2014-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2014-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #ifndef __DISASSEMBLER_H__
 #define __packed __attribute__((packed))
 #endif
 
-#define LIGHTREC_DIRECT_IO     (1 << 0)
-#define LIGHTREC_NO_INVALIDATE (1 << 1)
-#define LIGHTREC_NO_DS         (1 << 2)
-#define LIGHTREC_SMC           (1 << 3)
-#define LIGHTREC_EMULATE_BRANCH        (1 << 4)
-#define LIGHTREC_LOCAL_BRANCH  (1 << 5)
-#define LIGHTREC_HW_IO         (1 << 6)
-#define LIGHTREC_MULT32                (1 << 7)
+#define BIT(x) (1ULL << (x))
+
+/* Flags for all opcodes */
+#define LIGHTREC_NO_DS         BIT(0)
+#define LIGHTREC_UNLOAD_RS     BIT(1)
+#define LIGHTREC_UNLOAD_RT     BIT(2)
+#define LIGHTREC_UNLOAD_RD     BIT(3)
+#define LIGHTREC_SYNC          BIT(4)
+
+/* Flags for load/store opcodes */
+#define LIGHTREC_DIRECT_IO     BIT(5)
+#define LIGHTREC_HW_IO         BIT(6)
+#define LIGHTREC_SMC           BIT(7)
+#define LIGHTREC_NO_INVALIDATE BIT(8)
+
+/* Flags for branches */
+#define LIGHTREC_EMULATE_BRANCH        BIT(5)
+#define LIGHTREC_LOCAL_BRANCH  BIT(6)
+
+/* Flags for div/mult opcodes */
+#define LIGHTREC_NO_LO         BIT(5)
+#define LIGHTREC_NO_HI         BIT(6)
+#define LIGHTREC_NO_DIV_CHECK  BIT(7)
 
 struct block;
 
@@ -67,13 +73,10 @@ enum standard_opcodes {
        OP_LWC2                 = 0x32,
        OP_SWC2                 = 0x3a,
 
-       OP_META_REG_UNLOAD      = 0x11,
-
-       OP_META_BEQZ            = 0x14,
-       OP_META_BNEZ            = 0x15,
-
        OP_META_MOV             = 0x16,
-       OP_META_SYNC            = 0x17,
+
+       OP_META_EXTC            = 0x17,
+       OP_META_EXTS            = 0x18,
 };
 
 enum special_opcodes {
@@ -195,18 +198,8 @@ struct opcode {
                struct opcode_j j;
        };
        u16 flags;
-       u16 offset;
-       struct opcode *next;
 };
 
-struct opcode * lightrec_disassemble(struct lightrec_state *state,
-                                    const u32 *src, unsigned int *len);
-void lightrec_free_opcode_list(struct lightrec_state *state,
-                              struct opcode *list);
-
-unsigned int lightrec_cycles_of_opcode(union code code);
-
-void lightrec_print_disassembly(const struct block *block,
-                               const u32 *code, unsigned int length);
+void lightrec_print_disassembly(const struct block *block, const u32 *code);
 
 #endif /* __DISASSEMBLER_H__ */
index 0cf75c3..99f6756 100644 (file)
@@ -1,61 +1,50 @@
+// SPDX-License-Identifier: LGPL-2.1-or-later
 /*
- * Copyright (C) 2014-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2014-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #include "blockcache.h"
 #include "debug.h"
 #include "disassembler.h"
 #include "emitter.h"
+#include "lightning-wrapper.h"
 #include "optimizer.h"
 #include "regcache.h"
 
-#include <lightning.h>
 #include <stdbool.h>
 #include <stddef.h>
 
-typedef void (*lightrec_rec_func_t)(const struct block *,
-                                   const struct opcode *, u32);
+typedef void (*lightrec_rec_func_t)(struct lightrec_cstate *, const struct block *, u16);
 
 /* Forward declarations */
-static void rec_SPECIAL(const struct block *block,
-                      const struct opcode *op, u32 pc);
-static void rec_REGIMM(const struct block *block,
-                     const struct opcode *op, u32 pc);
-static void rec_CP0(const struct block *block, const struct opcode *op, u32 pc);
-static void rec_CP2(const struct block *block, const struct opcode *op, u32 pc);
+static void rec_SPECIAL(struct lightrec_cstate *state, const struct block *block, u16 offset);
+static void rec_REGIMM(struct lightrec_cstate *state, const struct block *block, u16 offset);
+static void rec_CP0(struct lightrec_cstate *state, const struct block *block, u16 offset);
+static void rec_CP2(struct lightrec_cstate *state, const struct block *block, u16 offset);
 
-
-static void unknown_opcode(const struct block *block,
-                          const struct opcode *op, u32 pc)
+static void unknown_opcode(struct lightrec_cstate *state, const struct block *block, u16 offset)
 {
-       pr_warn("Unknown opcode: 0x%08x at PC 0x%08x\n", op->opcode, pc);
+       pr_warn("Unknown opcode: 0x%08x at PC 0x%08x\n",
+               block->opcode_list[offset].c.opcode,
+               block->pc + (offset << 2));
 }
 
-static void lightrec_emit_end_of_block(const struct block *block,
-                                      const struct opcode *op, u32 pc,
+static void lightrec_emit_end_of_block(struct lightrec_cstate *state,
+                                      const struct block *block, u16 offset,
                                       s8 reg_new_pc, u32 imm, u8 ra_reg,
                                       u32 link, bool update_cycles)
 {
-       struct lightrec_state *state = block->state;
        struct regcache *reg_cache = state->reg_cache;
        u32 cycles = state->cycles;
        jit_state_t *_jit = block->_jit;
+       const struct opcode *op = &block->opcode_list[offset],
+                           *next = &block->opcode_list[offset + 1];
 
        jit_note(__FILE__, __LINE__);
 
        if (link) {
                /* Update the $ra register */
-               u8 link_reg = lightrec_alloc_reg_out(reg_cache, _jit, ra_reg);
+               u8 link_reg = lightrec_alloc_reg_out(reg_cache, _jit, ra_reg, 0);
                jit_movi(link_reg, link);
                lightrec_free_reg(reg_cache, link_reg);
        }
@@ -69,11 +58,11 @@ static void lightrec_emit_end_of_block(const struct block *block,
 
        if (has_delay_slot(op->c) &&
            !(op->flags & (LIGHTREC_NO_DS | LIGHTREC_LOCAL_BRANCH))) {
-               cycles += lightrec_cycles_of_opcode(op->next->c);
+               cycles += lightrec_cycles_of_opcode(next->c);
 
                /* Recompile the delay slot */
-               if (op->next->c.opcode)
-                       lightrec_rec_opcode(block, op->next, pc + 4);
+               if (next->c.opcode)
+                       lightrec_rec_opcode(state, block, offset + 1);
        }
 
        /* Store back remaining registers */
@@ -86,91 +75,122 @@ static void lightrec_emit_end_of_block(const struct block *block,
                pr_debug("EOB: %u cycles\n", cycles);
        }
 
-       if (op->next && ((op->flags & LIGHTREC_NO_DS) || op->next->next))
+       if (offset + !!(op->flags & LIGHTREC_NO_DS) < block->nb_ops - 1)
                state->branches[state->nb_branches++] = jit_jmpi();
 }
 
-void lightrec_emit_eob(const struct block *block,
-                      const struct opcode *op, u32 pc)
+void lightrec_emit_eob(struct lightrec_cstate *state, const struct block *block,
+                      u16 offset, bool after_op)
 {
-       struct lightrec_state *state = block->state;
        struct regcache *reg_cache = state->reg_cache;
        jit_state_t *_jit = block->_jit;
+       union code c = block->opcode_list[offset].c;
+       u32 cycles = state->cycles;
+
+       if (!after_op)
+               cycles -= lightrec_cycles_of_opcode(c);
 
        lightrec_storeback_regs(reg_cache, _jit);
 
-       jit_movi(JIT_V0, pc);
-       jit_subi(LIGHTREC_REG_CYCLE, LIGHTREC_REG_CYCLE,
-                state->cycles - lightrec_cycles_of_opcode(op->c));
+       jit_movi(JIT_V0, block->pc + (offset << 2));
+       jit_subi(LIGHTREC_REG_CYCLE, LIGHTREC_REG_CYCLE, cycles);
 
        state->branches[state->nb_branches++] = jit_jmpi();
 }
 
-static void rec_special_JR(const struct block *block,
-                          const struct opcode *op, u32 pc)
+static u8 get_jr_jalr_reg(struct lightrec_cstate *state, const struct block *block, u16 offset)
 {
-       struct regcache *reg_cache = block->state->reg_cache;
+       struct regcache *reg_cache = state->reg_cache;
        jit_state_t *_jit = block->_jit;
+       const struct opcode *op = &block->opcode_list[offset],
+                           *next = &block->opcode_list[offset + 1];
        u8 rs = lightrec_request_reg_in(reg_cache, _jit, op->r.rs, JIT_V0);
 
-       _jit_name(block->_jit, __func__);
+       /* If the source register is already mapped to JIT_R0 or JIT_R1, and the
+        * delay slot is an I/O operation, unload the register, since JIT_R0 and
+        * JIT_R1 are explicitly used by the I/O opcode generators. */
+       if ((rs == JIT_R0 || rs == JIT_R1) &&
+           !(op->flags & LIGHTREC_NO_DS) &&
+           opcode_is_io(next->c) &&
+           !(next->flags & (LIGHTREC_NO_INVALIDATE | LIGHTREC_DIRECT_IO))) {
+               lightrec_unload_reg(reg_cache, _jit, rs);
+               lightrec_free_reg(reg_cache, rs);
+
+               rs = lightrec_request_reg_in(reg_cache, _jit, op->r.rs, JIT_V0);
+       }
+
        lightrec_lock_reg(reg_cache, _jit, rs);
-       lightrec_emit_end_of_block(block, op, pc, rs, 0, 31, 0, true);
+
+       return rs;
 }
 
-static void rec_special_JALR(const struct block *block,
-                            const struct opcode *op, u32 pc)
+static void rec_special_JR(struct lightrec_cstate *state, const struct block *block, u16 offset)
 {
-       struct regcache *reg_cache = block->state->reg_cache;
-       jit_state_t *_jit = block->_jit;
-       u8 rs = lightrec_request_reg_in(reg_cache, _jit, op->r.rs, JIT_V0);
+       u8 rs = get_jr_jalr_reg(state, block, offset);
 
        _jit_name(block->_jit, __func__);
-       lightrec_lock_reg(reg_cache, _jit, rs);
-       lightrec_emit_end_of_block(block, op, pc, rs, 0, op->r.rd, pc + 8, true);
+       lightrec_emit_end_of_block(state, block, offset, rs, 0, 31, 0, true);
+}
+
+static void rec_special_JALR(struct lightrec_cstate *state, const struct block *block, u16 offset)
+{
+       u8 rs = get_jr_jalr_reg(state, block, offset);
+       union code c = block->opcode_list[offset].c;
+
+       _jit_name(block->_jit, __func__);
+       lightrec_emit_end_of_block(state, block, offset, rs, 0, c.r.rd,
+                                  get_branch_pc(block, offset, 2), true);
 }
 
-static void rec_J(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_J(struct lightrec_cstate *state, const struct block *block, u16 offset)
 {
+       union code c = block->opcode_list[offset].c;
+
        _jit_name(block->_jit, __func__);
-       lightrec_emit_end_of_block(block, op, pc, -1,
-                                  (pc & 0xf0000000) | (op->j.imm << 2), 31, 0, true);
+       lightrec_emit_end_of_block(state, block, offset, -1,
+                                  (block->pc & 0xf0000000) | (c.j.imm << 2),
+                                  31, 0, true);
 }
 
-static void rec_JAL(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_JAL(struct lightrec_cstate *state, const struct block *block, u16 offset)
 {
+       union code c = block->opcode_list[offset].c;
+
        _jit_name(block->_jit, __func__);
-       lightrec_emit_end_of_block(block, op, pc, -1,
-                                  (pc & 0xf0000000) | (op->j.imm << 2),
-                                  31, pc + 8, true);
+       lightrec_emit_end_of_block(state, block, offset, -1,
+                                  (block->pc & 0xf0000000) | (c.j.imm << 2),
+                                  31, get_branch_pc(block, offset, 2), true);
 }
 
-static void rec_b(const struct block *block, const struct opcode *op, u32 pc,
+static void rec_b(struct lightrec_cstate *state, const struct block *block, u16 offset,
                  jit_code_t code, u32 link, bool unconditional, bool bz)
 {
-       struct regcache *reg_cache = block->state->reg_cache;
+       struct regcache *reg_cache = state->reg_cache;
        struct native_register *regs_backup;
        jit_state_t *_jit = block->_jit;
        struct lightrec_branch *branch;
+       const struct opcode *op = &block->opcode_list[offset],
+                           *next = &block->opcode_list[offset + 1];
        jit_node_t *addr;
        u8 link_reg;
-       u32 offset, cycles = block->state->cycles;
+       u32 target_offset, cycles = state->cycles;
        bool is_forward = (s16)op->i.imm >= -1;
+       u32 next_pc;
 
        jit_note(__FILE__, __LINE__);
 
        if (!(op->flags & LIGHTREC_NO_DS))
-               cycles += lightrec_cycles_of_opcode(op->next->c);
+               cycles += lightrec_cycles_of_opcode(next->c);
 
-       block->state->cycles = 0;
+       state->cycles = 0;
 
        if (cycles)
                jit_subi(LIGHTREC_REG_CYCLE, LIGHTREC_REG_CYCLE, cycles);
 
        if (!unconditional) {
-               u8 rs = lightrec_alloc_reg_in_ext(reg_cache, _jit, op->i.rs),
-                  rt = bz ? 0 : lightrec_alloc_reg_in_ext(reg_cache,
-                                                          _jit, op->i.rt);
+               u8 rs = lightrec_alloc_reg_in(reg_cache, _jit, op->i.rs, REG_EXT),
+                  rt = bz ? 0 : lightrec_alloc_reg_in(reg_cache,
+                                                      _jit, op->i.rt, REG_EXT);
 
                /* Generate the branch opcode */
                addr = jit_new_node_pww(code, NULL, rs, rt);
@@ -180,15 +200,15 @@ static void rec_b(const struct block *block, const struct opcode *op, u32 pc,
        }
 
        if (op->flags & LIGHTREC_LOCAL_BRANCH) {
-               if (op->next && !(op->flags & LIGHTREC_NO_DS)) {
+               if (next && !(op->flags & LIGHTREC_NO_DS)) {
                        /* Recompile the delay slot */
-                       if (op->next->opcode)
-                               lightrec_rec_opcode(block, op->next, pc + 4);
+                       if (next->opcode)
+                               lightrec_rec_opcode(state, block, offset + 1);
                }
 
                if (link) {
                        /* Update the $ra register */
-                       link_reg = lightrec_alloc_reg_out(reg_cache, _jit, 31);
+                       link_reg = lightrec_alloc_reg_out(reg_cache, _jit, 31, 0);
                        jit_movi(link_reg, link);
                        lightrec_free_reg(reg_cache, link_reg);
                }
@@ -196,12 +216,14 @@ static void rec_b(const struct block *block, const struct opcode *op, u32 pc,
                /* Store back remaining registers */
                lightrec_storeback_regs(reg_cache, _jit);
 
-               offset = op->offset + 1 + (s16)op->i.imm;
-               pr_debug("Adding local branch to offset 0x%x\n", offset << 2);
-               branch = &block->state->local_branches[
-                       block->state->nb_local_branches++];
+               target_offset = offset + 1 + (s16)op->i.imm
+                       - !!(OPT_SWITCH_DELAY_SLOTS && (op->flags & LIGHTREC_NO_DS));
+               pr_debug("Adding local branch to offset 0x%x\n",
+                        target_offset << 2);
+               branch = &state->local_branches[
+                       state->nb_local_branches++];
 
-               branch->target = offset;
+               branch->target = target_offset;
                if (is_forward)
                        branch->branch = jit_jmpi();
                else
@@ -209,8 +231,8 @@ static void rec_b(const struct block *block, const struct opcode *op, u32 pc,
        }
 
        if (!(op->flags & LIGHTREC_LOCAL_BRANCH) || !is_forward) {
-               lightrec_emit_end_of_block(block, op, pc, -1,
-                                          pc + 4 + ((s16)op->i.imm << 2),
+               next_pc = get_branch_pc(block, offset, 1 + (s16)op->i.imm);
+               lightrec_emit_end_of_block(state, block, offset, -1, next_pc,
                                           31, link, false);
        }
 
@@ -220,105 +242,127 @@ static void rec_b(const struct block *block, const struct opcode *op, u32 pc,
 
                if (bz && link) {
                        /* Update the $ra register */
-                       link_reg = lightrec_alloc_reg_out_ext(reg_cache,
-                                                             _jit, 31);
+                       link_reg = lightrec_alloc_reg_out(reg_cache, _jit,
+                                                         31, REG_EXT);
                        jit_movi(link_reg, (s32)link);
                        lightrec_free_reg(reg_cache, link_reg);
                }
 
-               if (!(op->flags & LIGHTREC_NO_DS) && op->next->opcode)
-                       lightrec_rec_opcode(block, op->next, pc + 4);
+               if (!(op->flags & LIGHTREC_NO_DS) && next->opcode)
+                       lightrec_rec_opcode(state, block, offset + 1);
        }
 }
 
-static void rec_BNE(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_BNE(struct lightrec_cstate *state,
+                   const struct block *block, u16 offset)
 {
+       union code c = block->opcode_list[offset].c;
+
        _jit_name(block->_jit, __func__);
-       rec_b(block, op, pc, jit_code_beqr, 0, false, false);
+
+       if (c.i.rt == 0)
+               rec_b(state, block, offset, jit_code_beqi, 0, false, true);
+       else
+               rec_b(state, block, offset, jit_code_beqr, 0, false, false);
 }
 
-static void rec_BEQ(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_BEQ(struct lightrec_cstate *state,
+                   const struct block *block, u16 offset)
 {
+       union code c = block->opcode_list[offset].c;
+
        _jit_name(block->_jit, __func__);
-       rec_b(block, op, pc, jit_code_bner, 0,
-                       op->i.rs == op->i.rt, false);
+
+       if (c.i.rt == 0)
+               rec_b(state, block, offset, jit_code_bnei, 0, c.i.rs == 0, true);
+       else
+               rec_b(state, block, offset, jit_code_bner, 0, c.i.rs == c.i.rt, false);
 }
 
-static void rec_BLEZ(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_BLEZ(struct lightrec_cstate *state,
+                    const struct block *block, u16 offset)
 {
+       union code c = block->opcode_list[offset].c;
+
        _jit_name(block->_jit, __func__);
-       rec_b(block, op, pc, jit_code_bgti, 0, op->i.rs == 0, true);
+       rec_b(state, block, offset, jit_code_bgti, 0, c.i.rs == 0, true);
 }
 
-static void rec_BGTZ(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_BGTZ(struct lightrec_cstate *state,
+                    const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_b(block, op, pc, jit_code_blei, 0, false, true);
+       rec_b(state, block, offset, jit_code_blei, 0, false, true);
 }
 
-static void rec_regimm_BLTZ(const struct block *block,
-                           const struct opcode *op, u32 pc)
+static void rec_regimm_BLTZ(struct lightrec_cstate *state,
+                           const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_b(block, op, pc, jit_code_bgei, 0, false, true);
+       rec_b(state, block, offset, jit_code_bgei, 0, false, true);
 }
 
-static void rec_regimm_BLTZAL(const struct block *block,
-                             const struct opcode *op, u32 pc)
+static void rec_regimm_BLTZAL(struct lightrec_cstate *state,
+                             const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_b(block, op, pc, jit_code_bgei, pc + 8, false, true);
+       rec_b(state, block, offset, jit_code_bgei,
+             get_branch_pc(block, offset, 2), false, true);
 }
 
-static void rec_regimm_BGEZ(const struct block *block,
-                           const struct opcode *op, u32 pc)
+static void rec_regimm_BGEZ(struct lightrec_cstate *state,
+                           const struct block *block, u16 offset)
 {
+       union code c = block->opcode_list[offset].c;
+
        _jit_name(block->_jit, __func__);
-       rec_b(block, op, pc, jit_code_blti, 0, !op->i.rs, true);
+       rec_b(state, block, offset, jit_code_blti, 0, !c.i.rs, true);
 }
 
-static void rec_regimm_BGEZAL(const struct block *block,
-                             const struct opcode *op, u32 pc)
+static void rec_regimm_BGEZAL(struct lightrec_cstate *state,
+                             const struct block *block, u16 offset)
 {
+       const struct opcode *op = &block->opcode_list[offset];
        _jit_name(block->_jit, __func__);
-       rec_b(block, op, pc, jit_code_blti, pc + 8, !op->i.rs, true);
+       rec_b(state, block, offset, jit_code_blti,
+             get_branch_pc(block, offset, 2),
+             !op->i.rs, true);
 }
 
-static void rec_alu_imm(const struct block *block, const struct opcode *op,
-                       jit_code_t code, bool sign_extend)
+static void rec_alu_imm(struct lightrec_cstate *state, const struct block *block,
+                       u16 offset, jit_code_t code, bool slti)
 {
-       struct regcache *reg_cache = block->state->reg_cache;
+       struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
        jit_state_t *_jit = block->_jit;
-       u8 rs, rt;
+       u8 rs, rt, out_flags = REG_EXT;
+
+       if (slti)
+               out_flags |= REG_ZEXT;
 
        jit_note(__FILE__, __LINE__);
-       rs = lightrec_alloc_reg_in_ext(reg_cache, _jit, op->i.rs);
-       rt = lightrec_alloc_reg_out_ext(reg_cache, _jit, op->i.rt);
+       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, REG_EXT);
+       rt = lightrec_alloc_reg_out(reg_cache, _jit, c.i.rt, out_flags);
 
-       if (sign_extend)
-               jit_new_node_www(code, rt, rs, (s32)(s16) op->i.imm);
-       else
-               jit_new_node_www(code, rt, rs, (u32)(u16) op->i.imm);
+       jit_new_node_www(code, rt, rs, (s32)(s16) c.i.imm);
 
        lightrec_free_reg(reg_cache, rs);
        lightrec_free_reg(reg_cache, rt);
 }
 
-static void rec_alu_special(const struct block *block, const struct opcode *op,
-                           jit_code_t code, bool out_ext)
+static void rec_alu_special(struct lightrec_cstate *state, const struct block *block,
+                           u16 offset, jit_code_t code, bool out_ext)
 {
-       struct regcache *reg_cache = block->state->reg_cache;
+       struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
        jit_state_t *_jit = block->_jit;
        u8 rd, rt, rs;
 
        jit_note(__FILE__, __LINE__);
-       rs = lightrec_alloc_reg_in_ext(reg_cache, _jit, op->r.rs);
-       rt = lightrec_alloc_reg_in_ext(reg_cache, _jit, op->r.rt);
-
-       if (out_ext)
-          rd = lightrec_alloc_reg_out_ext(reg_cache, _jit, op->r.rd);
-       else
-          rd = lightrec_alloc_reg_out(reg_cache, _jit, op->r.rd);
+       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rs, REG_EXT);
+       rt = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rt, REG_EXT);
+       rd = lightrec_alloc_reg_out(reg_cache, _jit, c.r.rd,
+                                   out_ext ? REG_EXT | REG_ZEXT : 0);
 
        jit_new_node_www(code, rd, rs, rt);
 
@@ -327,539 +371,698 @@ static void rec_alu_special(const struct block *block, const struct opcode *op,
        lightrec_free_reg(reg_cache, rd);
 }
 
-static void rec_alu_shiftv(const struct block *block,
-                          const struct opcode *op, jit_code_t code)
+static void rec_alu_shiftv(struct lightrec_cstate *state, const struct block *block,
+                          u16 offset, jit_code_t code)
 {
-       struct regcache *reg_cache = block->state->reg_cache;
+       struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
        jit_state_t *_jit = block->_jit;
-       u8 rd, rt, rs, temp;
+       u8 rd, rt, rs, temp, flags = 0;
 
        jit_note(__FILE__, __LINE__);
-       rs = lightrec_alloc_reg_in(reg_cache, _jit, op->r.rs);
-       temp = lightrec_alloc_reg_temp(reg_cache, _jit);
+       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rs, 0);
 
-       if (code == jit_code_rshr) {
-               rt = lightrec_alloc_reg_in_ext(reg_cache, _jit, op->r.rt);
-               rd = lightrec_alloc_reg_out_ext(reg_cache, _jit, op->r.rd);
-       } else {
-               rt = lightrec_alloc_reg_in(reg_cache, _jit, op->r.rt);
-               rd = lightrec_alloc_reg_out(reg_cache, _jit, op->r.rd);
-       }
+       if (code == jit_code_rshr)
+               flags = REG_EXT;
+       else if (code == jit_code_rshr_u)
+               flags = REG_ZEXT;
 
-       jit_andi(temp, rs, 0x1f);
-
-#if __WORDSIZE == 64
-       if (code == jit_code_rshr_u) {
-               jit_extr_ui(rd, rt);
-               jit_new_node_www(code, rd, rd, temp);
-       }
-#endif
+       rt = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rt, flags);
+       rd = lightrec_alloc_reg_out(reg_cache, _jit, c.r.rd, flags);
 
-       if (__WORDSIZE == 32 || code != jit_code_rshr_u)
+       if (rs != rd && rt != rd) {
+               jit_andi(rd, rs, 0x1f);
+               jit_new_node_www(code, rd, rt, rd);
+       } else {
+               temp = lightrec_alloc_reg_temp(reg_cache, _jit);
+               jit_andi(temp, rs, 0x1f);
                jit_new_node_www(code, rd, rt, temp);
+               lightrec_free_reg(reg_cache, temp);
+       }
 
        lightrec_free_reg(reg_cache, rs);
-       lightrec_free_reg(reg_cache, temp);
        lightrec_free_reg(reg_cache, rt);
        lightrec_free_reg(reg_cache, rd);
 }
 
-static void rec_ADDIU(const struct block *block,
-                     const struct opcode *op, u32 pc)
+static void rec_ADDIU(struct lightrec_cstate *state,
+                     const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_imm(block, op, jit_code_addi, true);
+       rec_alu_imm(state, block, offset, jit_code_addi, false);
 }
 
-static void rec_ADDI(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_ADDI(struct lightrec_cstate *state,
+                    const struct block *block, u16 offset)
 {
        /* TODO: Handle the exception? */
        _jit_name(block->_jit, __func__);
-       rec_alu_imm(block, op, jit_code_addi, true);
+       rec_alu_imm(state, block, offset, jit_code_addi, false);
 }
 
-static void rec_SLTIU(const struct block *block,
-                     const struct opcode *op, u32 pc)
+static void rec_SLTIU(struct lightrec_cstate *state,
+                     const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_imm(block, op, jit_code_lti_u, true);
+       rec_alu_imm(state, block, offset, jit_code_lti_u, true);
 }
 
-static void rec_SLTI(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_SLTI(struct lightrec_cstate *state,
+                    const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_imm(block, op, jit_code_lti, true);
+       rec_alu_imm(state, block, offset, jit_code_lti, true);
 }
 
-static void rec_ANDI(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_ANDI(struct lightrec_cstate *state,
+                    const struct block *block, u16 offset)
 {
-       struct regcache *reg_cache = block->state->reg_cache;
+       struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
        jit_state_t *_jit = block->_jit;
        u8 rs, rt;
 
        _jit_name(block->_jit, __func__);
        jit_note(__FILE__, __LINE__);
-       rs = lightrec_alloc_reg_in(reg_cache, _jit, op->i.rs);
-       rt = lightrec_alloc_reg_out_ext(reg_cache, _jit, op->i.rt);
+       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0);
+       rt = lightrec_alloc_reg_out(reg_cache, _jit, c.i.rt,
+                                   REG_EXT | REG_ZEXT);
 
        /* PSX code uses ANDI 0xff / ANDI 0xffff a lot, which are basically
         * casts to uint8_t / uint16_t. */
-       if (op->i.imm == 0xff)
+       if (c.i.imm == 0xff)
                jit_extr_uc(rt, rs);
-       else if (op->i.imm == 0xffff)
+       else if (c.i.imm == 0xffff)
                jit_extr_us(rt, rs);
        else
-               jit_andi(rt, rs, (u32)(u16) op->i.imm);
+               jit_andi(rt, rs, (u32)(u16) c.i.imm);
+
+       lightrec_free_reg(reg_cache, rs);
+       lightrec_free_reg(reg_cache, rt);
+}
+
+static void rec_alu_or_xor(struct lightrec_cstate *state, const struct block *block,
+                          u16 offset, jit_code_t code)
+{
+       struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
+       jit_state_t *_jit = block->_jit;
+       u8 rs, rt, flags;
+
+       jit_note(__FILE__, __LINE__);
+       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0);
+       rt = lightrec_alloc_reg_out(reg_cache, _jit, c.i.rt, 0);
+
+       flags = lightrec_get_reg_in_flags(reg_cache, rs);
+       lightrec_set_reg_out_flags(reg_cache, rt, flags);
+
+       jit_new_node_www(code, rt, rs, (u32)(u16) c.i.imm);
 
        lightrec_free_reg(reg_cache, rs);
        lightrec_free_reg(reg_cache, rt);
 }
 
-static void rec_ORI(const struct block *block, const struct opcode *op, u32 pc)
+
+static void rec_ORI(struct lightrec_cstate *state,
+                   const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_imm(block, op, jit_code_ori, false);
+       rec_alu_or_xor(state, block, offset, jit_code_ori);
 }
 
-static void rec_XORI(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_XORI(struct lightrec_cstate *state,
+                    const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_imm(block, op, jit_code_xori, false);
+       rec_alu_or_xor(state, block, offset, jit_code_xori);
 }
 
-static void rec_LUI(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_LUI(struct lightrec_cstate *state,
+                   const struct block *block, u16 offset)
 {
-       struct regcache *reg_cache = block->state->reg_cache;
+       struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
        jit_state_t *_jit = block->_jit;
-       u8 rt;
+       u8 rt, flags = REG_EXT;
 
        jit_name(__func__);
        jit_note(__FILE__, __LINE__);
-       rt = lightrec_alloc_reg_out_ext(reg_cache, _jit, op->i.rt);
 
-       jit_movi(rt, (s32)(op->i.imm << 16));
+       if (!(c.i.imm & BIT(15)))
+               flags |= REG_ZEXT;
+
+       rt = lightrec_alloc_reg_out(reg_cache, _jit, c.i.rt, flags);
+
+       jit_movi(rt, (s32)(c.i.imm << 16));
 
        lightrec_free_reg(reg_cache, rt);
 }
 
-static void rec_special_ADDU(const struct block *block,
-                            const struct opcode *op, u32 pc)
+static void rec_special_ADDU(struct lightrec_cstate *state,
+                            const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_special(block, op, jit_code_addr, false);
+       rec_alu_special(state, block, offset, jit_code_addr, false);
 }
 
-static void rec_special_ADD(const struct block *block,
-                           const struct opcode *op, u32 pc)
+static void rec_special_ADD(struct lightrec_cstate *state,
+                           const struct block *block, u16 offset)
 {
        /* TODO: Handle the exception? */
        _jit_name(block->_jit, __func__);
-       rec_alu_special(block, op, jit_code_addr, false);
+       rec_alu_special(state, block, offset, jit_code_addr, false);
 }
 
-static void rec_special_SUBU(const struct block *block,
-                            const struct opcode *op, u32 pc)
+static void rec_special_SUBU(struct lightrec_cstate *state,
+                            const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_special(block, op, jit_code_subr, false);
+       rec_alu_special(state, block, offset, jit_code_subr, false);
 }
 
-static void rec_special_SUB(const struct block *block,
-                           const struct opcode *op, u32 pc)
+static void rec_special_SUB(struct lightrec_cstate *state,
+                           const struct block *block, u16 offset)
 {
        /* TODO: Handle the exception? */
        _jit_name(block->_jit, __func__);
-       rec_alu_special(block, op, jit_code_subr, false);
+       rec_alu_special(state, block, offset, jit_code_subr, false);
 }
 
-static void rec_special_AND(const struct block *block,
-                           const struct opcode *op, u32 pc)
+static void rec_special_AND(struct lightrec_cstate *state,
+                           const struct block *block, u16 offset)
 {
+       struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
+       jit_state_t *_jit = block->_jit;
+       u8 rd, rt, rs, flags_rs, flags_rt, flags_rd;
+
        _jit_name(block->_jit, __func__);
-       rec_alu_special(block, op, jit_code_andr, false);
+       jit_note(__FILE__, __LINE__);
+       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rs, 0);
+       rt = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rt, 0);
+       rd = lightrec_alloc_reg_out(reg_cache, _jit, c.r.rd, 0);
+
+       flags_rs = lightrec_get_reg_in_flags(reg_cache, rs);
+       flags_rt = lightrec_get_reg_in_flags(reg_cache, rt);
+
+       /* Z(rd) = Z(rs) | Z(rt) */
+       flags_rd = REG_ZEXT & (flags_rs | flags_rt);
+       /* E(rd) = (E(rt) & Z(rt)) | (E(rs) & Z(rs)) | (E(rs) & E(rt))
+        * i.e. one operand is known to fit in 31 bits, or both are sign-extended. */
+       if (((flags_rt & REG_EXT) && (flags_rt & REG_ZEXT)) ||
+           ((flags_rs & REG_EXT) && (flags_rs & REG_ZEXT)) ||
+           (REG_EXT & flags_rs & flags_rt))
+               flags_rd |= REG_EXT;
+
+       lightrec_set_reg_out_flags(reg_cache, rd, flags_rd);
+
+       jit_andr(rd, rs, rt);
+
+       lightrec_free_reg(reg_cache, rs);
+       lightrec_free_reg(reg_cache, rt);
+       lightrec_free_reg(reg_cache, rd);
 }
 
-static void rec_special_OR(const struct block *block,
-                          const struct opcode *op, u32 pc)
+static void rec_special_or_nor(struct lightrec_cstate *state,
+                              const struct block *block, u16 offset, bool nor)
+{
+       struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
+       jit_state_t *_jit = block->_jit;
+       u8 rd, rt, rs, flags_rs, flags_rt, flags_rd = 0;
+
+       jit_note(__FILE__, __LINE__);
+       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rs, 0);
+       rt = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rt, 0);
+       rd = lightrec_alloc_reg_out(reg_cache, _jit, c.r.rd, 0);
+
+       flags_rs = lightrec_get_reg_in_flags(reg_cache, rs);
+       flags_rt = lightrec_get_reg_in_flags(reg_cache, rt);
+
+       /* or: Z(rd) = Z(rs) & Z(rt)
+        * nor: Z(rd) = 0 */
+       if (!nor)
+               flags_rd = REG_ZEXT & flags_rs & flags_rt;
+
+       /* E(rd) = E(rs) & E(rt)
+        * A lone sign-extended operand proves nothing about the upper bits of
+        * the result, as the other operand's upper bits are then unknown. */
+       if (REG_EXT & flags_rs & flags_rt)
+               flags_rd |= REG_EXT;
+
+       lightrec_set_reg_out_flags(reg_cache, rd, flags_rd);
+
+       jit_orr(rd, rs, rt);
+
+       if (nor)
+               jit_comr(rd, rd);
+
+       lightrec_free_reg(reg_cache, rs);
+       lightrec_free_reg(reg_cache, rt);
+       lightrec_free_reg(reg_cache, rd);
+}
+
+static void rec_special_OR(struct lightrec_cstate *state,
+                          const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_special(block, op, jit_code_orr, false);
+       rec_special_or_nor(state, block, offset, false);
 }
 
-static void rec_special_XOR(const struct block *block,
-                           const struct opcode *op, u32 pc)
+static void rec_special_NOR(struct lightrec_cstate *state,
+                           const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_special(block, op, jit_code_xorr, false);
+       rec_special_or_nor(state, block, offset, true);
 }
 
-static void rec_special_NOR(const struct block *block,
-                           const struct opcode *op, u32 pc)
+static void rec_special_XOR(struct lightrec_cstate *state,
+                           const struct block *block, u16 offset)
 {
-       struct regcache *reg_cache = block->state->reg_cache;
+       struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
        jit_state_t *_jit = block->_jit;
-       u8 rd;
+       u8 rd, rt, rs, flags_rs, flags_rt, flags_rd;
 
-       jit_name(__func__);
-       rec_alu_special(block, op, jit_code_orr, false);
-       rd = lightrec_alloc_reg_out(reg_cache, _jit, op->r.rd);
+       _jit_name(block->_jit, __func__);
+
+       jit_note(__FILE__, __LINE__);
+       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rs, 0);
+       rt = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rt, 0);
+       rd = lightrec_alloc_reg_out(reg_cache, _jit, c.r.rd, 0);
 
-       jit_comr(rd, rd);
+       flags_rs = lightrec_get_reg_in_flags(reg_cache, rs);
+       flags_rt = lightrec_get_reg_in_flags(reg_cache, rt);
 
+       /* Z(rd) = Z(rs) & Z(rt) */
+       flags_rd = REG_ZEXT & flags_rs & flags_rt;
+
+       /* E(rd) = E(rs) & E(rt) */
+       flags_rd |= REG_EXT & flags_rs & flags_rt;
+
+       lightrec_set_reg_out_flags(reg_cache, rd, flags_rd);
+
+       jit_xorr(rd, rs, rt);
+
+       lightrec_free_reg(reg_cache, rs);
+       lightrec_free_reg(reg_cache, rt);
        lightrec_free_reg(reg_cache, rd);
 }
 
-static void rec_special_SLTU(const struct block *block,
-                            const struct opcode *op, u32 pc)
+static void rec_special_SLTU(struct lightrec_cstate *state,
+                            const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_special(block, op, jit_code_ltr_u, true);
+       rec_alu_special(state, block, offset, jit_code_ltr_u, true);
 }
 
-static void rec_special_SLT(const struct block *block,
-                           const struct opcode *op, u32 pc)
+static void rec_special_SLT(struct lightrec_cstate *state,
+                           const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_special(block, op, jit_code_ltr, true);
+       rec_alu_special(state, block, offset, jit_code_ltr, true);
 }
 
-static void rec_special_SLLV(const struct block *block,
-                            const struct opcode *op, u32 pc)
+static void rec_special_SLLV(struct lightrec_cstate *state,
+                            const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_shiftv(block, op, jit_code_lshr);
+       rec_alu_shiftv(state, block, offset, jit_code_lshr);
 }
 
-static void rec_special_SRLV(const struct block *block,
-                            const struct opcode *op, u32 pc)
+static void rec_special_SRLV(struct lightrec_cstate *state,
+                            const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_shiftv(block, op, jit_code_rshr_u);
+       rec_alu_shiftv(state, block, offset, jit_code_rshr_u);
 }
 
-static void rec_special_SRAV(const struct block *block,
-                            const struct opcode *op, u32 pc)
+static void rec_special_SRAV(struct lightrec_cstate *state,
+                            const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_shiftv(block, op, jit_code_rshr);
+       rec_alu_shiftv(state, block, offset, jit_code_rshr);
 }
 
-static void rec_alu_shift(const struct block *block,
-                         const struct opcode *op, jit_code_t code)
+static void rec_alu_shift(struct lightrec_cstate *state, const struct block *block,
+                         u16 offset, jit_code_t code)
 {
-       struct regcache *reg_cache = block->state->reg_cache;
+       struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
        jit_state_t *_jit = block->_jit;
-       u8 rd, rt;
+       u8 rd, rt, flags = 0;
 
        jit_note(__FILE__, __LINE__);
 
-       if (code == jit_code_rshi) {
-               rt = lightrec_alloc_reg_in_ext(reg_cache, _jit, op->r.rt);
-               rd = lightrec_alloc_reg_out_ext(reg_cache, _jit, op->r.rd);
-       } else {
-               rt = lightrec_alloc_reg_in(reg_cache, _jit, op->r.rt);
-               rd = lightrec_alloc_reg_out(reg_cache, _jit, op->r.rd);
-       }
+       if (code == jit_code_rshi)
+               flags = REG_EXT;
+       else if (code == jit_code_rshi_u)
+               flags = REG_ZEXT;
 
-#if __WORDSIZE == 64
-       if (code == jit_code_rshi_u) {
-               jit_extr_ui(rd, rt);
-               jit_new_node_www(code, rd, rd, op->r.imm);
-       }
-#endif
-       if (__WORDSIZE == 32 || code != jit_code_rshi_u)
-               jit_new_node_www(code, rd, rt, op->r.imm);
+       rt = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rt, flags);
+
+       /* Input reg is zero-extended, if we SRL at least by one bit, we know
+        * the output reg will be both zero-extended and sign-extended. */
+       if (code == jit_code_rshi_u && c.r.imm)
+               flags |= REG_EXT;
+       rd = lightrec_alloc_reg_out(reg_cache, _jit, c.r.rd, flags);
+
+       jit_new_node_www(code, rd, rt, c.r.imm);
 
        lightrec_free_reg(reg_cache, rt);
        lightrec_free_reg(reg_cache, rd);
 }
 
-static void rec_special_SLL(const struct block *block,
-                           const struct opcode *op, u32 pc)
+static void rec_special_SLL(struct lightrec_cstate *state,
+                           const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_shift(block, op, jit_code_lshi);
+       rec_alu_shift(state, block, offset, jit_code_lshi);
 }
 
-static void rec_special_SRL(const struct block *block,
-                           const struct opcode *op, u32 pc)
+static void rec_special_SRL(struct lightrec_cstate *state,
+                           const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_shift(block, op, jit_code_rshi_u);
+       rec_alu_shift(state, block, offset, jit_code_rshi_u);
 }
 
-static void rec_special_SRA(const struct block *block,
-                           const struct opcode *op, u32 pc)
+static void rec_special_SRA(struct lightrec_cstate *state,
+                           const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_shift(block, op, jit_code_rshi);
+       rec_alu_shift(state, block, offset, jit_code_rshi);
 }
 
-static void rec_alu_mult(const struct block *block,
-                        const struct opcode *op, bool is_signed)
+static void rec_alu_mult(struct lightrec_cstate *state,
+                        const struct block *block, u16 offset, bool is_signed)
 {
-       struct regcache *reg_cache = block->state->reg_cache;
+       struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
+       u16 flags = block->opcode_list[offset].flags;
+       u8 reg_lo = get_mult_div_lo(c);
+       u8 reg_hi = get_mult_div_hi(c);
        jit_state_t *_jit = block->_jit;
-       u8 lo, hi, rs, rt;
+       u8 lo, hi, rs, rt, rflags = 0;
 
        jit_note(__FILE__, __LINE__);
 
-       lo = lightrec_alloc_reg_out(reg_cache, _jit, REG_LO);
-       if (!(op->flags & LIGHTREC_MULT32))
-               hi = lightrec_alloc_reg_out_ext(reg_cache, _jit, REG_HI);
-       else if (__WORDSIZE == 64)
-               hi = lightrec_alloc_reg_temp(reg_cache, _jit);
-
-       if (__WORDSIZE == 32 || !is_signed) {
-               rs = lightrec_alloc_reg_in(reg_cache, _jit, op->r.rs);
-               rt = lightrec_alloc_reg_in(reg_cache, _jit, op->r.rt);
+       if (is_signed)
+               rflags = REG_EXT;
+       else
+               rflags = REG_ZEXT;
+
+       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rs, rflags);
+       rt = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rt, rflags);
+
+       if (!(flags & LIGHTREC_NO_LO))
+               lo = lightrec_alloc_reg_out(reg_cache, _jit, reg_lo, 0);
+       else if (__WORDSIZE == 32)
+               lo = lightrec_alloc_reg_temp(reg_cache, _jit);
+
+       if (!(flags & LIGHTREC_NO_HI))
+               hi = lightrec_alloc_reg_out(reg_cache, _jit, reg_hi, REG_EXT);
+
+       if (__WORDSIZE == 32) {
+               /* On 32-bit systems, do a 32*32->64 bit operation, or a 32*32->32 bit
+                * operation if the MULT was detected a 32-bit only. */
+               if (!(flags & LIGHTREC_NO_HI)) {
+                       if (is_signed)
+                               jit_qmulr(lo, hi, rs, rt);
+                       else
+                               jit_qmulr_u(lo, hi, rs, rt);
+               } else {
+                       jit_mulr(lo, rs, rt);
+               }
        } else {
-               rs = lightrec_alloc_reg_in_ext(reg_cache, _jit, op->r.rs);
-               rt = lightrec_alloc_reg_in_ext(reg_cache, _jit, op->r.rt);
-       }
+               /* On 64-bit systems, do a 64*64->64 bit operation. */
+               if (flags & LIGHTREC_NO_LO) {
+                       jit_mulr(hi, rs, rt);
+                       jit_rshi(hi, hi, 32);
+               } else {
+                       jit_mulr(lo, rs, rt);
 
-#if __WORDSIZE == 32
-       /* On 32-bit systems, do a 32*32->64 bit operation, or a 32*32->32 bit
-        * operation if the MULT was detected a 32-bit only. */
-       if (!(op->flags & LIGHTREC_MULT32)) {
-               if (is_signed)
-                       jit_qmulr(lo, hi, rs, rt);
-               else
-                       jit_qmulr_u(lo, hi, rs, rt);
-       } else {
-               jit_mulr(lo, rs, rt);
-       }
-#else
-       /* On 64-bit systems, do a 64*64->64 bit operation.
-        * The input registers must be 32 bits, so we first sign-extend (if
-        * mult) or clear (if multu) the input registers. */
-       if (is_signed) {
-               jit_mulr(lo, rs, rt);
-       } else {
-               jit_extr_ui(lo, rt);
-               jit_extr_ui(hi, rs);
-               jit_mulr(lo, hi, lo);
+                       /* The 64-bit output value is in $lo, store the upper 32 bits in $hi */
+                       if (!(flags & LIGHTREC_NO_HI))
+                               jit_rshi(hi, lo, 32);
+               }
        }
 
-       /* The 64-bit output value is in $lo, store the upper 32 bits in $hi */
-       if (!(op->flags & LIGHTREC_MULT32))
-               jit_rshi(hi, lo, 32);
-#endif
-
        lightrec_free_reg(reg_cache, rs);
        lightrec_free_reg(reg_cache, rt);
-       lightrec_free_reg(reg_cache, lo);
-       if (__WORDSIZE == 64 || !(op->flags & LIGHTREC_MULT32))
+       if (!(flags & LIGHTREC_NO_LO) || __WORDSIZE == 32)
+               lightrec_free_reg(reg_cache, lo);
+       if (!(flags & LIGHTREC_NO_HI))
                lightrec_free_reg(reg_cache, hi);
 }
 
-static void rec_alu_div(const struct block *block,
-                       const struct opcode *op, bool is_signed)
+static void rec_alu_div(struct lightrec_cstate *state,
+                       const struct block *block, u16 offset, bool is_signed)
 {
-       struct regcache *reg_cache = block->state->reg_cache;
+       struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
+       u16 flags = block->opcode_list[offset].flags;
+       bool no_check = flags & LIGHTREC_NO_DIV_CHECK;
+       u8 reg_lo = get_mult_div_lo(c);
+       u8 reg_hi = get_mult_div_hi(c);
        jit_state_t *_jit = block->_jit;
        jit_node_t *branch, *to_end;
-       u8 lo, hi, rs, rt;
+       u8 lo, hi, rs, rt, rflags = 0;
 
        jit_note(__FILE__, __LINE__);
-       lo = lightrec_alloc_reg_out(reg_cache, _jit, REG_LO);
-       hi = lightrec_alloc_reg_out(reg_cache, _jit, REG_HI);
 
-       if (__WORDSIZE == 32 || !is_signed) {
-               rs = lightrec_alloc_reg_in(reg_cache, _jit, op->r.rs);
-               rt = lightrec_alloc_reg_in(reg_cache, _jit, op->r.rt);
-       } else {
-               rs = lightrec_alloc_reg_in_ext(reg_cache, _jit, op->r.rs);
-               rt = lightrec_alloc_reg_in_ext(reg_cache, _jit, op->r.rt);
-       }
+       if (is_signed)
+               rflags = REG_EXT;
+       else
+               rflags = REG_ZEXT;
+
+       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rs, rflags);
+       rt = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rt, rflags);
+
+       if (!(flags & LIGHTREC_NO_LO))
+               lo = lightrec_alloc_reg_out(reg_cache, _jit, reg_lo, 0);
+
+       if (!(flags & LIGHTREC_NO_HI))
+               hi = lightrec_alloc_reg_out(reg_cache, _jit, reg_hi, 0);
 
        /* Jump to special handler if dividing by zero  */
-       branch = jit_beqi(rt, 0);
+       if (!no_check)
+               branch = jit_beqi(rt, 0);
 
-#if __WORDSIZE == 32
-       if (is_signed)
-               jit_qdivr(lo, hi, rs, rt);
-       else
-               jit_qdivr_u(lo, hi, rs, rt);
-#else
-       /* On 64-bit systems, the input registers must be 32 bits, so we first sign-extend
-        * (if div) or clear (if divu) the input registers. */
-       if (is_signed) {
-               jit_qdivr(lo, hi, rs, rt);
+       if (flags & LIGHTREC_NO_LO) {
+               if (is_signed)
+                       jit_remr(hi, rs, rt);
+               else
+                       jit_remr_u(hi, rs, rt);
+       } else if (flags & LIGHTREC_NO_HI) {
+               if (is_signed)
+                       jit_divr(lo, rs, rt);
+               else
+                       jit_divr_u(lo, rs, rt);
        } else {
-               jit_extr_ui(lo, rt);
-               jit_extr_ui(hi, rs);
-               jit_qdivr_u(lo, hi, hi, lo);
+               if (is_signed)
+                       jit_qdivr(lo, hi, rs, rt);
+               else
+                       jit_qdivr_u(lo, hi, rs, rt);
        }
-#endif
 
-       /* Jump above the div-by-zero handler */
-       to_end = jit_jmpi();
+       if (!no_check) {
+               lightrec_regcache_mark_live(reg_cache, _jit);
 
-       jit_patch(branch);
+               /* Jump above the div-by-zero handler */
+               to_end = jit_jmpi();
 
-       if (is_signed) {
-               jit_lti(lo, rs, 0);
-               jit_lshi(lo, lo, 1);
-               jit_subi(lo, lo, 1);
-       } else {
-               jit_movi(lo, 0xffffffff);
-       }
+               jit_patch(branch);
+
+               if (!(flags & LIGHTREC_NO_LO)) {
+                       if (is_signed) {
+                               jit_lti(lo, rs, 0);
+                               jit_lshi(lo, lo, 1);
+                               jit_subi(lo, lo, 1);
+                       } else {
+                               jit_movi(lo, 0xffffffff);
+                       }
+               }
 
-       jit_movr(hi, rs);
+               if (!(flags & LIGHTREC_NO_HI))
+                       jit_movr(hi, rs);
 
-       jit_patch(to_end);
+               jit_patch(to_end);
+       }
 
        lightrec_free_reg(reg_cache, rs);
        lightrec_free_reg(reg_cache, rt);
-       lightrec_free_reg(reg_cache, lo);
-       lightrec_free_reg(reg_cache, hi);
+
+       if (!(flags & LIGHTREC_NO_LO))
+               lightrec_free_reg(reg_cache, lo);
+
+       if (!(flags & LIGHTREC_NO_HI))
+               lightrec_free_reg(reg_cache, hi);
 }
 
-static void rec_special_MULT(const struct block *block,
-                            const struct opcode *op, u32 pc)
+static void rec_special_MULT(struct lightrec_cstate *state,
+                            const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_mult(block, op, true);
+       rec_alu_mult(state, block, offset, true);
 }
 
-static void rec_special_MULTU(const struct block *block,
-                             const struct opcode *op, u32 pc)
+static void rec_special_MULTU(struct lightrec_cstate *state,
+                             const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_mult(block, op, false);
+       rec_alu_mult(state, block, offset, false);
 }
 
-static void rec_special_DIV(const struct block *block,
-                           const struct opcode *op, u32 pc)
+static void rec_special_DIV(struct lightrec_cstate *state,
+                           const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_div(block, op, true);
+       rec_alu_div(state, block, offset, true);
 }
 
-static void rec_special_DIVU(const struct block *block,
-                            const struct opcode *op, u32 pc)
+static void rec_special_DIVU(struct lightrec_cstate *state,
+                            const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_alu_div(block, op, false);
+       rec_alu_div(state, block, offset, false);
 }
 
-static void rec_alu_mv_lo_hi(const struct block *block, u8 dst, u8 src)
+static void rec_alu_mv_lo_hi(struct lightrec_cstate *state,
+                            const struct block *block, u8 dst, u8 src)
 {
-       struct regcache *reg_cache = block->state->reg_cache;
+       struct regcache *reg_cache = state->reg_cache;
        jit_state_t *_jit = block->_jit;
 
        jit_note(__FILE__, __LINE__);
-       src = lightrec_alloc_reg_in(reg_cache, _jit, src);
-       dst = lightrec_alloc_reg_out_ext(reg_cache, _jit, dst);
+       src = lightrec_alloc_reg_in(reg_cache, _jit, src, 0);
+       dst = lightrec_alloc_reg_out(reg_cache, _jit, dst, REG_EXT);
 
-#if __WORDSIZE == 32
-       jit_movr(dst, src);
-#else
        jit_extr_i(dst, src);
-#endif
 
        lightrec_free_reg(reg_cache, src);
        lightrec_free_reg(reg_cache, dst);
 }
 
-static void rec_special_MFHI(const struct block *block,
-                            const struct opcode *op, u32 pc)
+static void rec_special_MFHI(struct lightrec_cstate *state,
+                            const struct block *block, u16 offset)
 {
+       union code c = block->opcode_list[offset].c;
+
        _jit_name(block->_jit, __func__);
-       rec_alu_mv_lo_hi(block, op->r.rd, REG_HI);
+       rec_alu_mv_lo_hi(state, block, c.r.rd, REG_HI);
 }
 
-static void rec_special_MTHI(const struct block *block,
-                            const struct opcode *op, u32 pc)
+static void rec_special_MTHI(struct lightrec_cstate *state,
+                            const struct block *block, u16 offset)
 {
+       union code c = block->opcode_list[offset].c;
+
        _jit_name(block->_jit, __func__);
-       rec_alu_mv_lo_hi(block, REG_HI, op->r.rs);
+       rec_alu_mv_lo_hi(state, block, REG_HI, c.r.rs);
 }
 
-static void rec_special_MFLO(const struct block *block,
-                            const struct opcode *op, u32 pc)
+static void rec_special_MFLO(struct lightrec_cstate *state,
+                            const struct block *block, u16 offset)
 {
+       union code c = block->opcode_list[offset].c;
+
        _jit_name(block->_jit, __func__);
-       rec_alu_mv_lo_hi(block, op->r.rd, REG_LO);
+       rec_alu_mv_lo_hi(state, block, c.r.rd, REG_LO);
 }
 
-static void rec_special_MTLO(const struct block *block,
-                            const struct opcode *op, u32 pc)
+static void rec_special_MTLO(struct lightrec_cstate *state,
+                            const struct block *block, u16 offset)
 {
+       union code c = block->opcode_list[offset].c;
+
        _jit_name(block->_jit, __func__);
-       rec_alu_mv_lo_hi(block, REG_LO, op->r.rs);
+       rec_alu_mv_lo_hi(state, block, REG_LO, c.r.rs);
 }
 
-static void rec_io(const struct block *block, const struct opcode *op,
-                  bool load_rt, bool read_rt)
+static void call_to_c_wrapper(struct lightrec_cstate *state, const struct block *block,
+                             u32 arg, bool with_arg, enum c_wrappers wrapper)
 {
-       struct regcache *reg_cache = block->state->reg_cache;
+       struct regcache *reg_cache = state->reg_cache;
        jit_state_t *_jit = block->_jit;
-       bool is_tagged = op->flags & (LIGHTREC_HW_IO | LIGHTREC_DIRECT_IO);
-       u32 offset;
        u8 tmp, tmp2, tmp3;
 
-       jit_note(__FILE__, __LINE__);
+       if (with_arg)
+               tmp3 = lightrec_alloc_reg(reg_cache, _jit, JIT_R1);
+       tmp2 = lightrec_alloc_reg(reg_cache, _jit, JIT_R0);
+       tmp = lightrec_alloc_reg_temp(reg_cache, _jit);
 
-       tmp = lightrec_alloc_reg(reg_cache, _jit, JIT_R0);
+       jit_ldxi(tmp, LIGHTREC_REG_STATE,
+                offsetof(struct lightrec_state, c_wrapper));
+       jit_ldxi(tmp2, LIGHTREC_REG_STATE,
+                offsetof(struct lightrec_state, c_wrappers[wrapper]));
+       if (with_arg)
+               jit_movi(tmp3, arg);
 
-       if (is_tagged) {
-               offset = offsetof(struct lightrec_state, rw_func);
-       } else {
-               tmp3 = lightrec_alloc_reg(reg_cache, _jit, JIT_R1);
-               offset = offsetof(struct lightrec_state, rw_generic_func);
-       }
+       jit_callr(tmp);
 
-       tmp2 = lightrec_alloc_reg_temp(reg_cache, _jit);
-       jit_ldxi(tmp2, LIGHTREC_REG_STATE, offset);
+       lightrec_free_reg(reg_cache, tmp);
+       lightrec_free_reg(reg_cache, tmp2);
+       if (with_arg)
+               lightrec_free_reg(reg_cache, tmp3);
+       lightrec_regcache_mark_live(reg_cache, _jit);
+}
+
+static void rec_io(struct lightrec_cstate *state,
+                  const struct block *block, u16 offset,
+                  bool load_rt, bool read_rt)
+{
+       struct regcache *reg_cache = state->reg_cache;
+       jit_state_t *_jit = block->_jit;
+       union code c = block->opcode_list[offset].c;
+       u16 flags = block->opcode_list[offset].flags;
+       bool is_tagged = flags & (LIGHTREC_HW_IO | LIGHTREC_DIRECT_IO);
+       u32 lut_entry;
 
-       lightrec_clean_reg_if_loaded(reg_cache, _jit, op->i.rs, false);
+       jit_note(__FILE__, __LINE__);
 
-       if (read_rt && likely(op->i.rt))
-               lightrec_clean_reg_if_loaded(reg_cache, _jit, op->i.rt, true);
+       lightrec_clean_reg_if_loaded(reg_cache, _jit, c.i.rs, false);
+
+       if (read_rt && likely(c.i.rt))
+               lightrec_clean_reg_if_loaded(reg_cache, _jit, c.i.rt, true);
        else if (load_rt)
-               lightrec_clean_reg_if_loaded(reg_cache, _jit, op->i.rt, false);
+               lightrec_clean_reg_if_loaded(reg_cache, _jit, c.i.rt, false);
 
        if (is_tagged) {
-               jit_movi(tmp, op->opcode);
+               call_to_c_wrapper(state, block, c.opcode, true, C_WRAPPER_RW);
        } else {
-               jit_movi(tmp, (uintptr_t)op);
-               jit_movi(tmp3, (uintptr_t)block);
+               lut_entry = lightrec_get_lut_entry(block);
+               call_to_c_wrapper(state, block, (lut_entry << 16) | offset,
+                                 true, C_WRAPPER_RW_GENERIC);
        }
-
-       jit_callr(tmp2);
-
-       lightrec_free_reg(reg_cache, tmp);
-       lightrec_free_reg(reg_cache, tmp2);
-       if (!is_tagged)
-               lightrec_free_reg(reg_cache, tmp3);
-       lightrec_regcache_mark_live(reg_cache, _jit);
 }
 
-static void rec_store_direct_no_invalidate(const struct block *block,
-                                          const struct opcode *op,
-                                          jit_code_t code)
+static void rec_store_direct_no_invalidate(struct lightrec_cstate *cstate,
+                                          const struct block *block,
+                                          u16 offset, jit_code_t code)
 {
-       struct lightrec_state *state = block->state;
-       struct regcache *reg_cache = state->reg_cache;
+       struct lightrec_state *state = cstate->state;
+       struct regcache *reg_cache = cstate->reg_cache;
+       union code c = block->opcode_list[offset].c;
        jit_state_t *_jit = block->_jit;
        jit_node_t *to_not_ram, *to_end;
        u8 tmp, tmp2, rs, rt;
        s16 imm;
 
        jit_note(__FILE__, __LINE__);
-       rs = lightrec_alloc_reg_in(reg_cache, _jit, op->i.rs);
+       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0);
        tmp = lightrec_alloc_reg_temp(reg_cache, _jit);
-       tmp2 = lightrec_alloc_reg_temp(reg_cache, _jit);
+
+       if (state->offset_ram || state->offset_scratch)
+               tmp2 = lightrec_alloc_reg_temp(reg_cache, _jit);
 
        /* Convert to KUNSEG and avoid RAM mirrors */
        if (state->mirrors_mapped) {
-               imm = (s16)op->i.imm;
+               imm = (s16)c.i.imm;
                jit_andi(tmp, rs, 0x1f800000 | (4 * RAM_SIZE - 1));
-       } else if (op->i.imm) {
+       } else if (c.i.imm) {
                imm = 0;
-               jit_addi(tmp, rs, (s16)op->i.imm);
+               jit_addi(tmp, rs, (s16)c.i.imm);
                jit_andi(tmp, tmp, 0x1f800000 | (RAM_SIZE - 1));
        } else {
                imm = 0;
@@ -871,6 +1074,8 @@ static void rec_store_direct_no_invalidate(const struct block *block,
        if (state->offset_ram != state->offset_scratch) {
                to_not_ram = jit_bmsi(tmp, BIT(28));
 
+               lightrec_regcache_mark_live(reg_cache, _jit);
+
                jit_movi(tmp2, state->offset_ram);
 
                to_end = jit_jmpi();
@@ -882,51 +1087,54 @@ static void rec_store_direct_no_invalidate(const struct block *block,
                jit_movi(tmp2, state->offset_ram);
        }
 
-       if (state->offset_ram || state->offset_scratch)
+       if (state->offset_ram || state->offset_scratch) {
                jit_addr(tmp, tmp, tmp2);
+               lightrec_free_reg(reg_cache, tmp2);
+       }
 
-       lightrec_free_reg(reg_cache, tmp2);
-
-       rt = lightrec_alloc_reg_in(reg_cache, _jit, op->i.rt);
+       rt = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rt, 0);
        jit_new_node_www(code, imm, tmp, rt);
 
        lightrec_free_reg(reg_cache, rt);
        lightrec_free_reg(reg_cache, tmp);
 }
 
-static void rec_store_direct(const struct block *block, const struct opcode *op,
-                            jit_code_t code)
+static void rec_store_direct(struct lightrec_cstate *cstate, const struct block *block,
+                            u16 offset, jit_code_t code)
 {
-       struct lightrec_state *state = block->state;
-       struct regcache *reg_cache = state->reg_cache;
+       struct lightrec_state *state = cstate->state;
+       u32 ram_size = state->mirrors_mapped ? RAM_SIZE * 4 : RAM_SIZE;
+       struct regcache *reg_cache = cstate->reg_cache;
+       union code c = block->opcode_list[offset].c;
        jit_state_t *_jit = block->_jit;
-       jit_node_t *to_not_ram, *to_end = 0;
+       jit_node_t *to_not_ram, *to_end;
        u8 tmp, tmp2, tmp3, rs, rt;
 
        jit_note(__FILE__, __LINE__);
 
-       rs = lightrec_alloc_reg_in(reg_cache, _jit, op->i.rs);
+       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0);
        tmp2 = lightrec_alloc_reg_temp(reg_cache, _jit);
-       tmp3 = lightrec_alloc_reg_in(reg_cache, _jit, 0);
+       tmp3 = lightrec_alloc_reg_in(reg_cache, _jit, 0, 0);
 
        /* Convert to KUNSEG and avoid RAM mirrors */
-       if (op->i.imm) {
-               jit_addi(tmp2, rs, (s16)op->i.imm);
-               jit_andi(tmp2, tmp2, 0x1f800000 | (RAM_SIZE - 1));
+       if (c.i.imm) {
+               jit_addi(tmp2, rs, (s16)c.i.imm);
+               jit_andi(tmp2, tmp2, 0x1f800000 | (ram_size - 1));
        } else {
-               jit_andi(tmp2, rs, 0x1f800000 | (RAM_SIZE - 1));
+               jit_andi(tmp2, rs, 0x1f800000 | (ram_size - 1));
        }
 
        lightrec_free_reg(reg_cache, rs);
        tmp = lightrec_alloc_reg_temp(reg_cache, _jit);
 
-       to_not_ram = jit_bgti(tmp2, RAM_SIZE);
+       to_not_ram = jit_bgti(tmp2, ram_size);
+
+       lightrec_regcache_mark_live(reg_cache, _jit);
 
        /* Compute the offset to the code LUT */
        jit_andi(tmp, tmp2, (RAM_SIZE - 1) & ~3);
-#if __WORDSIZE == 64
-       jit_lshi(tmp, tmp, 1);
-#endif
+       if (__WORDSIZE == 64)
+               jit_lshi(tmp, tmp, 1);
        jit_addr(tmp, LIGHTREC_REG_STATE, tmp);
 
        /* Write NULL to the code LUT to invalidate any block that's there */
@@ -952,92 +1160,105 @@ static void rec_store_direct(const struct block *block, const struct opcode *op,
        lightrec_free_reg(reg_cache, tmp);
        lightrec_free_reg(reg_cache, tmp3);
 
-       rt = lightrec_alloc_reg_in(reg_cache, _jit, op->i.rt);
+       rt = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rt, 0);
        jit_new_node_www(code, 0, tmp2, rt);
 
        lightrec_free_reg(reg_cache, rt);
        lightrec_free_reg(reg_cache, tmp2);
 }
 
-static void rec_store(const struct block *block, const struct opcode *op,
-                    jit_code_t code)
+static void rec_store(struct lightrec_cstate *state,
+                     const struct block *block, u16 offset, jit_code_t code)
 {
-       if (op->flags & LIGHTREC_NO_INVALIDATE) {
-               rec_store_direct_no_invalidate(block, op, code);
-       } else if (op->flags & LIGHTREC_DIRECT_IO) {
-               if (block->state->invalidate_from_dma_only)
-                       rec_store_direct_no_invalidate(block, op, code);
+       u16 flags = block->opcode_list[offset].flags;
+
+       if (flags & LIGHTREC_NO_INVALIDATE) {
+               rec_store_direct_no_invalidate(state, block, offset, code);
+       } else if (flags & LIGHTREC_DIRECT_IO) {
+               if (state->state->invalidate_from_dma_only)
+                       rec_store_direct_no_invalidate(state, block, offset, code);
                else
-                       rec_store_direct(block, op, code);
+                       rec_store_direct(state, block, offset, code);
        } else {
-               rec_io(block, op, true, false);
+               rec_io(state, block, offset, true, false);
        }
 }
 
-static void rec_SB(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_SB(struct lightrec_cstate *state,
+                  const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_store(block, op, jit_code_stxi_c);
+       rec_store(state, block, offset, jit_code_stxi_c);
 }
 
-static void rec_SH(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_SH(struct lightrec_cstate *state,
+                  const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_store(block, op, jit_code_stxi_s);
+       rec_store(state, block, offset, jit_code_stxi_s);
 }
 
-static void rec_SW(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_SW(struct lightrec_cstate *state,
+                  const struct block *block, u16 offset)
+
 {
        _jit_name(block->_jit, __func__);
-       rec_store(block, op, jit_code_stxi_i);
+       rec_store(state, block, offset, jit_code_stxi_i);
 }
 
-static void rec_SWL(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_SWL(struct lightrec_cstate *state,
+                   const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_io(block, op, true, false);
+       rec_io(state, block, offset, true, false);
 }
 
-static void rec_SWR(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_SWR(struct lightrec_cstate *state,
+                   const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_io(block, op, true, false);
+       rec_io(state, block, offset, true, false);
 }
 
-static void rec_SWC2(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_SWC2(struct lightrec_cstate *state,
+                    const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_io(block, op, false, false);
+       rec_io(state, block, offset, false, false);
 }
 
-static void rec_load_direct(const struct block *block, const struct opcode *op,
-                           jit_code_t code)
+static void rec_load_direct(struct lightrec_cstate *cstate, const struct block *block,
+                           u16 offset, jit_code_t code, bool is_unsigned)
 {
-       struct lightrec_state *state = block->state;
-       struct regcache *reg_cache = state->reg_cache;
+       struct lightrec_state *state = cstate->state;
+       struct regcache *reg_cache = cstate->reg_cache;
+       union code c = block->opcode_list[offset].c;
        jit_state_t *_jit = block->_jit;
-       jit_node_t *to_not_ram, *to_not_bios = 0, *to_end, *to_end2;
-       u8 tmp, rs, rt, addr_reg;
+       jit_node_t *to_not_ram, *to_not_bios, *to_end, *to_end2;
+       u8 tmp, rs, rt, addr_reg, flags = REG_EXT;
        s16 imm;
 
-       if (!op->i.rt)
+       if (!c.i.rt)
                return;
 
+       if (is_unsigned)
+               flags |= REG_ZEXT;
+
        jit_note(__FILE__, __LINE__);
-       rs = lightrec_alloc_reg_in(reg_cache, _jit, op->i.rs);
-       rt = lightrec_alloc_reg_out_ext(reg_cache, _jit, op->i.rt);
+       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0);
+       rt = lightrec_alloc_reg_out(reg_cache, _jit, c.i.rt, flags);
 
        if ((state->offset_ram == state->offset_bios &&
            state->offset_ram == state->offset_scratch &&
-           state->mirrors_mapped) || !op->i.imm) {
+           state->mirrors_mapped) || !c.i.imm) {
                addr_reg = rs;
-               imm = (s16)op->i.imm;
+               imm = (s16)c.i.imm;
        } else {
-               jit_addi(rt, rs, (s16)op->i.imm);
+               jit_addi(rt, rs, (s16)c.i.imm);
                addr_reg = rt;
                imm = 0;
 
-               if (op->i.rs != op->i.rt)
+               if (c.i.rs != c.i.rt)
                        lightrec_free_reg(reg_cache, rs);
        }
 
@@ -1059,6 +1280,8 @@ static void rec_load_direct(const struct block *block, const struct opcode *op,
        } else {
                to_not_ram = jit_bmsi(addr_reg, BIT(28));
 
+               lightrec_regcache_mark_live(reg_cache, _jit);
+
                /* Convert to KUNSEG and avoid RAM mirrors */
                jit_andi(rt, addr_reg, RAM_SIZE - 1);
 
@@ -1104,334 +1327,397 @@ static void rec_load_direct(const struct block *block, const struct opcode *op,
        lightrec_free_reg(reg_cache, tmp);
 }
 
-static void rec_load(const struct block *block, const struct opcode *op,
-                   jit_code_t code)
+static void rec_load(struct lightrec_cstate *state, const struct block *block,
+                    u16 offset, jit_code_t code, bool is_unsigned)
 {
-       if (op->flags & LIGHTREC_DIRECT_IO)
-               rec_load_direct(block, op, code);
+       u16 flags = block->opcode_list[offset].flags;
+
+       if (flags & LIGHTREC_DIRECT_IO)
+               rec_load_direct(state, block, offset, code, is_unsigned);
        else
-               rec_io(block, op, false, true);
+               rec_io(state, block, offset, false, true);
 }
 
-static void rec_LB(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_LB(struct lightrec_cstate *state, const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_load(block, op, jit_code_ldxi_c);
+       rec_load(state, block, offset, jit_code_ldxi_c, false);
 }
 
-static void rec_LBU(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_LBU(struct lightrec_cstate *state, const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_load(block, op, jit_code_ldxi_uc);
+       rec_load(state, block, offset, jit_code_ldxi_uc, true);
 }
 
-static void rec_LH(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_LH(struct lightrec_cstate *state, const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_load(block, op, jit_code_ldxi_s);
+       rec_load(state, block, offset, jit_code_ldxi_s, false);
 }
 
-static void rec_LHU(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_LHU(struct lightrec_cstate *state, const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_load(block, op, jit_code_ldxi_us);
+       rec_load(state, block, offset, jit_code_ldxi_us, true);
 }
 
-static void rec_LWL(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_LWL(struct lightrec_cstate *state, const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_io(block, op, true, true);
+       rec_io(state, block, offset, true, true);
 }
 
-static void rec_LWR(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_LWR(struct lightrec_cstate *state, const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_io(block, op, true, true);
+       rec_io(state, block, offset, true, true);
 }
 
-static void rec_LW(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_LW(struct lightrec_cstate *state, const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_load(block, op, jit_code_ldxi_i);
+       rec_load(state, block, offset, jit_code_ldxi_i, false);
 }
 
-static void rec_LWC2(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_LWC2(struct lightrec_cstate *state, const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_io(block, op, false, false);
+       rec_io(state, block, offset, false, false);
 }
 
-static void rec_break_syscall(const struct block *block,
-                             const struct opcode *op, u32 pc, bool is_break)
+static void rec_break_syscall(struct lightrec_cstate *state,
+                             const struct block *block, u16 offset, bool is_break)
 {
-       struct regcache *reg_cache = block->state->reg_cache;
-       jit_state_t *_jit = block->_jit;
-       u32 offset;
-       u8 tmp;
-
-       jit_note(__FILE__, __LINE__);
+       _jit_note(block->_jit, __FILE__, __LINE__);
 
        if (is_break)
-               offset = offsetof(struct lightrec_state, break_func);
+               call_to_c_wrapper(state, block, 0, false, C_WRAPPER_BREAK);
        else
-               offset = offsetof(struct lightrec_state, syscall_func);
-
-       tmp = lightrec_alloc_reg_temp(reg_cache, _jit);
-       jit_ldxi(tmp, LIGHTREC_REG_STATE, offset);
-       jit_callr(tmp);
-       lightrec_free_reg(reg_cache, tmp);
-
-       lightrec_regcache_mark_live(reg_cache, _jit);
+               call_to_c_wrapper(state, block, 0, false, C_WRAPPER_SYSCALL);
 
        /* TODO: the return address should be "pc - 4" if we're a delay slot */
-       lightrec_emit_end_of_block(block, op, pc, -1, pc, 31, 0, true);
+       lightrec_emit_end_of_block(state, block, offset, -1,
+                                  get_ds_pc(block, offset, 0),
+                                  31, 0, true);
 }
 
-static void rec_special_SYSCALL(const struct block *block,
-                               const struct opcode *op, u32 pc)
+static void rec_special_SYSCALL(struct lightrec_cstate *state,
+                               const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_break_syscall(block, op, pc, false);
+       rec_break_syscall(state, block, offset, false);
 }
 
-static void rec_special_BREAK(const struct block *block,
-                             const struct opcode *op, u32 pc)
+static void rec_special_BREAK(struct lightrec_cstate *state,
+                             const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_break_syscall(block, op, pc, true);
+       rec_break_syscall(state, block, offset, true);
 }
 
-static void rec_mfc(const struct block *block, const struct opcode *op)
+static void rec_mfc(struct lightrec_cstate *state,
+                   const struct block *block, u16 offset)
 {
-       u8 tmp, tmp2;
-       struct lightrec_state *state = block->state;
        struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
        jit_state_t *_jit = block->_jit;
 
        jit_note(__FILE__, __LINE__);
+       lightrec_clean_reg_if_loaded(reg_cache, _jit, c.i.rt, true);
 
-       tmp = lightrec_alloc_reg(reg_cache, _jit, JIT_R0);
-       tmp2 = lightrec_alloc_reg_temp(reg_cache, _jit);
+       call_to_c_wrapper(state, block, c.opcode, true, C_WRAPPER_MFC);
+}
 
-       jit_ldxi(tmp2, LIGHTREC_REG_STATE,
-                offsetof(struct lightrec_state, mfc_func));
+static void rec_mtc(struct lightrec_cstate *state,
+                   const struct block *block, u16 offset)
+{
+       struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
+       jit_state_t *_jit = block->_jit;
 
-       lightrec_clean_reg_if_loaded(reg_cache, _jit, op->i.rt, true);
+       jit_note(__FILE__, __LINE__);
+       lightrec_clean_reg_if_loaded(reg_cache, _jit, c.i.rs, false);
+       lightrec_clean_reg_if_loaded(reg_cache, _jit, c.i.rt, false);
 
-       jit_movi(tmp, op->opcode);
-       jit_callr(tmp2);
-       lightrec_free_reg(reg_cache, tmp);
-       lightrec_free_reg(reg_cache, tmp2);
+       call_to_c_wrapper(state, block, c.opcode, true, C_WRAPPER_MTC);
 
-       lightrec_regcache_mark_live(reg_cache, _jit);
+       if (c.i.op == OP_CP0 &&
+           !(block->opcode_list[offset].flags & LIGHTREC_NO_DS) &&
+           (c.r.rd == 12 || c.r.rd == 13))
+               lightrec_emit_end_of_block(state, block, offset, -1,
+                                          get_ds_pc(block, offset, 1),
+                                          0, 0, true);
 }
 
-static void rec_mtc(const struct block *block, const struct opcode *op, u32 pc)
+static void
+rec_mfc0(struct lightrec_cstate *state, const struct block *block, u16 offset)
 {
-       struct lightrec_state *state = block->state;
        struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
        jit_state_t *_jit = block->_jit;
-       u8 tmp, tmp2;
+       u8 rt;
 
        jit_note(__FILE__, __LINE__);
 
-       tmp = lightrec_alloc_reg(reg_cache, _jit, JIT_R0);
-       tmp2 = lightrec_alloc_reg_temp(reg_cache, _jit);
-       jit_ldxi(tmp2, LIGHTREC_REG_STATE,
-                offsetof(struct lightrec_state, mtc_func));
+       rt = lightrec_alloc_reg_out(reg_cache, _jit, c.i.rt, REG_EXT);
 
-       lightrec_clean_reg_if_loaded(reg_cache, _jit, op->i.rs, false);
-       lightrec_clean_reg_if_loaded(reg_cache, _jit, op->i.rt, false);
+       jit_ldxi_i(rt, LIGHTREC_REG_STATE,
+                  offsetof(struct lightrec_state, regs.cp0[c.r.rd]));
 
-       jit_movi(tmp, op->opcode);
-       jit_callr(tmp2);
-       lightrec_free_reg(reg_cache, tmp);
-       lightrec_free_reg(reg_cache, tmp2);
+       lightrec_free_reg(reg_cache, rt);
+}
 
-       lightrec_regcache_mark_live(reg_cache, _jit);
+static bool block_in_bios(const struct lightrec_cstate *state,
+                         const struct block *block)
+{
+       const struct lightrec_mem_map *bios = &state->state->maps[PSX_MAP_BIOS];
+       u32 pc = kunseg(block->pc);
 
-       if (op->i.op == OP_CP0 && !(op->flags & LIGHTREC_NO_DS) &&
-           (op->r.rd == 12 || op->r.rd == 13))
-               lightrec_emit_end_of_block(block, op, pc, -1, pc + 4, 0, 0, true);
+       return pc >= bios->pc && pc < bios->pc + bios->length;
 }
 
-static void rec_cp0_MFC0(const struct block *block,
-                        const struct opcode *op, u32 pc)
+static void
+rec_mtc0(struct lightrec_cstate *state, const struct block *block, u16 offset)
 {
-       _jit_name(block->_jit, __func__);
-       rec_mfc(block, op);
+       struct regcache *reg_cache = state->reg_cache;
+       const union code c = block->opcode_list[offset].c;
+       jit_state_t *_jit = block->_jit;
+       u8 rt, tmp, tmp2, status;
+
+       jit_note(__FILE__, __LINE__);
+
+       switch(c.r.rd) {
+       case 1:
+       case 4:
+       case 8:
+       case 14:
+       case 15:
+               /* Those registers are read-only */
+               return;
+       default:
+               break;
+       }
+
+       if (block_in_bios(state, block) && c.r.rd == 12) {
+               /* If we are running code from the BIOS, handle writes to the
+                * Status register in C. BIOS code may toggle bit 16 which will
+                * map/unmap the RAM, while game code cannot do that. */
+               rec_mtc(state, block, offset);
+               return;
+       }
+
+       rt = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rt, 0);
+
+       if (c.r.rd != 13) {
+               jit_stxi_i(offsetof(struct lightrec_state, regs.cp0[c.r.rd]),
+                          LIGHTREC_REG_STATE, rt);
+       }
+
+       if (c.r.rd == 12 || c.r.rd == 13) {
+               tmp = lightrec_alloc_reg_temp(reg_cache, _jit);
+               jit_ldxi_i(tmp, LIGHTREC_REG_STATE,
+                          offsetof(struct lightrec_state, regs.cp0[13]));
+       }
+
+       if (c.r.rd == 12) {
+               status = rt;
+       } else if (c.r.rd == 13) {
+               tmp2 = lightrec_alloc_reg_temp(reg_cache, _jit);
+
+               /* Cause = (Cause & ~0x0300) | (value & 0x0300) */
+               jit_andi(tmp2, rt, 0x0300);
+               jit_ori(tmp, tmp, 0x0300);
+               jit_xori(tmp, tmp, 0x0300);
+               jit_orr(tmp, tmp, tmp2);
+               jit_ldxi_i(tmp2, LIGHTREC_REG_STATE,
+                          offsetof(struct lightrec_state, regs.cp0[12]));
+               jit_stxi_i(offsetof(struct lightrec_state, regs.cp0[13]),
+                          LIGHTREC_REG_STATE, tmp);
+               status = tmp2;
+       }
+
+       if (c.r.rd == 12 || c.r.rd == 13) {
+               /* Exit dynarec in case there's a software interrupt.
+                * exit_flags = !!(status & tmp & 0x0300) & status; */
+               jit_andr(tmp, tmp, status);
+               jit_andi(tmp, tmp, 0x0300);
+               jit_nei(tmp, tmp, 0);
+               jit_andr(tmp, tmp, status);
+               jit_stxi_i(offsetof(struct lightrec_state, exit_flags),
+                          LIGHTREC_REG_STATE, tmp);
+
+               lightrec_free_reg(reg_cache, tmp);
+       }
+
+       if (c.r.rd == 13)
+               lightrec_free_reg(reg_cache, tmp2);
+
+       lightrec_free_reg(reg_cache, rt);
+
+       if (!(block->opcode_list[offset].flags & LIGHTREC_NO_DS) &&
+           (c.r.rd == 12 || c.r.rd == 13))
+               lightrec_emit_eob(state, block, offset + 1, true);
 }
 
-static void rec_cp0_CFC0(const struct block *block,
-                        const struct opcode *op, u32 pc)
+static void rec_cp0_MFC0(struct lightrec_cstate *state,
+                        const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_mfc(block, op);
+       rec_mfc0(state, block, offset);
 }
 
-static void rec_cp0_MTC0(const struct block *block,
-                        const struct opcode *op, u32 pc)
+static void rec_cp0_CFC0(struct lightrec_cstate *state,
+                        const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_mtc(block, op, pc);
+       rec_mfc0(state, block, offset);
 }
 
-static void rec_cp0_CTC0(const struct block *block,
-                        const struct opcode *op, u32 pc)
+static void rec_cp0_MTC0(struct lightrec_cstate *state,
+                        const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_mtc(block, op, pc);
+       rec_mtc0(state, block, offset);
 }
 
-static void rec_cp2_basic_MFC2(const struct block *block,
-                              const struct opcode *op, u32 pc)
+static void rec_cp0_CTC0(struct lightrec_cstate *state,
+                        const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_mfc(block, op);
+       rec_mtc0(state, block, offset);
 }
 
-static void rec_cp2_basic_CFC2(const struct block *block,
-                              const struct opcode *op, u32 pc)
+static void rec_cp2_basic_MFC2(struct lightrec_cstate *state,
+                              const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_mfc(block, op);
+       rec_mfc(state, block, offset);
 }
 
-static void rec_cp2_basic_MTC2(const struct block *block,
-                              const struct opcode *op, u32 pc)
+static void rec_cp2_basic_CFC2(struct lightrec_cstate *state,
+                              const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_mtc(block, op, pc);
+       rec_mfc(state, block, offset);
 }
 
-static void rec_cp2_basic_CTC2(const struct block *block,
-                              const struct opcode *op, u32 pc)
+static void rec_cp2_basic_MTC2(struct lightrec_cstate *state,
+                              const struct block *block, u16 offset)
 {
        _jit_name(block->_jit, __func__);
-       rec_mtc(block, op, pc);
+       rec_mtc(state, block, offset);
 }
 
-static void rec_cp0_RFE(const struct block *block,
-                       const struct opcode *op, u32 pc)
+static void rec_cp2_basic_CTC2(struct lightrec_cstate *state,
+                              const struct block *block, u16 offset)
 {
-       struct lightrec_state *state = block->state;
-       jit_state_t *_jit = block->_jit;
-       u8 tmp;
-
-       jit_name(__func__);
-       jit_note(__FILE__, __LINE__);
-
-       tmp = lightrec_alloc_reg_temp(state->reg_cache, _jit);
-       jit_ldxi(tmp, LIGHTREC_REG_STATE,
-                offsetof(struct lightrec_state, rfe_func));
-       jit_callr(tmp);
-       lightrec_free_reg(state->reg_cache, tmp);
-
-       lightrec_regcache_mark_live(state->reg_cache, _jit);
+       _jit_name(block->_jit, __func__);
+       rec_mtc(state, block, offset);
 }
 
-static void rec_CP(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_cp0_RFE(struct lightrec_cstate *state,
+                       const struct block *block, u16 offset)
 {
-       struct regcache *reg_cache = block->state->reg_cache;
+       struct regcache *reg_cache = state->reg_cache;
        jit_state_t *_jit = block->_jit;
-       u8 tmp, tmp2;
+       u8 status, tmp;
 
        jit_name(__func__);
        jit_note(__FILE__, __LINE__);
 
-       tmp = lightrec_alloc_reg(reg_cache, _jit, JIT_R0);
-       tmp2 = lightrec_alloc_reg_temp(reg_cache, _jit);
+       status = lightrec_alloc_reg_temp(reg_cache, _jit);
+       jit_ldxi_i(status, LIGHTREC_REG_STATE,
+                  offsetof(struct lightrec_state, regs.cp0[12]));
 
-       jit_ldxi(tmp2, LIGHTREC_REG_STATE,
-                offsetof(struct lightrec_state, cp_func));
+       tmp = lightrec_alloc_reg_temp(reg_cache, _jit);
 
-       jit_movi(tmp, op->opcode);
-       jit_callr(tmp2);
+       /* status = ((status >> 2) & 0xf) | status & ~0xf; */
+       jit_rshi(tmp, status, 2);
+       jit_andi(tmp, tmp, 0xf);
+       jit_andi(status, status, ~0xful);
+       jit_orr(status, status, tmp);
+
+       jit_ldxi_i(tmp, LIGHTREC_REG_STATE,
+                  offsetof(struct lightrec_state, regs.cp0[13]));
+       jit_stxi_i(offsetof(struct lightrec_state, regs.cp0[12]),
+                  LIGHTREC_REG_STATE, status);
+
+       /* Exit dynarec in case there's a software interrupt.
+        * exit_flags = !!(status & cause & 0x0300) & status; */
+       jit_andr(tmp, tmp, status);
+       jit_andi(tmp, tmp, 0x0300);
+       jit_nei(tmp, tmp, 0);
+       jit_andr(tmp, tmp, status);
+       jit_stxi_i(offsetof(struct lightrec_state, exit_flags),
+                  LIGHTREC_REG_STATE, tmp);
+
+       lightrec_free_reg(reg_cache, status);
        lightrec_free_reg(reg_cache, tmp);
-       lightrec_free_reg(reg_cache, tmp2);
-
-       lightrec_regcache_mark_live(reg_cache, _jit);
 }
 
-static void rec_meta_unload(const struct block *block,
-                           const struct opcode *op, u32 pc)
+static void rec_CP(struct lightrec_cstate *state,
+                  const struct block *block, u16 offset)
 {
-       struct lightrec_state *state = block->state;
-       struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
        jit_state_t *_jit = block->_jit;
 
        jit_name(__func__);
        jit_note(__FILE__, __LINE__);
 
-       pr_debug("Unloading reg %s\n", lightrec_reg_name(op->i.rs));
-       lightrec_clean_reg_if_loaded(reg_cache, _jit, op->i.rs, true);
+       call_to_c_wrapper(state, block, c.opcode, true, C_WRAPPER_CP);
 }
 
-static void rec_meta_BEQZ(const struct block *block,
-                         const struct opcode *op, u32 pc)
+static void rec_meta_MOV(struct lightrec_cstate *state,
+                        const struct block *block, u16 offset)
 {
-       _jit_name(block->_jit, __func__);
-       rec_b(block, op, pc, jit_code_bnei, 0, false, true);
-}
-
-static void rec_meta_BNEZ(const struct block *block,
-                         const struct opcode *op, u32 pc)
-{
-       _jit_name(block->_jit, __func__);
-       rec_b(block, op, pc, jit_code_beqi, 0, false, true);
-}
-
-static void rec_meta_MOV(const struct block *block,
-                        const struct opcode *op, u32 pc)
-{
-       struct lightrec_state *state = block->state;
        struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
        jit_state_t *_jit = block->_jit;
        u8 rs, rd;
 
        _jit_name(block->_jit, __func__);
        jit_note(__FILE__, __LINE__);
-       rs = op->r.rs ? lightrec_alloc_reg_in(reg_cache, _jit, op->r.rs) : 0;
-       rd = lightrec_alloc_reg_out_ext(reg_cache, _jit, op->r.rd);
+       if (c.r.rs)
+               rs = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rs, 0);
+       rd = lightrec_alloc_reg_out(reg_cache, _jit, c.r.rd, REG_EXT);
 
-       if (op->r.rs == 0) {
+       if (c.r.rs == 0)
                jit_movi(rd, 0);
-       } else {
-#if __WORDSIZE == 32
-               jit_movr(rd, rs);
-#else
+       else
                jit_extr_i(rd, rs);
-#endif
-       }
 
-       lightrec_free_reg(state->reg_cache, rs);
-       lightrec_free_reg(state->reg_cache, rd);
+       if (c.r.rs)
+               lightrec_free_reg(reg_cache, rs);
+       lightrec_free_reg(reg_cache, rd);
 }
 
-static void rec_meta_sync(const struct block *block,
-                         const struct opcode *op, u32 pc)
+static void rec_meta_EXTC_EXTS(struct lightrec_cstate *state,
+                              const struct block *block,
+                              u16 offset)
 {
-       struct lightrec_state *state = block->state;
-       struct lightrec_branch_target *target;
+       struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
        jit_state_t *_jit = block->_jit;
+       u8 rs, rt;
 
-       jit_name(__func__);
+       _jit_name(block->_jit, __func__);
        jit_note(__FILE__, __LINE__);
 
-       jit_subi(LIGHTREC_REG_CYCLE, LIGHTREC_REG_CYCLE, state->cycles);
-       state->cycles = 0;
+       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0);
+       rt = lightrec_alloc_reg_out(reg_cache, _jit, c.i.rt, REG_EXT);
 
-       lightrec_storeback_regs(state->reg_cache, _jit);
-       lightrec_regcache_reset(state->reg_cache);
+       if (c.i.op == OP_META_EXTC)
+               jit_extr_c(rt, rs);
+       else
+               jit_extr_s(rt, rs);
 
-       pr_debug("Adding branch target at offset 0x%x\n",
-                op->offset << 2);
-       target = &state->targets[state->nb_targets++];
-       target->offset = op->offset;
-       target->label = jit_indirect();
+       lightrec_free_reg(reg_cache, rs);
+       lightrec_free_reg(reg_cache, rt);
 }
 
 static const lightrec_rec_func_t rec_standard[64] = {
+       SET_DEFAULT_ELM(rec_standard, unknown_opcode),
        [OP_SPECIAL]            = rec_SPECIAL,
        [OP_REGIMM]             = rec_REGIMM,
        [OP_J]                  = rec_J,
@@ -1465,14 +1751,13 @@ static const lightrec_rec_func_t rec_standard[64] = {
        [OP_LWC2]               = rec_LWC2,
        [OP_SWC2]               = rec_SWC2,
 
-       [OP_META_REG_UNLOAD]    = rec_meta_unload,
-       [OP_META_BEQZ]          = rec_meta_BEQZ,
-       [OP_META_BNEZ]          = rec_meta_BNEZ,
        [OP_META_MOV]           = rec_meta_MOV,
-       [OP_META_SYNC]          = rec_meta_sync,
+       [OP_META_EXTC]          = rec_meta_EXTC_EXTS,
+       [OP_META_EXTS]          = rec_meta_EXTC_EXTS,
 };
 
 static const lightrec_rec_func_t rec_special[64] = {
+       SET_DEFAULT_ELM(rec_special, unknown_opcode),
        [OP_SPECIAL_SLL]        = rec_special_SLL,
        [OP_SPECIAL_SRL]        = rec_special_SRL,
        [OP_SPECIAL_SRA]        = rec_special_SRA,
@@ -1504,6 +1789,7 @@ static const lightrec_rec_func_t rec_special[64] = {
 };
 
 static const lightrec_rec_func_t rec_regimm[64] = {
+       SET_DEFAULT_ELM(rec_regimm, unknown_opcode),
        [OP_REGIMM_BLTZ]        = rec_regimm_BLTZ,
        [OP_REGIMM_BGEZ]        = rec_regimm_BGEZ,
        [OP_REGIMM_BLTZAL]      = rec_regimm_BLTZAL,
@@ -1511,6 +1797,7 @@ static const lightrec_rec_func_t rec_regimm[64] = {
 };
 
 static const lightrec_rec_func_t rec_cp0[64] = {
+       SET_DEFAULT_ELM(rec_cp0, rec_CP),
        [OP_CP0_MFC0]           = rec_cp0_MFC0,
        [OP_CP0_CFC0]           = rec_cp0_CFC0,
        [OP_CP0_MTC0]           = rec_cp0_MTC0,
@@ -1519,60 +1806,107 @@ static const lightrec_rec_func_t rec_cp0[64] = {
 };
 
 static const lightrec_rec_func_t rec_cp2_basic[64] = {
+       SET_DEFAULT_ELM(rec_cp2_basic, rec_CP),
        [OP_CP2_BASIC_MFC2]     = rec_cp2_basic_MFC2,
        [OP_CP2_BASIC_CFC2]     = rec_cp2_basic_CFC2,
        [OP_CP2_BASIC_MTC2]     = rec_cp2_basic_MTC2,
        [OP_CP2_BASIC_CTC2]     = rec_cp2_basic_CTC2,
 };
 
-static void rec_SPECIAL(const struct block *block,
-                       const struct opcode *op, u32 pc)
+static void rec_SPECIAL(struct lightrec_cstate *state,
+                       const struct block *block, u16 offset)
 {
-       lightrec_rec_func_t f = rec_special[op->r.op];
-       if (likely(f))
-               (*f)(block, op, pc);
+       union code c = block->opcode_list[offset].c;
+       lightrec_rec_func_t f = rec_special[c.r.op];
+
+       if (!HAS_DEFAULT_ELM && unlikely(!f))
+               unknown_opcode(state, block, offset);
        else
-               unknown_opcode(block, op, pc);
+               (*f)(state, block, offset);
 }
 
-static void rec_REGIMM(const struct block *block,
-                      const struct opcode *op, u32 pc)
+static void rec_REGIMM(struct lightrec_cstate *state,
+                      const struct block *block, u16 offset)
 {
-       lightrec_rec_func_t f = rec_regimm[op->r.rt];
-       if (likely(f))
-               (*f)(block, op, pc);
+       union code c = block->opcode_list[offset].c;
+       lightrec_rec_func_t f = rec_regimm[c.r.rt];
+
+       if (!HAS_DEFAULT_ELM && unlikely(!f))
+               unknown_opcode(state, block, offset);
        else
-               unknown_opcode(block, op, pc);
+               (*f)(state, block, offset);
 }
 
-static void rec_CP0(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_CP0(struct lightrec_cstate *state,
+                   const struct block *block, u16 offset)
 {
-       lightrec_rec_func_t f = rec_cp0[op->r.rs];
-       if (likely(f))
-               (*f)(block, op, pc);
+       union code c = block->opcode_list[offset].c;
+       lightrec_rec_func_t f = rec_cp0[c.r.rs];
+
+       if (!HAS_DEFAULT_ELM && unlikely(!f))
+               rec_CP(state, block, offset);
        else
-               rec_CP(block, op, pc);
+               (*f)(state, block, offset);
 }
 
-static void rec_CP2(const struct block *block, const struct opcode *op, u32 pc)
+static void rec_CP2(struct lightrec_cstate *state,
+                   const struct block *block, u16 offset)
 {
-       if (op->r.op == OP_CP2_BASIC) {
-               lightrec_rec_func_t f = rec_cp2_basic[op->r.rs];
-               if (likely(f)) {
-                       (*f)(block, op, pc);
+       union code c = block->opcode_list[offset].c;
+
+       if (c.r.op == OP_CP2_BASIC) {
+               lightrec_rec_func_t f = rec_cp2_basic[c.r.rs];
+
+               if (HAS_DEFAULT_ELM || likely(f)) {
+                       (*f)(state, block, offset);
                        return;
                }
        }
 
-       rec_CP(block, op, pc);
+       rec_CP(state, block, offset);
 }
 
-void lightrec_rec_opcode(const struct block *block,
-                        const struct opcode *op, u32 pc)
+void lightrec_rec_opcode(struct lightrec_cstate *state,
+                        const struct block *block, u16 offset)
 {
-       lightrec_rec_func_t f = rec_standard[op->i.op];
-       if (likely(f))
-               (*f)(block, op, pc);
-       else
-               unknown_opcode(block, op, pc);
+       struct regcache *reg_cache = state->reg_cache;
+       struct lightrec_branch_target *target;
+       const struct opcode *op = &block->opcode_list[offset];
+       jit_state_t *_jit = block->_jit;
+       lightrec_rec_func_t f;
+
+       if (op->flags & LIGHTREC_SYNC) {
+               jit_subi(LIGHTREC_REG_CYCLE, LIGHTREC_REG_CYCLE, state->cycles);
+               state->cycles = 0;
+
+               lightrec_storeback_regs(reg_cache, _jit);
+               lightrec_regcache_reset(reg_cache);
+
+               pr_debug("Adding branch target at offset 0x%x\n", offset << 2);
+               target = &state->targets[state->nb_targets++];
+               target->offset = offset;
+               target->label = jit_indirect();
+       }
+
+       if (likely(op->opcode)) {
+               f = rec_standard[op->i.op];
+
+               if (!HAS_DEFAULT_ELM && unlikely(!f))
+                       unknown_opcode(state, block, offset);
+               else
+                       (*f)(state, block, offset);
+       }
+
+       if (unlikely(op->flags & LIGHTREC_UNLOAD_RD)) {
+               lightrec_clean_reg_if_loaded(reg_cache, _jit, op->r.rd, true);
+               pr_debug("Cleaning RD reg %s\n", lightrec_reg_name(op->r.rd));
+       }
+       if (unlikely(op->flags & LIGHTREC_UNLOAD_RS)) {
+               lightrec_clean_reg_if_loaded(reg_cache, _jit, op->i.rs, true);
+               pr_debug("Cleaning RS reg %s\n", lightrec_reg_name(op->i.rs));
+       }
+       if (unlikely(op->flags & LIGHTREC_UNLOAD_RT)) {
+               lightrec_clean_reg_if_loaded(reg_cache, _jit, op->i.rt, true);
+               pr_debug("Cleaning RT reg %s\n", lightrec_reg_name(op->i.rt));
+       }
 }
index ec3fc78..b7f54fd 100644 (file)
@@ -1,15 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
 /*
- * Copyright (C) 2014-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2014-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #ifndef __EMITTER_H__
 #include "lightrec.h"
 
 struct block;
+struct lightrec_cstate;
 struct opcode;
 
-void lightrec_rec_opcode(const struct block *block,
-                        const struct opcode *op, u32 pc);
-void lightrec_emit_eob(const struct block *block,
-                      const struct opcode *op, u32 pc);
+void lightrec_rec_opcode(struct lightrec_cstate *state, const struct block *block, u16 offset);
+void lightrec_emit_eob(struct lightrec_cstate *state, const struct block *block,
+                      u16 offset, _Bool after_op);
 
 #endif /* __EMITTER_H__ */
index ff609a4..922f081 100644 (file)
@@ -1,15 +1,6 @@
+// SPDX-License-Identifier: LGPL-2.1-or-later
 /*
- * Copyright (C) 2019-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2019-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #include "disassembler.h"
@@ -39,18 +30,45 @@ struct interpreter {
        struct opcode *op;
        u32 cycles;
        bool delay_slot;
+       u16 offset;
 };
 
+static u32 int_get_branch_pc(const struct interpreter *inter)
+{
+       return get_branch_pc(inter->block, inter->offset, 0);
+}
+
+static inline u32 int_get_ds_pc(const struct interpreter *inter, s16 imm)
+{
+       return get_ds_pc(inter->block, inter->offset, imm);
+}
+
+static inline struct opcode *next_op(const struct interpreter *inter)
+{
+       return &inter->block->opcode_list[inter->offset + 1];
+}
+
 static inline u32 execute(lightrec_int_func_t func, struct interpreter *inter)
 {
        return (*func)(inter);
 }
 
+static inline u32 lightrec_int_op(struct interpreter *inter)
+{
+       return execute(int_standard[inter->op->i.op], inter);
+}
+
 static inline u32 jump_skip(struct interpreter *inter)
 {
-       inter->op = inter->op->next;
+       inter->op = next_op(inter);
+       inter->offset++;
 
-       return execute(int_standard[inter->op->i.op], inter);
+       if (inter->op->flags & LIGHTREC_SYNC) {
+               inter->state->current_cycle += inter->cycles;
+               inter->cycles = 0;
+       }
+
+       return lightrec_int_op(inter);
 }
 
 static inline u32 jump_next(struct interpreter *inter)
@@ -70,7 +88,8 @@ static inline u32 jump_after_branch(struct interpreter *inter)
        if (unlikely(inter->delay_slot))
                return 0;
 
-       inter->op = inter->op->next;
+       inter->op = next_op(inter);
+       inter->offset++;
 
        return jump_skip(inter);
 }
@@ -84,7 +103,7 @@ static void update_cycles_before_branch(struct interpreter *inter)
 
                if (has_delay_slot(inter->op->c) &&
                    !(inter->op->flags & LIGHTREC_NO_DS))
-                       cycles += lightrec_cycles_of_opcode(inter->op->next->c);
+                       cycles += lightrec_cycles_of_opcode(next_op(inter)->c);
 
                inter->cycles += cycles;
                inter->state->current_cycle += inter->cycles;
@@ -101,10 +120,8 @@ static bool is_branch_taken(const u32 *reg_cache, union code op)
        case OP_JAL:
                return true;
        case OP_BEQ:
-       case OP_META_BEQZ:
                return reg_cache[op.r.rs] == reg_cache[op.r.rt];
        case OP_BNE:
-       case OP_META_BNEZ:
                return reg_cache[op.r.rs] != reg_cache[op.r.rt];
        case OP_REGIMM:
                switch (op.r.rt) {
@@ -125,8 +142,8 @@ static bool is_branch_taken(const u32 *reg_cache, union code op)
 static u32 int_delay_slot(struct interpreter *inter, u32 pc, bool branch)
 {
        struct lightrec_state *state = inter->state;
-       u32 *reg_cache = state->native_reg_cache;
-       struct opcode new_op, *op = inter->op->next;
+       u32 *reg_cache = state->regs.gpr;
+       struct opcode new_op, *op = next_op(inter);
        union code op_next;
        struct interpreter inter2 = {
                .state = state,
@@ -150,8 +167,8 @@ static u32 int_delay_slot(struct interpreter *inter, u32 pc, bool branch)
                 * but on branch boundaries, we need to adjust the return
                 * address so that the GTE opcode is effectively executed.
                 */
-               cause = (*state->ops.cop0_ops.cfc)(state, op->c.opcode, 13);
-               epc = (*state->ops.cop0_ops.cfc)(state, op->c.opcode, 14);
+               cause = state->regs.cp0[13];
+               epc = state->regs.cp0[14];
 
                if (!(cause & 0x7c) && epc == pc - 4)
                        pc -= 4;
@@ -223,12 +240,10 @@ static u32 int_delay_slot(struct interpreter *inter, u32 pc, bool branch)
                } else {
                        new_op.c = op_next;
                        new_op.flags = 0;
-                       new_op.offset = 0;
-                       new_op.next = NULL;
                        inter2.op = &new_op;
 
                        /* Execute the first opcode of the next block */
-                       (*int_standard[inter2.op->i.op])(&inter2);
+                       lightrec_int_op(&inter2);
 
                        if (save_rs) {
                                new_rs = reg_cache[op->r.rs];
@@ -238,8 +253,7 @@ static u32 int_delay_slot(struct interpreter *inter, u32 pc, bool branch)
                        inter->cycles += lightrec_cycles_of_opcode(op_next);
                }
        } else {
-               next_pc = inter->block->pc
-                       + (inter->op->offset + 2) * sizeof(u32);
+               next_pc = int_get_ds_pc(inter, 2);
        }
 
        inter2.block = inter->block;
@@ -250,7 +264,7 @@ static u32 int_delay_slot(struct interpreter *inter, u32 pc, bool branch)
                new_rt = reg_cache[op->r.rt];
 
        /* Execute delay slot opcode */
-       ds_next_pc = (*int_standard[inter2.op->i.op])(&inter2);
+       ds_next_pc = lightrec_int_op(&inter2);
 
        if (branch_at_addr) {
                if (op_next.i.op == OP_SPECIAL)
@@ -286,8 +300,6 @@ static u32 int_delay_slot(struct interpreter *inter, u32 pc, bool branch)
 
                new_op.c = op_next;
                new_op.flags = 0;
-               new_op.offset = sizeof(u32);
-               new_op.next = NULL;
                inter2.op = &new_op;
                inter2.block = NULL;
 
@@ -295,7 +307,7 @@ static u32 int_delay_slot(struct interpreter *inter, u32 pc, bool branch)
 
                pr_debug("Running delay slot of branch at target of impossible "
                         "branch\n");
-               (*int_standard[inter2.op->i.op])(&inter2);
+               lightrec_int_op(&inter2);
        }
 
        return next_pc;
@@ -311,11 +323,11 @@ static u32 int_unimplemented(struct interpreter *inter)
 static u32 int_jump(struct interpreter *inter, bool link)
 {
        struct lightrec_state *state = inter->state;
-       u32 old_pc = inter->block->pc + inter->op->offset * sizeof(u32);
+       u32 old_pc = int_get_branch_pc(inter);
        u32 pc = (old_pc & 0xf0000000) | (inter->op->j.imm << 2);
 
        if (link)
-               state->native_reg_cache[31] = old_pc + 8;
+               state->regs.gpr[31] = old_pc + 8;
 
        if (inter->op->flags & LIGHTREC_NO_DS)
                return pc;
@@ -336,11 +348,11 @@ static u32 int_JAL(struct interpreter *inter)
 static u32 int_jumpr(struct interpreter *inter, u8 link_reg)
 {
        struct lightrec_state *state = inter->state;
-       u32 old_pc, next_pc = state->native_reg_cache[inter->op->r.rs];
+       u32 old_pc, next_pc = state->regs.gpr[inter->op->r.rs];
 
        if (link_reg) {
-               old_pc = inter->block->pc + inter->op->offset * sizeof(u32);
-               state->native_reg_cache[link_reg] = old_pc + 8;
+               old_pc = int_get_branch_pc(inter);
+               state->regs.gpr[link_reg] = old_pc + 8;
        }
 
        if (inter->op->flags & LIGHTREC_NO_DS)
@@ -365,7 +377,7 @@ static u32 int_do_branch(struct interpreter *inter, u32 old_pc, u32 next_pc)
            (inter->op->flags & LIGHTREC_LOCAL_BRANCH) &&
            (s16)inter->op->c.i.imm >= 0) {
                next_pc = old_pc + ((1 + (s16)inter->op->c.i.imm) << 2);
-               next_pc = lightrec_emulate_block(inter->block, next_pc);
+               next_pc = lightrec_emulate_block(inter->state, inter->block, next_pc);
        }
 
        return next_pc;
@@ -399,10 +411,10 @@ static u32 int_branch(struct interpreter *inter, u32 pc,
 
 static u32 int_beq(struct interpreter *inter, bool bne)
 {
-       u32 rs, rt, old_pc = inter->block->pc + inter->op->offset * sizeof(u32);
+       u32 rs, rt, old_pc = int_get_branch_pc(inter);
 
-       rs = inter->state->native_reg_cache[inter->op->i.rs];
-       rt = inter->state->native_reg_cache[inter->op->i.rt];
+       rs = inter->state->regs.gpr[inter->op->i.rs];
+       rt = inter->state->regs.gpr[inter->op->i.rt];
 
        return int_branch(inter, old_pc, inter->op->c, (rs == rt) ^ bne);
 }
@@ -419,13 +431,13 @@ static u32 int_BNE(struct interpreter *inter)
 
 static u32 int_bgez(struct interpreter *inter, bool link, bool lt, bool regimm)
 {
-       u32 old_pc = inter->block->pc + inter->op->offset * sizeof(u32);
+       u32 old_pc = int_get_branch_pc(inter);
        s32 rs;
 
        if (link)
-               inter->state->native_reg_cache[31] = old_pc + 8;
+               inter->state->regs.gpr[31] = old_pc + 8;
 
-       rs = (s32)inter->state->native_reg_cache[inter->op->i.rs];
+       rs = (s32)inter->state->regs.gpr[inter->op->i.rs];
 
        return int_branch(inter, old_pc, inter->op->c,
                          ((regimm && !rs) || rs > 0) ^ lt);
@@ -470,7 +482,7 @@ static u32 int_cfc(struct interpreter *inter)
        val = lightrec_mfc(state, op->c);
 
        if (likely(op->r.rt))
-               state->native_reg_cache[op->r.rt] = val;
+               state->regs.gpr[op->r.rt] = val;
 
        return jump_next(inter);
 }
@@ -480,54 +492,35 @@ static u32 int_ctc(struct interpreter *inter)
        struct lightrec_state *state = inter->state;
        const struct opcode *op = inter->op;
 
-       lightrec_mtc(state, op->c, state->native_reg_cache[op->r.rt]);
+       lightrec_mtc(state, op->c, state->regs.gpr[op->r.rt]);
 
        /* If we have a MTC0 or CTC0 to CP0 register 12 (Status) or 13 (Cause),
         * return early so that the emulator will be able to check software
         * interrupt status. */
        if (!(inter->op->flags & LIGHTREC_NO_DS) &&
            op->i.op == OP_CP0 && (op->r.rd == 12 || op->r.rd == 13))
-               return inter->block->pc + (op->offset + 1) * sizeof(u32);
+               return int_get_ds_pc(inter, 1);
        else
                return jump_next(inter);
 }
 
 static u32 int_cp0_RFE(struct interpreter *inter)
 {
-       struct lightrec_state *state = inter->state;
-       u32 status;
-
-       /* Read CP0 Status register (r12) */
-       status = state->ops.cop0_ops.mfc(state, inter->op->c.opcode, 12);
-
-       /* Switch the bits */
-       status = ((status & 0x3c) >> 2) | (status & ~0xf);
-
-       /* Write it back */
-       state->ops.cop0_ops.ctc(state, inter->op->c.opcode, 12, status);
+       lightrec_rfe(inter->state);
 
        return jump_next(inter);
 }
 
 static u32 int_CP(struct interpreter *inter)
 {
-       struct lightrec_state *state = inter->state;
-       const struct lightrec_cop_ops *ops;
-       const struct opcode *op = inter->op;
-
-       if ((op->j.imm >> 25) & 1)
-               ops = &state->ops.cop2_ops;
-       else
-               ops = &state->ops.cop0_ops;
-
-       (*ops->op)(state, (op->j.imm) & ~(1 << 25));
+       lightrec_cp(inter->state, inter->op->c);
 
        return jump_next(inter);
 }
 
 static u32 int_ADDI(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        struct opcode_i *op = &inter->op->i;
 
        if (likely(op->rt))
@@ -538,7 +531,7 @@ static u32 int_ADDI(struct interpreter *inter)
 
 static u32 int_SLTI(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        struct opcode_i *op = &inter->op->i;
 
        if (likely(op->rt))
@@ -549,7 +542,7 @@ static u32 int_SLTI(struct interpreter *inter)
 
 static u32 int_SLTIU(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        struct opcode_i *op = &inter->op->i;
 
        if (likely(op->rt))
@@ -560,7 +553,7 @@ static u32 int_SLTIU(struct interpreter *inter)
 
 static u32 int_ANDI(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        struct opcode_i *op = &inter->op->i;
 
        if (likely(op->rt))
@@ -571,7 +564,7 @@ static u32 int_ANDI(struct interpreter *inter)
 
 static u32 int_ORI(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        struct opcode_i *op = &inter->op->i;
 
        if (likely(op->rt))
@@ -582,7 +575,7 @@ static u32 int_ORI(struct interpreter *inter)
 
 static u32 int_XORI(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        struct opcode_i *op = &inter->op->i;
 
        if (likely(op->rt))
@@ -595,7 +588,7 @@ static u32 int_LUI(struct interpreter *inter)
 {
        struct opcode_i *op = &inter->op->i;
 
-       inter->state->native_reg_cache[op->rt] = op->imm << 16;
+       inter->state->regs.gpr[op->rt] = op->imm << 16;
 
        return jump_next(inter);
 }
@@ -603,12 +596,12 @@ static u32 int_LUI(struct interpreter *inter)
 static u32 int_io(struct interpreter *inter, bool is_load)
 {
        struct opcode_i *op = &inter->op->i;
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        u32 val;
 
        val = lightrec_rw(inter->state, inter->op->c,
                          reg_cache[op->rs], reg_cache[op->rt],
-                         &inter->op->flags);
+                         &inter->op->flags, inter->block);
 
        if (is_load && op->rt)
                reg_cache[op->rt] = val;
@@ -629,11 +622,11 @@ static u32 int_store(struct interpreter *inter)
                return int_io(inter, false);
 
        lightrec_rw(inter->state, inter->op->c,
-                   inter->state->native_reg_cache[inter->op->i.rs],
-                   inter->state->native_reg_cache[inter->op->i.rt],
-                   &inter->op->flags);
+                   inter->state->regs.gpr[inter->op->i.rs],
+                   inter->state->regs.gpr[inter->op->i.rt],
+                   &inter->op->flags, inter->block);
 
-       next_pc = inter->block->pc + (inter->op->offset + 1) * 4;
+       next_pc = int_get_ds_pc(inter, 1);
 
        /* Invalidate next PC, to force the rest of the block to be rebuilt */
        lightrec_invalidate(inter->state, next_pc, 4);
@@ -652,8 +645,8 @@ static u32 int_special_SLL(struct interpreter *inter)
        u32 rt;
 
        if (op->opcode) { /* Handle NOPs */
-               rt = inter->state->native_reg_cache[op->r.rt];
-               inter->state->native_reg_cache[op->r.rd] = rt << op->r.imm;
+               rt = inter->state->regs.gpr[op->r.rt];
+               inter->state->regs.gpr[op->r.rd] = rt << op->r.imm;
        }
 
        return jump_next(inter);
@@ -662,9 +655,9 @@ static u32 int_special_SLL(struct interpreter *inter)
 static u32 int_special_SRL(struct interpreter *inter)
 {
        struct opcode *op = inter->op;
-       u32 rt = inter->state->native_reg_cache[op->r.rt];
+       u32 rt = inter->state->regs.gpr[op->r.rt];
 
-       inter->state->native_reg_cache[op->r.rd] = rt >> op->r.imm;
+       inter->state->regs.gpr[op->r.rd] = rt >> op->r.imm;
 
        return jump_next(inter);
 }
@@ -672,9 +665,9 @@ static u32 int_special_SRL(struct interpreter *inter)
 static u32 int_special_SRA(struct interpreter *inter)
 {
        struct opcode *op = inter->op;
-       s32 rt = inter->state->native_reg_cache[op->r.rt];
+       s32 rt = inter->state->regs.gpr[op->r.rt];
 
-       inter->state->native_reg_cache[op->r.rd] = rt >> op->r.imm;
+       inter->state->regs.gpr[op->r.rd] = rt >> op->r.imm;
 
        return jump_next(inter);
 }
@@ -682,10 +675,10 @@ static u32 int_special_SRA(struct interpreter *inter)
 static u32 int_special_SLLV(struct interpreter *inter)
 {
        struct opcode *op = inter->op;
-       u32 rs = inter->state->native_reg_cache[op->r.rs];
-       u32 rt = inter->state->native_reg_cache[op->r.rt];
+       u32 rs = inter->state->regs.gpr[op->r.rs];
+       u32 rt = inter->state->regs.gpr[op->r.rt];
 
-       inter->state->native_reg_cache[op->r.rd] = rt << (rs & 0x1f);
+       inter->state->regs.gpr[op->r.rd] = rt << (rs & 0x1f);
 
        return jump_next(inter);
 }
@@ -693,10 +686,10 @@ static u32 int_special_SLLV(struct interpreter *inter)
 static u32 int_special_SRLV(struct interpreter *inter)
 {
        struct opcode *op = inter->op;
-       u32 rs = inter->state->native_reg_cache[op->r.rs];
-       u32 rt = inter->state->native_reg_cache[op->r.rt];
+       u32 rs = inter->state->regs.gpr[op->r.rs];
+       u32 rt = inter->state->regs.gpr[op->r.rt];
 
-       inter->state->native_reg_cache[op->r.rd] = rt >> (rs & 0x1f);
+       inter->state->regs.gpr[op->r.rd] = rt >> (rs & 0x1f);
 
        return jump_next(inter);
 }
@@ -704,10 +697,10 @@ static u32 int_special_SRLV(struct interpreter *inter)
 static u32 int_special_SRAV(struct interpreter *inter)
 {
        struct opcode *op = inter->op;
-       u32 rs = inter->state->native_reg_cache[op->r.rs];
-       s32 rt = inter->state->native_reg_cache[op->r.rt];
+       u32 rs = inter->state->regs.gpr[op->r.rs];
+       s32 rt = inter->state->regs.gpr[op->r.rt];
 
-       inter->state->native_reg_cache[op->r.rd] = rt >> (rs & 0x1f);
+       inter->state->regs.gpr[op->r.rd] = rt >> (rs & 0x1f);
 
        return jump_next(inter);
 }
@@ -720,12 +713,12 @@ static u32 int_syscall_break(struct interpreter *inter)
        else
                inter->state->exit_flags |= LIGHTREC_EXIT_SYSCALL;
 
-       return inter->block->pc + inter->op->offset * sizeof(u32);
+       return int_get_ds_pc(inter, 0);
 }
 
 static u32 int_special_MFHI(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        struct opcode_r *op = &inter->op->r;
 
        if (likely(op->rd))
@@ -736,7 +729,7 @@ static u32 int_special_MFHI(struct interpreter *inter)
 
 static u32 int_special_MTHI(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
 
        reg_cache[REG_HI] = reg_cache[inter->op->r.rs];
 
@@ -745,7 +738,7 @@ static u32 int_special_MTHI(struct interpreter *inter)
 
 static u32 int_special_MFLO(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        struct opcode_r *op = &inter->op->r;
 
        if (likely(op->rd))
@@ -756,7 +749,7 @@ static u32 int_special_MFLO(struct interpreter *inter)
 
 static u32 int_special_MTLO(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
 
        reg_cache[REG_LO] = reg_cache[inter->op->r.rs];
 
@@ -765,61 +758,70 @@ static u32 int_special_MTLO(struct interpreter *inter)
 
 static u32 int_special_MULT(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        s32 rs = reg_cache[inter->op->r.rs];
        s32 rt = reg_cache[inter->op->r.rt];
+       u8 reg_lo = get_mult_div_lo(inter->op->c);
+       u8 reg_hi = get_mult_div_hi(inter->op->c);
        u64 res = (s64)rs * (s64)rt;
 
-       if (!(inter->op->flags & LIGHTREC_MULT32))
-               reg_cache[REG_HI] = res >> 32;
-       reg_cache[REG_LO] = res;
+       if (!(inter->op->flags & LIGHTREC_NO_HI))
+               reg_cache[reg_hi] = res >> 32;
+       if (!(inter->op->flags & LIGHTREC_NO_LO))
+               reg_cache[reg_lo] = res;
 
        return jump_next(inter);
 }
 
 static u32 int_special_MULTU(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        u32 rs = reg_cache[inter->op->r.rs];
        u32 rt = reg_cache[inter->op->r.rt];
+       u8 reg_lo = get_mult_div_lo(inter->op->c);
+       u8 reg_hi = get_mult_div_hi(inter->op->c);
        u64 res = (u64)rs * (u64)rt;
 
-       if (!(inter->op->flags & LIGHTREC_MULT32))
-               reg_cache[REG_HI] = res >> 32;
-       reg_cache[REG_LO] = res;
+       if (!(inter->op->flags & LIGHTREC_NO_HI))
+               reg_cache[reg_hi] = res >> 32;
+       if (!(inter->op->flags & LIGHTREC_NO_LO))
+               reg_cache[reg_lo] = res;
 
        return jump_next(inter);
 }
 
 static u32 int_special_DIV(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        s32 rs = reg_cache[inter->op->r.rs];
        s32 rt = reg_cache[inter->op->r.rt];
+       u8 reg_lo = get_mult_div_lo(inter->op->c);
+       u8 reg_hi = get_mult_div_hi(inter->op->c);
        u32 lo, hi;
 
        if (rt == 0) {
                hi = rs;
                lo = (rs < 0) * 2 - 1;
-       } else if ((rs == 0x80000000) && (rt == 0xFFFFFFFF)) {
-               lo = rs;
-               hi = 0;
        } else {
                lo = rs / rt;
                hi = rs % rt;
        }
 
-       reg_cache[REG_HI] = hi;
-       reg_cache[REG_LO] = lo;
+       if (!(inter->op->flags & LIGHTREC_NO_HI))
+               reg_cache[reg_hi] = hi;
+       if (!(inter->op->flags & LIGHTREC_NO_LO))
+               reg_cache[reg_lo] = lo;
 
        return jump_next(inter);
 }
 
 static u32 int_special_DIVU(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        u32 rs = reg_cache[inter->op->r.rs];
        u32 rt = reg_cache[inter->op->r.rt];
+       u8 reg_lo = get_mult_div_lo(inter->op->c);
+       u8 reg_hi = get_mult_div_hi(inter->op->c);
        u32 lo, hi;
 
        if (rt == 0) {
@@ -830,15 +832,17 @@ static u32 int_special_DIVU(struct interpreter *inter)
                hi = rs % rt;
        }
 
-       reg_cache[REG_HI] = hi;
-       reg_cache[REG_LO] = lo;
+       if (!(inter->op->flags & LIGHTREC_NO_HI))
+               reg_cache[reg_hi] = hi;
+       if (!(inter->op->flags & LIGHTREC_NO_LO))
+               reg_cache[reg_lo] = lo;
 
        return jump_next(inter);
 }
 
 static u32 int_special_ADD(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        struct opcode_r *op = &inter->op->r;
        s32 rs = reg_cache[op->rs];
        s32 rt = reg_cache[op->rt];
@@ -851,7 +855,7 @@ static u32 int_special_ADD(struct interpreter *inter)
 
 static u32 int_special_SUB(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        struct opcode_r *op = &inter->op->r;
        u32 rs = reg_cache[op->rs];
        u32 rt = reg_cache[op->rt];
@@ -864,7 +868,7 @@ static u32 int_special_SUB(struct interpreter *inter)
 
 static u32 int_special_AND(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        struct opcode_r *op = &inter->op->r;
        u32 rs = reg_cache[op->rs];
        u32 rt = reg_cache[op->rt];
@@ -877,7 +881,7 @@ static u32 int_special_AND(struct interpreter *inter)
 
 static u32 int_special_OR(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        struct opcode_r *op = &inter->op->r;
        u32 rs = reg_cache[op->rs];
        u32 rt = reg_cache[op->rt];
@@ -890,7 +894,7 @@ static u32 int_special_OR(struct interpreter *inter)
 
 static u32 int_special_XOR(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        struct opcode_r *op = &inter->op->r;
        u32 rs = reg_cache[op->rs];
        u32 rt = reg_cache[op->rt];
@@ -903,7 +907,7 @@ static u32 int_special_XOR(struct interpreter *inter)
 
 static u32 int_special_NOR(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        struct opcode_r *op = &inter->op->r;
        u32 rs = reg_cache[op->rs];
        u32 rt = reg_cache[op->rt];
@@ -916,7 +920,7 @@ static u32 int_special_NOR(struct interpreter *inter)
 
 static u32 int_special_SLT(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        struct opcode_r *op = &inter->op->r;
        s32 rs = reg_cache[op->rs];
        s32 rt = reg_cache[op->rt];
@@ -929,7 +933,7 @@ static u32 int_special_SLT(struct interpreter *inter)
 
 static u32 int_special_SLTU(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        struct opcode_r *op = &inter->op->r;
        u32 rs = reg_cache[op->rs];
        u32 rt = reg_cache[op->rt];
@@ -940,14 +944,9 @@ static u32 int_special_SLTU(struct interpreter *inter)
        return jump_next(inter);
 }
 
-static u32 int_META_SKIP(struct interpreter *inter)
-{
-       return jump_skip(inter);
-}
-
 static u32 int_META_MOV(struct interpreter *inter)
 {
-       u32 *reg_cache = inter->state->native_reg_cache;
+       u32 *reg_cache = inter->state->regs.gpr;
        struct opcode_r *op = &inter->op->r;
 
        if (likely(op->rd))
@@ -956,15 +955,30 @@ static u32 int_META_MOV(struct interpreter *inter)
        return jump_next(inter);
 }
 
-static u32 int_META_SYNC(struct interpreter *inter)
+static u32 int_META_EXTC(struct interpreter *inter)
 {
-       inter->state->current_cycle += inter->cycles;
-       inter->cycles = 0;
+       u32 *reg_cache = inter->state->regs.gpr;
+       struct opcode_i *op = &inter->op->i;
 
-       return jump_skip(inter);
+       if (likely(op->rt))
+               reg_cache[op->rt] = (u32)(s32)(s8)reg_cache[op->rs];
+
+       return jump_next(inter);
+}
+
+static u32 int_META_EXTS(struct interpreter *inter)
+{
+       u32 *reg_cache = inter->state->regs.gpr;
+       struct opcode_i *op = &inter->op->i;
+
+       if (likely(op->rt))
+               reg_cache[op->rt] = (u32)(s32)(s16)reg_cache[op->rs];
+
+       return jump_next(inter);
 }
 
 static const lightrec_int_func_t int_standard[64] = {
+       SET_DEFAULT_ELM(int_standard, int_unimplemented),
        [OP_SPECIAL]            = int_SPECIAL,
        [OP_REGIMM]             = int_REGIMM,
        [OP_J]                  = int_J,
@@ -998,14 +1012,13 @@ static const lightrec_int_func_t int_standard[64] = {
        [OP_LWC2]               = int_LWC2,
        [OP_SWC2]               = int_store,
 
-       [OP_META_REG_UNLOAD]    = int_META_SKIP,
-       [OP_META_BEQZ]          = int_BEQ,
-       [OP_META_BNEZ]          = int_BNE,
        [OP_META_MOV]           = int_META_MOV,
-       [OP_META_SYNC]          = int_META_SYNC,
+       [OP_META_EXTC]          = int_META_EXTC,
+       [OP_META_EXTS]          = int_META_EXTS,
 };
 
 static const lightrec_int_func_t int_special[64] = {
+       SET_DEFAULT_ELM(int_special, int_unimplemented),
        [OP_SPECIAL_SLL]        = int_special_SLL,
        [OP_SPECIAL_SRL]        = int_special_SRL,
        [OP_SPECIAL_SRA]        = int_special_SRA,
@@ -1037,6 +1050,7 @@ static const lightrec_int_func_t int_special[64] = {
 };
 
 static const lightrec_int_func_t int_regimm[64] = {
+       SET_DEFAULT_ELM(int_regimm, int_unimplemented),
        [OP_REGIMM_BLTZ]        = int_regimm_BLTZ,
        [OP_REGIMM_BGEZ]        = int_regimm_BGEZ,
        [OP_REGIMM_BLTZAL]      = int_regimm_BLTZAL,
@@ -1044,6 +1058,7 @@ static const lightrec_int_func_t int_regimm[64] = {
 };
 
 static const lightrec_int_func_t int_cp0[64] = {
+       SET_DEFAULT_ELM(int_cp0, int_CP),
        [OP_CP0_MFC0]           = int_cfc,
        [OP_CP0_CFC0]           = int_cfc,
        [OP_CP0_MTC0]           = int_ctc,
@@ -1052,6 +1067,7 @@ static const lightrec_int_func_t int_cp0[64] = {
 };
 
 static const lightrec_int_func_t int_cp2_basic[64] = {
+       SET_DEFAULT_ELM(int_cp2_basic, int_CP),
        [OP_CP2_BASIC_MFC2]     = int_cfc,
        [OP_CP2_BASIC_CFC2]     = int_cfc,
        [OP_CP2_BASIC_MTC2]     = int_ctc,
@@ -1061,54 +1077,54 @@ static const lightrec_int_func_t int_cp2_basic[64] = {
 static u32 int_SPECIAL(struct interpreter *inter)
 {
        lightrec_int_func_t f = int_special[inter->op->r.op];
-       if (likely(f))
-               return execute(f, inter);
-       else
+
+       if (!HAS_DEFAULT_ELM && unlikely(!f))
                return int_unimplemented(inter);
+
+       return execute(f, inter);
 }
 
 static u32 int_REGIMM(struct interpreter *inter)
 {
        lightrec_int_func_t f = int_regimm[inter->op->r.rt];
-       if (likely(f))
-               return execute(f, inter);
-       else
+
+       if (!HAS_DEFAULT_ELM && unlikely(!f))
                return int_unimplemented(inter);
+
+       return execute(f, inter);
 }
 
 static u32 int_CP0(struct interpreter *inter)
 {
        lightrec_int_func_t f = int_cp0[inter->op->r.rs];
-       if (likely(f))
-               return execute(f, inter);
-       else
+
+       if (!HAS_DEFAULT_ELM && unlikely(!f))
                return int_CP(inter);
+
+       return execute(f, inter);
 }
 
 static u32 int_CP2(struct interpreter *inter)
 {
        if (inter->op->r.op == OP_CP2_BASIC) {
                lightrec_int_func_t f = int_cp2_basic[inter->op->r.rs];
-               if (likely(f))
+               if (HAS_DEFAULT_ELM || likely(f))
                        return execute(f, inter);
        }
 
        return int_CP(inter);
 }
 
-static u32 lightrec_int_op(struct interpreter *inter)
-{
-       return execute(int_standard[inter->op->i.op], inter);
-}
-
-static u32 lightrec_emulate_block_list(struct block *block, struct opcode *op)
+static u32 lightrec_emulate_block_list(struct lightrec_state *state,
+                                      struct block *block, u32 offset)
 {
        struct interpreter inter;
        u32 pc;
 
        inter.block = block;
-       inter.state = block->state;
-       inter.op = op;
+       inter.state = state;
+       inter.offset = offset;
+       inter.op = &block->opcode_list[offset];
        inter.cycles = 0;
        inter.delay_slot = false;
 
@@ -1117,20 +1133,17 @@ static u32 lightrec_emulate_block_list(struct block *block, struct opcode *op)
        /* Add the cycles of the last branch */
        inter.cycles += lightrec_cycles_of_opcode(inter.op->c);
 
-       block->state->current_cycle += inter.cycles;
+       state->current_cycle += inter.cycles;
 
        return pc;
 }
 
-u32 lightrec_emulate_block(struct block *block, u32 pc)
+u32 lightrec_emulate_block(struct lightrec_state *state, struct block *block, u32 pc)
 {
        u32 offset = (kunseg(pc) - kunseg(block->pc)) >> 2;
-       struct opcode *op;
 
-       for (op = block->opcode_list;
-            op && (op->offset < offset); op = op->next);
-       if (op)
-               return lightrec_emulate_block_list(block, op);
+       if (offset < block->nb_ops)
+               return lightrec_emulate_block_list(state, block, offset);
 
        pr_err("PC 0x%x is outside block at PC 0x%x\n", pc, block->pc);
 
index 2113779..96600bf 100644 (file)
@@ -1,15 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
 /*
- * Copyright (C) 2019-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2019-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #ifndef __LIGHTREC_INTERPRETER_H__
@@ -19,6 +10,6 @@
 
 struct block;
 
-u32 lightrec_emulate_block(struct block *block, u32 pc);
+u32 lightrec_emulate_block(struct lightrec_state *state, struct block *block, u32 pc);
 
 #endif /* __LIGHTREC_INTERPRETER_H__ */
diff --git a/deps/lightrec/lightning-wrapper.h b/deps/lightrec/lightning-wrapper.h
new file mode 100644 (file)
index 0000000..7eeb15f
--- /dev/null
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/*
+ * Copyright (C) 2022 Paul Cercueil <paul@crapouillou.net>
+ */
+
+#ifndef __LIGHTNING_WRAPPER_H__
+#define __LIGHTNING_WRAPPER_H__
+
+#include <lightning.h>
+
+/*
+ * When the word size is 32 bits, alias the "unsigned int" (_ui) flavours of
+ * the following GNU Lightning macros to their "int" (_i) or generic
+ * counterparts, and turn the int extension macros into plain register moves.
+ * NOTE(review): presumably GNU Lightning does not provide these macros on
+ * 32-bit targets, where an int already fills a whole register - confirm.
+ */
+#if __WORDSIZE == 32
+
+#define jit_ldxi_ui(u,v,w)     jit_ldxi_i(u,v,w)
+#define jit_stxi_ui(u,v,w)     jit_stxi_i(u,v,w)
+#define jit_extr_i(u,v)                jit_movr(u,v)
+#define jit_extr_ui(u,v)       jit_movr(u,v)
+#define jit_retval_ui(u)       jit_retval(u)
+#define jit_getarg_ui(u,v)     jit_getarg_i(u,v)
+
+#endif
+
+#endif /* __LIGHTNING_WRAPPER_H__ */
diff --git a/deps/lightrec/lightrec-config.h.cmakein b/deps/lightrec/lightrec-config.h.cmakein
new file mode 100644 (file)
index 0000000..3cef2b8
--- /dev/null
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/*
+ * Copyright (C) 2019-2021 Paul Cercueil <paul@crapouillou.net>
+ */
+
+#ifndef __LIGHTREC_CONFIG_H__
+#define __LIGHTREC_CONFIG_H__
+
+/*
+ * Template processed by CMake's configure_file(): each cmakedefine01 line
+ * below is turned into a "#define NAME 0" or "#define NAME 1", so every
+ * option is always defined and can be tested with a plain #if or if ().
+ */
+
+/* NOTE(review): names suggest optional build-time features - confirm */
+#cmakedefine01 ENABLE_THREADED_COMPILER
+#cmakedefine01 ENABLE_FIRST_PASS
+#cmakedefine01 ENABLE_DISASSEMBLER
+#cmakedefine01 ENABLE_TINYMM
+
+/* Selects which variant of SET_DEFAULT_ELM() (lightrec-private.h) is used */
+#cmakedefine01 HAS_DEFAULT_ELM
+
+/* NOTE(review): presumably one switch per optimizer pass - confirm */
+#cmakedefine01 OPT_REMOVE_DIV_BY_ZERO_SEQ
+#cmakedefine01 OPT_REPLACE_MEMSET
+#cmakedefine01 OPT_DETECT_IMPOSSIBLE_BRANCHES
+#cmakedefine01 OPT_TRANSFORM_OPS
+#cmakedefine01 OPT_LOCAL_BRANCHES
+#cmakedefine01 OPT_SWITCH_DELAY_SLOTS
+#cmakedefine01 OPT_FLAG_STORES
+#cmakedefine01 OPT_FLAG_IO
+#cmakedefine01 OPT_FLAG_MULT_DIV
+#cmakedefine01 OPT_EARLY_UNLOAD
+
+#endif /* __LIGHTREC_CONFIG_H__ */
+
index 6304515..e9efcb5 100644 (file)
@@ -1,21 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
 /*
- * Copyright (C) 2016-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2016-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #ifndef __LIGHTREC_PRIVATE_H__
 #define __LIGHTREC_PRIVATE_H__
 
-#include "config.h"
+#include "lightrec-config.h"
 #include "disassembler.h"
 #include "lightrec.h"
 
@@ -24,7 +15,6 @@
 #endif
 
 #define ARRAY_SIZE(x) (sizeof(x) ? sizeof(x) / sizeof((x)[0]) : 0)
-#define BIT(x) (1 << (x))
 
 #ifdef __GNUC__
 #      define likely(x)       __builtin_expect(!!(x),1)
 #      define HTOLE16(x)       (x)
 #endif
 
+/*
+ * SET_DEFAULT_ELM(table, value) is meant to be the first initializer of a
+ * table built with designated initializers.
+ *
+ * With HAS_DEFAULT_ELM, it uses the GNU C range designator to preset every
+ * slot of 'table' to 'value'; designated entries listed after it override
+ * that default. Without it, only slot 0 is explicitly set to NULL and
+ * 'value' is ignored: slots that are not listed stay NULL, so code indexing
+ * the table must check for NULL itself.
+ */
+#if HAS_DEFAULT_ELM
+#define SET_DEFAULT_ELM(table, value) [0 ... ARRAY_SIZE(table) - 1] = value
+#else
+#define SET_DEFAULT_ELM(table, value) [0] = NULL
+#endif
+
 /* Flags for (struct block *)->flags */
 #define BLOCK_NEVER_COMPILE    BIT(0)
 #define BLOCK_SHOULD_RECOMPILE BIT(1)
 #define BLOCK_FULLY_TAGGED     BIT(2)
 #define BLOCK_IS_DEAD          BIT(3)
+#define BLOCK_IS_MEMSET                BIT(4)
 
 #define RAM_SIZE       0x200000
 #define BIOS_SIZE      0x80000
 
 #define CODE_LUT_SIZE  ((RAM_SIZE + BIOS_SIZE) >> 2)
 
+#define REG_LO 32
+#define REG_HI 33
+
 /* Definition of jit_state_t (avoids inclusion of <lightning.h>) */
 struct jit_node;
 struct jit_state;
@@ -71,19 +71,18 @@ struct reaper;
 
 struct block {
        jit_state_t *_jit;
-       struct lightrec_state *state;
        struct opcode *opcode_list;
        void (*function)(void);
+       const u32 *code;
+       struct block *next;
        u32 pc;
        u32 hash;
+       unsigned int code_size;
+       u16 nb_ops;
+       u8 flags;
 #if ENABLE_THREADED_COMPILER
        atomic_flag op_list_freed;
 #endif
-       unsigned int code_size;
-       u16 flags;
-       u16 nb_ops;
-       const struct lightrec_mem_map *map;
-       struct block *next;
 };
 
 struct lightrec_branch {
@@ -96,33 +95,50 @@ struct lightrec_branch_target {
        u32 offset;
 };
 
-struct lightrec_state {
-       u32 native_reg_cache[34];
-       u32 next_pc;
-       u32 current_cycle;
-       u32 target_cycle;
-       u32 exit_flags;
-       struct block *dispatcher, *rw_wrapper, *rw_generic_wrapper,
-                    *mfc_wrapper, *mtc_wrapper, *rfe_wrapper, *cp_wrapper,
-                    *syscall_wrapper, *break_wrapper;
-       void *rw_func, *rw_generic_func, *mfc_func, *mtc_func, *rfe_func,
-            *cp_func, *syscall_func, *break_func;
+/*
+ * Indices into the c_wrappers[] array of struct lightrec_state.
+ * C_WRAPPERS_COUNT must stay last: it is the size of that array.
+ */
+enum c_wrappers {
+       C_WRAPPER_RW,
+       C_WRAPPER_RW_GENERIC,
+       C_WRAPPER_MFC,
+       C_WRAPPER_MTC,
+       C_WRAPPER_CP,
+       C_WRAPPER_SYSCALL,
+       C_WRAPPER_BREAK,
+       C_WRAPPERS_COUNT,
+};
+
+struct lightrec_cstate {
+       struct lightrec_state *state;
+
        struct jit_node *branches[512];
        struct lightrec_branch local_branches[512];
        struct lightrec_branch_target targets[512];
        unsigned int nb_branches;
        unsigned int nb_local_branches;
        unsigned int nb_targets;
+       unsigned int cycles;
+
+       struct regcache *reg_cache;
+};
+
+struct lightrec_state {
+       struct lightrec_registers regs;
+       u32 next_pc;
+       u32 current_cycle;
+       u32 target_cycle;
+       u32 exit_flags;
+       u32 old_cycle_counter;
+       struct block *dispatcher, *c_wrapper_block;
+       void *c_wrapper, *c_wrappers[C_WRAPPERS_COUNT];
        struct tinymm *tinymm;
        struct blockcache *block_cache;
-       struct regcache *reg_cache;
        struct recompiler *rec;
+       struct lightrec_cstate *cstate;
        struct reaper *reaper;
        void (*eob_wrapper_func)(void);
+       void (*memset_func)(void);
        void (*get_next_block)(void);
        struct lightrec_ops ops;
        unsigned int nb_precompile;
-       unsigned int cycles;
        unsigned int nb_maps;
        const struct lightrec_mem_map *maps;
        uintptr_t offset_ram, offset_bios, offset_scratch;
@@ -132,12 +148,16 @@ struct lightrec_state {
 };
 
 u32 lightrec_rw(struct lightrec_state *state, union code op,
-               u32 addr, u32 data, u16 *flags);
+               u32 addr, u32 data, u16 *flags,
+               struct block *block);
 
-void lightrec_free_block(struct block *block);
+void lightrec_free_block(struct lightrec_state *state, struct block *block);
 
 void remove_from_code_lut(struct blockcache *cache, struct block *block);
 
+const struct lightrec_mem_map *
+lightrec_get_map(struct lightrec_state *state, void **host, u32 kaddr);
+
 static inline u32 kunseg(u32 addr)
 {
        if (unlikely(addr >= 0xa0000000))
@@ -154,12 +174,48 @@ static inline u32 lut_offset(u32 pc)
                return (pc & (RAM_SIZE - 1)) >> 2; // RAM
 }
 
+/*
+ * Return the PC located 'imm' opcodes away from the opcode at index 'offset'
+ * of 'block'. When OPT_SWITCH_DELAY_SLOTS is enabled and that opcode carries
+ * the LIGHTREC_NO_DS flag, the index is moved one opcode forward first.
+ */
+static inline u32 get_ds_pc(const struct block *block, u16 offset, s16 imm)
+{
+       const struct opcode *op = &block->opcode_list[offset];
+
+       if (OPT_SWITCH_DELAY_SLOTS && (op->flags & LIGHTREC_NO_DS))
+               offset++;
+
+       /* Opcodes are 4 bytes wide: convert the opcode index to bytes */
+       return block->pc + ((offset + imm) << 2);
+}
+
+/*
+ * Return the PC located 'imm' opcodes away from the opcode at index 'offset'
+ * of 'block'. When OPT_SWITCH_DELAY_SLOTS is enabled and that opcode carries
+ * the LIGHTREC_NO_DS flag, the index is moved one opcode backwards first.
+ */
+static inline u32 get_branch_pc(const struct block *block, u16 offset, s16 imm)
+{
+       const struct opcode *op = &block->opcode_list[offset];
+
+       if (OPT_SWITCH_DELAY_SLOTS && (op->flags & LIGHTREC_NO_DS))
+               offset--;
+
+       /* Opcodes are 4 bytes wide: convert the opcode index to bytes */
+       return block->pc + ((offset + imm) << 2);
+}
+
 void lightrec_mtc(struct lightrec_state *state, union code op, u32 data);
 u32 lightrec_mfc(struct lightrec_state *state, union code op);
+void lightrec_rfe(struct lightrec_state *state);
+void lightrec_cp(struct lightrec_state *state, union code op);
+
+struct lightrec_cstate * lightrec_create_cstate(struct lightrec_state *state);
+void lightrec_free_cstate(struct lightrec_cstate *cstate);
 
 union code lightrec_read_opcode(struct lightrec_state *state, u32 pc);
 
 struct block * lightrec_get_block(struct lightrec_state *state, u32 pc);
-int lightrec_compile_block(struct block *block);
+int lightrec_compile_block(struct lightrec_cstate *cstate, struct block *block);
+void lightrec_free_opcode_list(struct lightrec_state *state, struct block *block);
+
+unsigned int lightrec_cycles_of_opcode(union code code);
+
+/*
+ * Return the 'rd' field of the opcode when OPT_FLAG_MULT_DIV is enabled and
+ * that field is non-zero, REG_LO otherwise.
+ */
+static inline u8 get_mult_div_lo(union code c)
+{
+       if (OPT_FLAG_MULT_DIV && c.r.rd)
+               return c.r.rd;
+
+       return REG_LO;
+}
+
+/*
+ * Return the 'imm' field of the opcode when OPT_FLAG_MULT_DIV is enabled and
+ * that field is non-zero, REG_HI otherwise.
+ */
+static inline u8 get_mult_div_hi(union code c)
+{
+       if (OPT_FLAG_MULT_DIV && c.r.imm)
+               return c.r.imm;
+
+       return REG_HI;
+}
 
 #endif /* __LIGHTREC_PRIVATE_H__ */
index 7fdf74a..3d4e1a2 100644 (file)
@@ -1,23 +1,15 @@
+// SPDX-License-Identifier: LGPL-2.1-or-later
 /*
- * Copyright (C) 2014-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2014-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #include "blockcache.h"
-#include "config.h"
 #include "debug.h"
 #include "disassembler.h"
 #include "emitter.h"
 #include "interpreter.h"
+#include "lightrec-config.h"
+#include "lightning-wrapper.h"
 #include "lightrec.h"
 #include "memmanager.h"
 #include "reaper.h"
@@ -26,7 +18,7 @@
 #include "optimizer.h"
 
 #include <errno.h>
-#include <lightning.h>
+#include <inttypes.h>
 #include <limits.h>
 #if ENABLE_THREADED_COMPILER
 #include <stdatomic.h>
 
 static struct block * lightrec_precompile_block(struct lightrec_state *state,
                                                u32 pc);
+static bool lightrec_block_is_fully_tagged(const struct block *block);
+
+static void lightrec_mtc2(struct lightrec_state *state, u8 reg, u32 data);
+static u32 lightrec_mfc2(struct lightrec_state *state, u8 reg);
 
 static void lightrec_default_sb(struct lightrec_state *state, u32 opcode,
                                void *host, u32 addr, u8 data)
@@ -98,11 +94,14 @@ static const struct lightrec_mem_map_ops lightrec_default_ops = {
        .lw = lightrec_default_lw,
 };
 
-static void __segfault_cb(struct lightrec_state *state, u32 addr)
+static void __segfault_cb(struct lightrec_state *state, u32 addr,
+                         const struct block *block)
 {
        lightrec_set_exit_flags(state, LIGHTREC_EXIT_SEGFAULT);
        pr_err("Segmentation fault in recompiled code: invalid "
               "load/store at address 0x%08x\n", addr);
+       if (block)
+               pr_err("Was executing block PC 0x%08x\n", block->pc);
 }
 
 static void lightrec_swl(struct lightrec_state *state,
@@ -147,7 +146,7 @@ static void lightrec_swc2(struct lightrec_state *state, union code op,
                          const struct lightrec_mem_map_ops *ops,
                          void *host, u32 addr)
 {
-       u32 data = state->ops.cop2_ops.mfc(state, op.opcode, op.i.rt);
+       u32 data = lightrec_mfc2(state, op.i.rt);
 
        ops->sw(state, op.opcode, host, addr, data);
 }
@@ -192,55 +191,64 @@ static void lightrec_lwc2(struct lightrec_state *state, union code op,
 {
        u32 data = ops->lw(state, op.opcode, host, addr);
 
-       state->ops.cop2_ops.mtc(state, op.opcode, op.i.rt, data);
+       lightrec_mtc2(state, op.i.rt, data);
 }
 
 static void lightrec_invalidate_map(struct lightrec_state *state,
-               const struct lightrec_mem_map *map, u32 addr)
+               const struct lightrec_mem_map *map, u32 addr, u32 len)
 {
-       if (map == &state->maps[PSX_MAP_KERNEL_USER_RAM])
-               state->code_lut[lut_offset(addr)] = NULL;
+       if (map == &state->maps[PSX_MAP_KERNEL_USER_RAM]) {
+               memset(&state->code_lut[lut_offset(addr)], 0,
+                      ((len + 3) / 4) * sizeof(void *));
+       }
 }
 
-static const struct lightrec_mem_map *
-lightrec_get_map(struct lightrec_state *state, u32 kaddr)
+const struct lightrec_mem_map *
+lightrec_get_map(struct lightrec_state *state, void **host, u32 kaddr)
 {
+       const struct lightrec_mem_map *map;
        unsigned int i;
+       u32 addr;
 
        for (i = 0; i < state->nb_maps; i++) {
-               const struct lightrec_mem_map *map = &state->maps[i];
+               const struct lightrec_mem_map *mapi = &state->maps[i];
 
-               if (kaddr >= map->pc && kaddr < map->pc + map->length)
-                       return map;
+               if (kaddr >= mapi->pc && kaddr < mapi->pc + mapi->length) {
+                       map = mapi;
+                       break;
+               }
        }
 
-       return NULL;
+       if (i == state->nb_maps)
+               return NULL;
+
+       addr = kaddr - map->pc;
+
+       while (map->mirror_of)
+               map = map->mirror_of;
+
+       if (host)
+               *host = map->address + addr;
+
+       return map;
 }
 
 u32 lightrec_rw(struct lightrec_state *state, union code op,
-               u32 addr, u32 data, u16 *flags)
+               u32 addr, u32 data, u16 *flags, struct block *block)
 {
        const struct lightrec_mem_map *map;
        const struct lightrec_mem_map_ops *ops;
-       u32 kaddr, pc, opcode = op.opcode;
+       u32 opcode = op.opcode;
        void *host;
 
        addr += (s16) op.i.imm;
-       kaddr = kunseg(addr);
 
-       map = lightrec_get_map(state, kaddr);
+       map = lightrec_get_map(state, &host, kunseg(addr));
        if (!map) {
-               __segfault_cb(state, addr);
+               __segfault_cb(state, addr, block);
                return 0;
        }
 
-       pc = map->pc;
-
-       while (map->mirror_of)
-               map = map->mirror_of;
-
-       host = (void *)((uintptr_t)map->address + kaddr - pc);
-
        if (unlikely(map->ops)) {
                if (flags)
                        *flags |= LIGHTREC_HW_IO;
@@ -294,11 +302,11 @@ u32 lightrec_rw(struct lightrec_state *state, union code op,
 }
 
 static void lightrec_rw_helper(struct lightrec_state *state,
-                              union code op, u16 *flags)
+                              union code op, u16 *flags,
+                              struct block *block)
 {
-       u32 ret = lightrec_rw(state, op,
-                         state->native_reg_cache[op.i.rs],
-                         state->native_reg_cache[op.i.rt], flags);
+       u32 ret = lightrec_rw(state, op, state->regs.gpr[op.i.rs],
+                             state->regs.gpr[op.i.rt], flags, block);
 
        switch (op.i.op) {
        case OP_LB:
@@ -309,7 +317,7 @@ static void lightrec_rw_helper(struct lightrec_state *state,
        case OP_LWR:
        case OP_LW:
                if (op.i.rt)
-                       state->native_reg_cache[op.i.rt] = ret;
+                       state->regs.gpr[op.i.rt] = ret;
        default: /* fall-through */
                break;
        }
@@ -317,43 +325,85 @@ static void lightrec_rw_helper(struct lightrec_state *state,
 
 static void lightrec_rw_cb(struct lightrec_state *state, union code op)
 {
-       lightrec_rw_helper(state, op, NULL);
+       lightrec_rw_helper(state, op, NULL, NULL);
 }
 
-static void lightrec_rw_generic_cb(struct lightrec_state *state,
-                                  struct opcode *op, struct block *block)
+static void lightrec_rw_generic_cb(struct lightrec_state *state, u32 arg)
 {
-       bool was_tagged = op->flags & (LIGHTREC_HW_IO | LIGHTREC_DIRECT_IO);
+       struct block *block;
+       struct opcode *op;
+       bool was_tagged;
+       u16 offset = (u16)arg;
+
+       block = lightrec_find_block_from_lut(state->block_cache,
+                                            arg >> 16, state->next_pc);
+       if (unlikely(!block)) {
+               pr_err("rw_generic: No block found in LUT for PC 0x%x offset 0x%x\n",
+                        state->next_pc, offset);
+               return;
+       }
+
+       op = &block->opcode_list[offset];
+       was_tagged = op->flags & (LIGHTREC_HW_IO | LIGHTREC_DIRECT_IO);
 
-       lightrec_rw_helper(state, op->c, &op->flags);
+       lightrec_rw_helper(state, op->c, &op->flags, block);
 
        if (!was_tagged) {
-               pr_debug("Opcode of block at PC 0x%08x offset 0x%x has been "
-                        "tagged - flag for recompilation\n",
-                        block->pc, op->offset << 2);
+               pr_debug("Opcode of block at PC 0x%08x has been tagged - flag "
+                        "for recompilation\n", block->pc);
 
                block->flags |= BLOCK_SHOULD_RECOMPILE;
        }
 }
 
-u32 lightrec_mfc(struct lightrec_state *state, union code op)
+static u32 clamp_s32(s32 val, s32 min, s32 max)
 {
-       bool is_cfc = (op.i.op == OP_CP0 && op.r.rs == OP_CP0_CFC0) ||
-                     (op.i.op == OP_CP2 && op.r.rs == OP_CP2_BASIC_CFC2);
-       u32 (*func)(struct lightrec_state *, u32, u8);
-       const struct lightrec_cop_ops *ops;
+       return val < min ? min : val > max ? max : val;
+}
 
-       if (op.i.op == OP_CP0)
-               ops = &state->ops.cop0_ops;
-       else
-               ops = &state->ops.cop2_ops;
+/*
+ * Read register 'reg' of the regs.cp2d[] bank, applying the per-register
+ * post-processing below. Returns the value as seen by the guest.
+ * NOTE(review): cp2d presumably holds the GTE data registers - confirm.
+ */
+static u32 lightrec_mfc2(struct lightrec_state *state, u8 reg)
+{
+       s16 gteir1, gteir2, gteir3;
+
+       switch (reg) {
+       case 1:
+       case 3:
+       case 5:
+       case 8:
+       case 9:
+       case 10:
+       case 11:
+               /* Low 16 bits, sign-extended to 32 bits */
+               return (s32)(s16) state->regs.cp2d[reg];
+       case 7:
+       case 16:
+       case 17:
+       case 18:
+       case 19:
+               /* Low 16 bits, zero-extended to 32 bits */
+               return (u16) state->regs.cp2d[reg];
+       case 28:
+       case 29:
+               /* Not read from storage: rebuilt from registers 9, 10 and 11,
+                * each one shifted right by 7, clamped to [0, 0x1f] and packed
+                * into consecutive 5-bit fields starting at bit 0. */
+               gteir1 = (s16) state->regs.cp2d[9];
+               gteir2 = (s16) state->regs.cp2d[10];
+               gteir3 = (s16) state->regs.cp2d[11];
+
+               return clamp_s32(gteir1 >> 7, 0, 0x1f) << 0 |
+                       clamp_s32(gteir2 >> 7, 0, 0x1f) << 5 |
+                       clamp_s32(gteir3 >> 7, 0, 0x1f) << 10;
+       case 15:
+               /* Reading register 15 returns the content of register 14 */
+               reg = 14;
+       default: /* fall-through */
+               return state->regs.cp2d[reg];
+       }
+}
 
-       if (is_cfc)
-               func = ops->cfc;
+u32 lightrec_mfc(struct lightrec_state *state, union code op)
+{
+       if (op.i.op == OP_CP0)
+               return state->regs.cp0[op.r.rd];
+       else if (op.r.rs == OP_CP2_BASIC_MFC2)
+               return lightrec_mfc2(state, op.r.rd);
        else
-               func = ops->mfc;
-
-       return (*func)(state, op.opcode, op.r.rd);
+               return state->regs.cp2c[op.r.rd];
 }
 
 static void lightrec_mfc_cb(struct lightrec_state *state, union code op)
@@ -361,58 +411,146 @@ static void lightrec_mfc_cb(struct lightrec_state *state, union code op)
        u32 rt = lightrec_mfc(state, op);
 
        if (op.r.rt)
-               state->native_reg_cache[op.r.rt] = rt;
+               state->regs.gpr[op.r.rt] = rt;
 }
 
-void lightrec_mtc(struct lightrec_state *state, union code op, u32 data)
+static void lightrec_mtc0(struct lightrec_state *state, u8 reg, u32 data)
 {
-       bool is_ctc = (op.i.op == OP_CP0 && op.r.rs == OP_CP0_CTC0) ||
-                     (op.i.op == OP_CP2 && op.r.rs == OP_CP2_BASIC_CTC2);
-       void (*func)(struct lightrec_state *, u32, u8, u32);
-       const struct lightrec_cop_ops *ops;
+       u32 status, cause;
+
+       switch (reg) {
+       case 1:
+       case 4:
+       case 8:
+       case 14:
+       case 15:
+               /* Those registers are read-only */
+               return;
+       default: /* fall-through */
+               break;
+       }
 
-       if (op.i.op == OP_CP0)
-               ops = &state->ops.cop0_ops;
-       else
-               ops = &state->ops.cop2_ops;
+       if (reg == 12) {
+               status = state->regs.cp0[12];
 
-       if (is_ctc)
-               func = ops->ctc;
-       else
-               func = ops->mtc;
+               if (status & ~data & BIT(16)) {
+                       state->ops.enable_ram(state, true);
+                       lightrec_invalidate_all(state);
+               } else if (~status & data & BIT(16)) {
+                       state->ops.enable_ram(state, false);
+               }
+       }
+
+       state->regs.cp0[reg] = data;
+
+       if (reg == 12 || reg == 13) {
+               cause = state->regs.cp0[13];
+               status = state->regs.cp0[12];
+
+               if (!!(status & cause & 0x300) & status)
+                       lightrec_set_exit_flags(state, LIGHTREC_EXIT_CHECK_INTERRUPT);
+       }
+}
+
+/*
+ * Return the number of consecutive most-significant bits of 'data' that are
+ * equal to its sign bit, the sign bit included. The result is in the range
+ * 1..32 (32 for 0 and for -1).
+ */
+static u32 count_leading_bits(s32 data)
+{
+       u32 bits, cnt = 33;
+
+       /* __has_builtin() is tested in a nested #if: a preprocessor lacking
+        * it cannot parse "defined(X) && X(...)" as a single expression. */
+#ifdef __has_builtin
+#if __has_builtin(__builtin_clrsb)
+       return 1 + __builtin_clrsb(data);
+#endif
+#endif
+
+       /* Portable fallback. Work on an unsigned copy so that the shifts
+        * are well defined and the loop below always reaches zero, even
+        * when bit 30 differs from the sign bit (e.g. 0x40000000).
+        * Inverting negative values leaves only leading zeros to count. */
+       bits = (u32)data;
+       if (data < 0)
+               bits = ~bits;
+       bits <<= 1;
+
+       do {
+               cnt -= 1;
+               bits >>= 1;
+       } while (bits);
+
+       return cnt;
+}
+
+/*
+ * Write 'data' to register 'reg' of the regs.cp2d[] bank, applying the
+ * per-register side effects below.
+ * NOTE(review): cp2d presumably holds the GTE data registers - confirm.
+ */
+static void lightrec_mtc2(struct lightrec_state *state, u8 reg, u32 data)
+{
+       switch (reg) {
+       case 15:
+               /* Behaves as a push: 13 -> 12, 14 -> 13, new value -> 14.
+                * Register 15 itself is not stored. */
+               state->regs.cp2d[12] = state->regs.cp2d[13];
+               state->regs.cp2d[13] = state->regs.cp2d[14];
+               state->regs.cp2d[14] = data;
+               break;
+       case 28:
+               /* Split the three 5-bit fields at bits 0, 5 and 10 of 'data'
+                * into registers 9, 10 and 11, each field landing in bits
+                * 7..11. Register 28 itself is not stored. */
+               state->regs.cp2d[9] = (data << 7) & 0xf80;
+               state->regs.cp2d[10] = (data << 2) & 0xf80;
+               state->regs.cp2d[11] = (data >> 3) & 0xf80;
+               break;
+       case 31:
+               /* Writes to register 31 are ignored */
+               return;
+       case 30:
+               /* Register 31 receives the leading bit count of the value
+                * written to register 30, which is then stored as usual. */
+               state->regs.cp2d[31] = count_leading_bits((s32) data);
+       default: /* fall-through */
+               state->regs.cp2d[reg] = data;
+               break;
+       }
+}
 
-       (*func)(state, op.opcode, op.r.rd, data);
+/*
+ * Write 'data' to register 'reg' of the regs.cp2c[] bank, after applying
+ * the per-register conversion below.
+ * NOTE(review): cp2c presumably holds the GTE control registers - confirm.
+ */
+static void lightrec_ctc2(struct lightrec_state *state, u8 reg, u32 data)
+{
+       switch (reg) {
+       case 4:
+       case 12:
+       case 20:
+       case 26:
+       case 27:
+       case 29:
+       case 30:
+               /* Only the low 16 bits are kept, sign-extended to 32 bits */
+               data = (s32)(s16) data;
+               break;
+       case 31:
+               /* Keep bits 12..30 and set bit 31 if any bit of the mask
+                * 0x7f87e000 is set. The flag is cast to u32 before the
+                * shift: shifting the int value 1 left by 31 is undefined. */
+               data = (data & 0x7ffff000) | (u32)!!(data & 0x7f87e000) << 31;
+       default: /* fall-through */
+               break;
+       }
+
+       state->regs.cp2c[reg] = data;
+}
+
+/*
+ * Dispatch a "move to coprocessor" opcode: writes 'data' to the register
+ * designated by the opcode's 'rd' field, through lightrec_mtc0() when the
+ * opcode targets coprocessor 0, lightrec_ctc2() when its 'rs' field is
+ * OP_CP2_BASIC_CTC2, and lightrec_mtc2() in every other case.
+ */
+void lightrec_mtc(struct lightrec_state *state, union code op, u32 data)
+{
+       const u8 reg = op.r.rd;
+
+       if (op.i.op == OP_CP0) {
+               lightrec_mtc0(state, reg, data);
+               return;
+       }
+
+       if (op.r.rs == OP_CP2_BASIC_CTC2)
+               lightrec_ctc2(state, reg, data);
+       else
+               lightrec_mtc2(state, reg, data);
+}
 
 static void lightrec_mtc_cb(struct lightrec_state *state, union code op)
 {
-       lightrec_mtc(state, op, state->native_reg_cache[op.r.rt]);
+       lightrec_mtc(state, op, state->regs.gpr[op.r.rt]);
 }
 
-static void lightrec_rfe_cb(struct lightrec_state *state, union code op)
+void lightrec_rfe(struct lightrec_state *state)
 {
        u32 status;
 
        /* Read CP0 Status register (r12) */
-       status = state->ops.cop0_ops.mfc(state, op.opcode, 12);
+       status = state->regs.cp0[12];
 
        /* Switch the bits */
        status = ((status & 0x3c) >> 2) | (status & ~0xf);
 
        /* Write it back */
-       state->ops.cop0_ops.ctc(state, op.opcode, 12, status);
+       lightrec_mtc0(state, 12, status);
 }
 
-static void lightrec_cp_cb(struct lightrec_state *state, union code op)
+void lightrec_cp(struct lightrec_state *state, union code op)
 {
-       void (*func)(struct lightrec_state *, u32);
-
-       if ((op.opcode >> 25) & 1)
-               func = state->ops.cop2_ops.op;
-       else
-               func = state->ops.cop0_ops.op;
+       if (op.i.op == OP_CP0) {
+               pr_err("Invalid CP opcode to coprocessor #0\n");
+               return;
+       }
 
-       (*func)(state, op.opcode);
+       (*state->ops.cop2_op)(state, op.opcode);
 }
 
 static void lightrec_syscall_cb(struct lightrec_state *state, union code op)
@@ -429,7 +567,7 @@ struct block * lightrec_get_block(struct lightrec_state *state, u32 pc)
 {
        struct block *block = lightrec_find_block(state->block_cache, pc);
 
-       if (block && lightrec_block_is_outdated(block)) {
+       if (block && lightrec_block_is_outdated(state, block)) {
                pr_debug("Block at PC 0x%08x is outdated!\n", block->pc);
 
                /* Make sure the recompiler isn't processing the block we'll
@@ -439,7 +577,7 @@ struct block * lightrec_get_block(struct lightrec_state *state, u32 pc)
 
                lightrec_unregister_block(state->block_cache, block);
                remove_from_code_lut(state->block_cache, block);
-               lightrec_free_block(block);
+               lightrec_free_block(state, block);
                block = NULL;
        }
 
@@ -466,12 +604,17 @@ static void * get_next_block_func(struct lightrec_state *state, u32 pc)
        for (;;) {
                func = state->code_lut[lut_offset(pc)];
                if (func && func != state->get_next_block)
-                       return func;
+                       break;
 
                block = lightrec_get_block(state, pc);
 
                if (unlikely(!block))
-                       return NULL;
+                       break;
+
+               if (OPT_REPLACE_MEMSET && (block->flags & BLOCK_IS_MEMSET)) {
+                       func = state->memset_func;
+                       break;
+               }
 
                should_recompile = block->flags & BLOCK_SHOULD_RECOMPILE &&
                        !(block->flags & BLOCK_IS_DEAD);
@@ -484,66 +627,54 @@ static void * get_next_block_func(struct lightrec_state *state, u32 pc)
                        if (ENABLE_THREADED_COMPILER)
                                lightrec_recompiler_add(state->rec, block);
                        else
-                               lightrec_compile_block(block);
+                               lightrec_compile_block(state->cstate, block);
                }
 
                if (ENABLE_THREADED_COMPILER && likely(!should_recompile))
-                       func = lightrec_recompiler_run_first_pass(block, &pc);
+                       func = lightrec_recompiler_run_first_pass(state, block, &pc);
                else
                        func = block->function;
 
                if (likely(func))
-                       return func;
+                       break;
 
-               /* Block wasn't compiled yet - run the interpreter */
-               if (!ENABLE_THREADED_COMPILER &&
-                   ((ENABLE_FIRST_PASS && likely(!should_recompile)) ||
-                    unlikely(block->flags & BLOCK_NEVER_COMPILE)))
-                       pc = lightrec_emulate_block(block, pc);
+               if (unlikely(block->flags & BLOCK_NEVER_COMPILE)) {
+                       pc = lightrec_emulate_block(state, block, pc);
+
+               } else if (!ENABLE_THREADED_COMPILER) {
+                       /* Block wasn't compiled yet - run the interpreter */
+                       if (block->flags & BLOCK_FULLY_TAGGED)
+                               pr_debug("Block fully tagged, skipping first pass\n");
+                       else if (ENABLE_FIRST_PASS && likely(!should_recompile))
+                               pc = lightrec_emulate_block(state, block, pc);
 
-               if (likely(!(block->flags & BLOCK_NEVER_COMPILE))) {
                        /* Then compile it using the profiled data */
-                       if (ENABLE_THREADED_COMPILER)
-                               lightrec_recompiler_add(state->rec, block);
-                       else
-                               lightrec_compile_block(block);
+                       lightrec_compile_block(state->cstate, block);
+               } else {
+                       lightrec_recompiler_add(state->rec, block);
                }
 
                if (state->exit_flags != LIGHTREC_EXIT_NORMAL ||
-                   state->current_cycle >= state->target_cycle) {
-                       state->next_pc = pc;
-                       return NULL;
-               }
+                   state->current_cycle >= state->target_cycle)
+                       break;
        }
-}
 
-static s32 c_generic_function_wrapper(struct lightrec_state *state,
-                                     s32 cycles_delta,
-                                     void (*f)(struct lightrec_state *,
-                                               struct opcode *,
-                                               struct block *),
-                                     struct opcode *op, struct block *block)
-{
-       state->current_cycle = state->target_cycle - cycles_delta;
-
-       (*f)(state, op, block);
-
-       return state->target_cycle - state->current_cycle;
+       state->next_pc = pc;
+       return func;
 }
 
 static s32 c_function_wrapper(struct lightrec_state *state, s32 cycles_delta,
-                             void (*f)(struct lightrec_state *, union code),
-                             union code op)
+                             void (*f)(struct lightrec_state *, u32 d),
+                             u32 d)
 {
        state->current_cycle = state->target_cycle - cycles_delta;
 
-       (*f)(state, op);
+       (*f)(state, d);
 
        return state->target_cycle - state->current_cycle;
 }
 
-static struct block * generate_wrapper(struct lightrec_state *state,
-                                      void *f, bool generic)
+static struct block * generate_wrapper(struct lightrec_state *state)
 {
        struct block *block;
        jit_state_t *_jit;
@@ -594,25 +725,14 @@ static struct block * generate_wrapper(struct lightrec_state *state,
        jit_prepare();
        jit_pushargr(LIGHTREC_REG_STATE);
        jit_pushargr(LIGHTREC_REG_CYCLE);
-       jit_pushargi((uintptr_t)f);
        jit_pushargr(JIT_R0);
-       if (generic) {
-               jit_pushargr(JIT_R1);
-               jit_finishi(c_generic_function_wrapper);
-       } else {
-               jit_finishi(c_function_wrapper);
-       }
-
-#if __WORDSIZE == 64
+       jit_pushargr(JIT_R1);
+       jit_finishi(c_function_wrapper);
        jit_retval_i(LIGHTREC_REG_CYCLE);
-#else
-       jit_retval(LIGHTREC_REG_CYCLE);
-#endif
 
        jit_patch_at(jit_jmpi(), to_fn_epilog);
        jit_epilog();
 
-       block->state = state;
        block->_jit = _jit;
        block->function = jit_emit();
        block->opcode_list = NULL;
@@ -639,11 +759,35 @@ err_no_mem:
        return NULL;
 }
 
+static u32 lightrec_memset(struct lightrec_state *state)
+{
+       u32 kunseg_pc = kunseg(state->regs.gpr[4]);
+       void *host;
+       const struct lightrec_mem_map *map = lightrec_get_map(state, &host, kunseg_pc);
+       u32 length = state->regs.gpr[5] * 4;
+
+       if (!map) {
+               pr_err("Unable to find memory map for memset target address "
+                      "0x%x\n", kunseg_pc);
+               return 0;
+       }
+
+       pr_debug("Calling host memset, PC 0x%x (host address 0x%" PRIxPTR ") for %u bytes\n",
+                kunseg_pc, (uintptr_t)host, length);
+       memset(host, 0, length);
+
+       if (!state->invalidate_from_dma_only)
+               lightrec_invalidate_map(state, map, kunseg_pc, length);
+
+       /* Rough cycle estimate: 8 fixed + 5 per 32-bit word (bytes rounded up) */
+       return 8 + 5 * ((length + 3) / 4);
+}
+
 static struct block * generate_dispatcher(struct lightrec_state *state)
 {
        struct block *block;
        jit_state_t *_jit;
-       jit_node_t *to_end, *to_end2, *to_c, *loop, *addr, *addr2;
+       jit_node_t *to_end, *to_c, *loop, *addr, *addr2, *addr3;
        unsigned int i;
        u32 offset, ram_len;
        jit_word_t code_size;
@@ -663,11 +807,7 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
        jit_frame(256);
 
        jit_getarg(JIT_R0, jit_arg());
-#if __WORDSIZE == 64
        jit_getarg_i(LIGHTREC_REG_CYCLE, jit_arg());
-#else
-       jit_getarg(LIGHTREC_REG_CYCLE, jit_arg());
-#endif
 
        /* Force all callee-saved registers to be pushed on the stack */
        for (i = 0; i < NUM_REGS; i++)
@@ -682,10 +822,30 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
        /* Call the block's code */
        jit_jmpr(JIT_R0);
 
+       if (OPT_REPLACE_MEMSET) {
+               /* Blocks will jump here when they need to call
+                * lightrec_memset() */
+               addr3 = jit_indirect();
+
+               jit_prepare();
+               jit_pushargr(LIGHTREC_REG_STATE);
+               jit_finishi(lightrec_memset);
+
+               jit_ldxi_ui(JIT_V0, LIGHTREC_REG_STATE,
+                           offsetof(struct lightrec_state, regs.gpr[31]));
+
+               jit_retval(JIT_R0);
+               jit_subr(LIGHTREC_REG_CYCLE, LIGHTREC_REG_CYCLE, JIT_R0);
+       }
+
        /* The block will jump here, with the number of cycles remaining in
         * LIGHTREC_REG_CYCLE */
        addr2 = jit_indirect();
 
+       /* Store back the next_pc to the lightrec_state structure */
+       offset = offsetof(struct lightrec_state, next_pc);
+       jit_stxi_i(offset, LIGHTREC_REG_STATE, JIT_V0);
+
        /* Jump to end if state->target_cycle < state->current_cycle */
        to_end = jit_blei(LIGHTREC_REG_CYCLE, 0);
 
@@ -695,9 +855,8 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
        to_c = jit_bgei(JIT_R0, ram_len);
 
        /* Fast path: code is running from RAM, use the code LUT */
-#if __WORDSIZE == 64
-       jit_lshi(JIT_R0, JIT_R0, 1);
-#endif
+       if (__WORDSIZE == 64)
+               jit_lshi(JIT_R0, JIT_R0, 1);
        jit_addr(JIT_R0, JIT_R0, LIGHTREC_REG_STATE);
        jit_ldxi(JIT_R0, JIT_R0, offsetof(struct lightrec_state, code_lut));
 
@@ -707,7 +866,7 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
        /* Slow path: call C function get_next_block_func() */
        jit_patch(to_c);
 
-       if (ENABLE_FIRST_PASS) {
+       if (ENABLE_FIRST_PASS || OPT_DETECT_IMPOSSIBLE_BRANCHES) {
                /* We may call the interpreter - update state->current_cycle */
                jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE,
                           offsetof(struct lightrec_state, target_cycle));
@@ -728,7 +887,7 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
        jit_finishi(&get_next_block_func);
        jit_retval(JIT_R0);
 
-       if (ENABLE_FIRST_PASS) {
+       if (ENABLE_FIRST_PASS || OPT_DETECT_IMPOSSIBLE_BRANCHES) {
                /* The interpreter may have updated state->current_cycle and
                 * state->target_cycle - recalc the delta */
                jit_ldxi_i(JIT_R1, LIGHTREC_REG_STATE,
@@ -741,22 +900,13 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
        /* If we get non-NULL, loop */
        jit_patch_at(jit_bnei(JIT_R0, 0), loop);
 
-       to_end2 = jit_jmpi();
-
        /* When exiting, the recompiled code will jump to that address */
        jit_note(__FILE__, __LINE__);
        jit_patch(to_end);
 
-       /* Store back the next_pc to the lightrec_state structure */
-       offset = offsetof(struct lightrec_state, next_pc);
-       jit_stxi_i(offset, LIGHTREC_REG_STATE, JIT_V0);
-
-       jit_patch(to_end2);
-
        jit_retr(LIGHTREC_REG_CYCLE);
        jit_epilog();
 
-       block->state = state;
        block->_jit = _jit;
        block->function = jit_emit();
        block->opcode_list = NULL;
@@ -769,6 +919,8 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
        block->code_size = code_size;
 
        state->eob_wrapper_func = jit_address(addr2);
+       if (OPT_REPLACE_MEMSET)
+               state->memset_func = jit_address(addr3);
        state->get_next_block = jit_address(addr);
 
        if (ENABLE_DISASSEMBLER) {
@@ -789,18 +941,64 @@ err_no_mem:
 
 union code lightrec_read_opcode(struct lightrec_state *state, u32 pc)
 {
-       u32 addr, kunseg_pc = kunseg(pc);
-       const u32 *code;
-       const struct lightrec_mem_map *map = lightrec_get_map(state, kunseg_pc);
+       void *host;
 
-       addr = kunseg_pc - map->pc;
+       lightrec_get_map(state, &host, kunseg(pc));
 
-       while (map->mirror_of)
-               map = map->mirror_of;
+       const u32 *code = (u32 *)host;
+       return (union code) *code;
+}
 
-       code = map->address + addr;
+unsigned int lightrec_cycles_of_opcode(union code code)
+{
+       return 2;
+}
 
-       return (union code) *code;
+void lightrec_free_opcode_list(struct lightrec_state *state, struct block *block)
+{
+       lightrec_free(state, MEM_FOR_IR,
+                     sizeof(*block->opcode_list) * block->nb_ops,
+                     block->opcode_list);
+}
+
+static unsigned int lightrec_get_mips_block_len(const u32 *src)
+{
+       unsigned int i;
+       union code c;
+
+       for (i = 1; ; i++) {
+               c.opcode = LE32TOH(*src++);
+
+               if (is_syscall(c))
+                       return i;
+
+               if (is_unconditional_jump(c))
+                       return i + 1;
+       }
+}
+
+static struct opcode * lightrec_disassemble(struct lightrec_state *state,
+                                           const u32 *src, unsigned int *len)
+{
+       struct opcode *list;
+       unsigned int i, length;
+
+       length = lightrec_get_mips_block_len(src);
+
+       list = lightrec_malloc(state, MEM_FOR_IR, sizeof(*list) * length);
+       if (!list) {
+               pr_err("Unable to allocate memory\n");
+               return NULL;
+       }
+
+       for (i = 0; i < length; i++) {
+               list[i].opcode = LE32TOH(src[i]);
+               list[i].flags = 0;
+       }
+
+       *len = length * sizeof(u32);
+
+       return list;
 }
 
 static struct block * lightrec_precompile_block(struct lightrec_state *state,
@@ -808,21 +1006,15 @@ static struct block * lightrec_precompile_block(struct lightrec_state *state,
 {
        struct opcode *list;
        struct block *block;
-       const u32 *code;
-       u32 addr, kunseg_pc = kunseg(pc);
-       const struct lightrec_mem_map *map = lightrec_get_map(state, kunseg_pc);
+       void *host;
+       const struct lightrec_mem_map *map = lightrec_get_map(state, &host, kunseg(pc));
+       const u32 *code = (u32 *) host;
        unsigned int length;
+       bool fully_tagged;
 
        if (!map)
                return NULL;
 
-       addr = kunseg_pc - map->pc;
-
-       while (map->mirror_of)
-               map = map->mirror_of;
-
-       code = map->address + addr;
-
        block = lightrec_malloc(state, MEM_FOR_IR, sizeof(*block));
        if (!block) {
                pr_err("Unable to recompile block: Out of memory\n");
@@ -836,11 +1028,10 @@ static struct block * lightrec_precompile_block(struct lightrec_state *state,
        }
 
        block->pc = pc;
-       block->state = state;
        block->_jit = NULL;
        block->function = NULL;
        block->opcode_list = list;
-       block->map = map;
+       block->code = code;
        block->next = NULL;
        block->flags = 0;
        block->code_size = 0;
@@ -849,24 +1040,31 @@ static struct block * lightrec_precompile_block(struct lightrec_state *state,
 #endif
        block->nb_ops = length / sizeof(u32);
 
-       lightrec_optimize(block);
+       lightrec_optimize(state, block);
 
        length = block->nb_ops * sizeof(u32);
 
        lightrec_register(MEM_FOR_MIPS_CODE, length);
 
        if (ENABLE_DISASSEMBLER) {
-               pr_debug("Disassembled block at PC: 0x%x\n", block->pc);
-               lightrec_print_disassembly(block, code, length);
+               pr_debug("Disassembled block at PC: 0x%08x\n", block->pc);
+               lightrec_print_disassembly(block, code);
        }
 
-       pr_debug("Block size: %lu opcodes\n", block->nb_ops);
+       pr_debug("Block size: %hu opcodes\n", block->nb_ops);
 
        /* If the first opcode is an 'impossible' branch, never compile the
         * block */
-       if (list->flags & LIGHTREC_EMULATE_BRANCH)
+       if (should_emulate(block->opcode_list))
                block->flags |= BLOCK_NEVER_COMPILE;
 
+       fully_tagged = lightrec_block_is_fully_tagged(block);
+       if (fully_tagged)
+               block->flags |= BLOCK_FULLY_TAGGED;
+
+       if (OPT_REPLACE_MEMSET && (block->flags & BLOCK_IS_MEMSET))
+               state->code_lut[lut_offset(pc)] = state->memset_func;
+
        block->hash = lightrec_calculate_block_hash(block);
 
        pr_debug("Recompile count: %u\n", state->nb_precompile++);
@@ -874,11 +1072,14 @@ static struct block * lightrec_precompile_block(struct lightrec_state *state,
        return block;
 }
 
-static bool lightrec_block_is_fully_tagged(struct block *block)
+static bool lightrec_block_is_fully_tagged(const struct block *block)
 {
-       struct opcode *op;
+       const struct opcode *op;
+       unsigned int i;
+
+       for (i = 0; i < block->nb_ops; i++) {
+               op = &block->opcode_list[i];
 
-       for (op = block->opcode_list; op; op = op->next) {
                /* Verify that all load/stores of the opcode list
                 * Check all loads/stores of the opcode list and mark the
                 * block as fully compiled if they all have been tagged. */
@@ -908,22 +1109,24 @@ static bool lightrec_block_is_fully_tagged(struct block *block)
        return true;
 }
 
-static void lightrec_reap_block(void *data)
+static void lightrec_reap_block(struct lightrec_state *state, void *data)
 {
        struct block *block = data;
 
        pr_debug("Reap dead block at PC 0x%08x\n", block->pc);
-       lightrec_free_block(block);
+       lightrec_unregister_block(state->block_cache, block);
+       lightrec_free_block(state, block);
 }
 
-static void lightrec_reap_jit(void *data)
+static void lightrec_reap_jit(struct lightrec_state *state, void *data)
 {
        _jit_destroy_state(data);
 }
 
-int lightrec_compile_block(struct block *block)
+int lightrec_compile_block(struct lightrec_cstate *cstate,
+                          struct block *block)
 {
-       struct lightrec_state *state = block->state;
+       struct lightrec_state *state = cstate->state;
        struct lightrec_branch_target *target;
        bool op_list_freed = false, fully_tagged = false;
        struct block *block2;
@@ -933,7 +1136,7 @@ int lightrec_compile_block(struct block *block)
        bool skip_next = false;
        jit_word_t code_size;
        unsigned int i, j;
-       u32 next_pc, offset;
+       u32 offset;
 
        fully_tagged = lightrec_block_is_fully_tagged(block);
        if (fully_tagged)
@@ -946,34 +1149,35 @@ int lightrec_compile_block(struct block *block)
        oldjit = block->_jit;
        block->_jit = _jit;
 
-       lightrec_regcache_reset(state->reg_cache);
-       state->cycles = 0;
-       state->nb_branches = 0;
-       state->nb_local_branches = 0;
-       state->nb_targets = 0;
+       lightrec_regcache_reset(cstate->reg_cache);
+       cstate->cycles = 0;
+       cstate->nb_branches = 0;
+       cstate->nb_local_branches = 0;
+       cstate->nb_targets = 0;
 
        jit_prolog();
        jit_tramp(256);
 
        start_of_block = jit_label();
 
-       for (elm = block->opcode_list; elm; elm = elm->next) {
-               next_pc = block->pc + elm->offset * sizeof(u32);
+       for (i = 0; i < block->nb_ops; i++) {
+               elm = &block->opcode_list[i];
 
                if (skip_next) {
                        skip_next = false;
                        continue;
                }
 
-               state->cycles += lightrec_cycles_of_opcode(elm->c);
+               cstate->cycles += lightrec_cycles_of_opcode(elm->c);
 
-               if (elm->flags & LIGHTREC_EMULATE_BRANCH) {
+               if (should_emulate(elm)) {
                        pr_debug("Branch at offset 0x%x will be emulated\n",
-                                elm->offset << 2);
-                       lightrec_emit_eob(block, elm, next_pc);
+                                i << 2);
+
+                       lightrec_emit_eob(cstate, block, i, false);
                        skip_next = !(elm->flags & LIGHTREC_NO_DS);
-               } else if (elm->opcode) {
-                       lightrec_rec_opcode(block, elm, next_pc);
+               } else {
+                       lightrec_rec_opcode(cstate, block, i);
                        skip_next = has_delay_slot(elm->c) &&
                                !(elm->flags & LIGHTREC_NO_DS);
 #if _WIN32
@@ -981,16 +1185,16 @@ int lightrec_compile_block(struct block *block)
                         * mapped registers as temporaries. Until the actual bug
                         * is found and fixed, unconditionally mark our
                         * registers as live here. */
-                       lightrec_regcache_mark_live(state->reg_cache, _jit);
+                       lightrec_regcache_mark_live(cstate->reg_cache, _jit);
 #endif
                }
        }
 
-       for (i = 0; i < state->nb_branches; i++)
-               jit_patch(state->branches[i]);
+       for (i = 0; i < cstate->nb_branches; i++)
+               jit_patch(cstate->branches[i]);
 
-       for (i = 0; i < state->nb_local_branches; i++) {
-               struct lightrec_branch *branch = &state->local_branches[i];
+       for (i = 0; i < cstate->nb_local_branches; i++) {
+               struct lightrec_branch *branch = &cstate->local_branches[i];
 
                pr_debug("Patch local branch to offset 0x%x\n",
                         branch->target << 2);
@@ -1000,15 +1204,15 @@ int lightrec_compile_block(struct block *block)
                        continue;
                }
 
-               for (j = 0; j < state->nb_targets; j++) {
-                       if (state->targets[j].offset == branch->target) {
+               for (j = 0; j < cstate->nb_targets; j++) {
+                       if (cstate->targets[j].offset == branch->target) {
                                jit_patch_at(branch->branch,
-                                            state->targets[j].label);
+                                            cstate->targets[j].label);
                                break;
                        }
                }
 
-               if (j == state->nb_targets)
+               if (j == cstate->nb_targets)
                        pr_err("Unable to find branch target\n");
        }
 
@@ -1026,19 +1230,16 @@ int lightrec_compile_block(struct block *block)
        /* Add compiled function to the LUT */
        state->code_lut[lut_offset(block->pc)] = block->function;
 
-       /* Fill code LUT with the block's entry points */
-       for (i = 0; i < state->nb_targets; i++) {
-               target = &state->targets[i];
-
-               if (target->offset) {
-                       offset = lut_offset(block->pc) + target->offset;
-                       state->code_lut[offset] = jit_address(target->label);
-               }
+       if (ENABLE_THREADED_COMPILER) {
+               /* Since we might try to reap the same block multiple times,
+                * we need the reaper to wait until everything has been
+                * submitted, so that the duplicate entries can be dropped. */
+               lightrec_reaper_pause(state->reaper);
        }
 
        /* Detect old blocks that have been covered by the new one */
-       for (i = 0; i < state->nb_targets; i++) {
-               target = &state->targets[i];
+       for (i = 0; i < cstate->nb_targets; i++) {
+               target = &cstate->targets[i];
 
                if (!target->offset)
                        continue;
@@ -1049,31 +1250,47 @@ int lightrec_compile_block(struct block *block)
                        /* No need to check if block2 is compilable - it must
                         * be, otherwise block wouldn't be compilable either */
 
+                       /* Set the "block dead" flag to prevent the dynarec from
+                        * recompiling this block */
                        block2->flags |= BLOCK_IS_DEAD;
 
+                       /* If block2 was pending for compilation, cancel it.
+                        * If it's being compiled right now, wait until it
+                        * finishes. */
+                       if (ENABLE_THREADED_COMPILER)
+                               lightrec_recompiler_remove(state->rec, block2);
+
+                       /* We know from now on that block2 isn't going to be
+                        * compiled. We can override the LUT entry with our
+                        * new block's entry point. */
+                       offset = lut_offset(block->pc) + target->offset;
+                       state->code_lut[offset] = jit_address(target->label);
+
                        pr_debug("Reap block 0x%08x as it's covered by block "
                                 "0x%08x\n", block2->pc, block->pc);
 
-                       lightrec_unregister_block(state->block_cache, block2);
-
+                       /* Finally, reap the block. */
                        if (ENABLE_THREADED_COMPILER) {
-                               lightrec_recompiler_remove(state->rec, block2);
                                lightrec_reaper_add(state->reaper,
                                                    lightrec_reap_block,
                                                    block2);
                        } else {
-                               lightrec_free_block(block2);
+                               lightrec_unregister_block(state->block_cache, block2);
+                               lightrec_free_block(state, block2);
                        }
                }
        }
 
+       if (ENABLE_THREADED_COMPILER)
+               lightrec_reaper_continue(state->reaper);
+
        jit_get_code(&code_size);
        lightrec_register(MEM_FOR_CODE, code_size);
 
        block->code_size = code_size;
 
        if (ENABLE_DISASSEMBLER) {
-               pr_debug("Compiling block at PC: 0x%x\n", block->pc);
+               pr_debug("Compiling block at PC: 0x%08x\n", block->pc);
                jit_disassemble();
        }
 
@@ -1086,7 +1303,7 @@ int lightrec_compile_block(struct block *block)
        if (fully_tagged && !op_list_freed) {
                pr_debug("Block PC 0x%08x is fully tagged"
                         " - free opcode list\n", block->pc);
-               lightrec_free_opcode_list(state, block->opcode_list);
+               lightrec_free_opcode_list(state, block);
                block->opcode_list = NULL;
        }
 
@@ -1104,6 +1321,20 @@ int lightrec_compile_block(struct block *block)
        return 0;
 }
 
+static void lightrec_print_info(struct lightrec_state *state)
+{
+       if ((state->current_cycle & ~0xfffffff) != state->old_cycle_counter) {
+               pr_info("Lightrec RAM usage: IR %u KiB, CODE %u KiB, "
+                       "MIPS %u KiB, TOTAL %u KiB, avg. IPI %f\n",
+                       lightrec_get_mem_usage(MEM_FOR_IR) / 1024,
+                       lightrec_get_mem_usage(MEM_FOR_CODE) / 1024,
+                       lightrec_get_mem_usage(MEM_FOR_MIPS_CODE) / 1024,
+                       lightrec_get_total_mem_usage() / 1024,
+                      lightrec_get_average_ipi());
+               state->old_cycle_counter = state->current_cycle & ~0xfffffff;
+       }
+}
+
 u32 lightrec_execute(struct lightrec_state *state, u32 pc, u32 target_cycle)
 {
        s32 (*func)(void *, s32) = (void *)state->dispatcher->function;
@@ -1117,6 +1348,7 @@ u32 lightrec_execute(struct lightrec_state *state, u32 pc, u32 target_cycle)
                target_cycle = UINT_MAX;
 
        state->target_cycle = target_cycle;
+       state->next_pc = pc;
 
        block_trace = get_next_block_func(state, pc);
        if (block_trace) {
@@ -1130,6 +1362,9 @@ u32 lightrec_execute(struct lightrec_state *state, u32 pc, u32 target_cycle)
        if (ENABLE_THREADED_COMPILER)
                lightrec_reaper_reap(state->reaper);
 
+       if (LOG_LEVEL >= INFO_L)
+               lightrec_print_info(state);
+
        return state->next_pc;
 }
 
@@ -1146,18 +1381,48 @@ u32 lightrec_run_interpreter(struct lightrec_state *state, u32 pc)
 
        state->exit_flags = LIGHTREC_EXIT_NORMAL;
 
-       return lightrec_emulate_block(block, pc);
+       pc = lightrec_emulate_block(state, block, pc);
+
+       if (LOG_LEVEL >= INFO_L)
+               lightrec_print_info(state);
+
+       return pc;
 }
 
-void lightrec_free_block(struct block *block)
+void lightrec_free_block(struct lightrec_state *state, struct block *block)
 {
        lightrec_unregister(MEM_FOR_MIPS_CODE, block->nb_ops * sizeof(u32));
        if (block->opcode_list)
-               lightrec_free_opcode_list(block->state, block->opcode_list);
+               lightrec_free_opcode_list(state, block);
        if (block->_jit)
                _jit_destroy_state(block->_jit);
        lightrec_unregister(MEM_FOR_CODE, block->code_size);
-       lightrec_free(block->state, MEM_FOR_IR, sizeof(*block), block);
+       lightrec_free(state, MEM_FOR_IR, sizeof(*block), block);
+}
+
+struct lightrec_cstate * lightrec_create_cstate(struct lightrec_state *state)
+{
+       struct lightrec_cstate *cstate;
+
+       cstate = lightrec_malloc(state, MEM_FOR_LIGHTREC, sizeof(*cstate));
+       if (!cstate)
+               return NULL;
+
+       cstate->reg_cache = lightrec_regcache_init(state);
+       if (!cstate->reg_cache) {
+               lightrec_free(state, MEM_FOR_LIGHTREC, sizeof(*cstate), cstate);
+               return NULL;
+       }
+
+       cstate->state = state;
+
+       return cstate;
+}
+
+void lightrec_free_cstate(struct lightrec_cstate *cstate)
+{
+       lightrec_free_regcache(cstate->reg_cache);
+       lightrec_free(cstate->state, MEM_FOR_LIGHTREC, sizeof(*cstate), cstate);
 }
 
 struct lightrec_state * lightrec_init(char *argv0,
@@ -1168,11 +1433,7 @@ struct lightrec_state * lightrec_init(char *argv0,
        struct lightrec_state *state;
 
        /* Sanity-check ops */
-       if (!ops ||
-           !ops->cop0_ops.mfc || !ops->cop0_ops.cfc || !ops->cop0_ops.mtc ||
-           !ops->cop0_ops.ctc || !ops->cop0_ops.op ||
-           !ops->cop2_ops.mfc || !ops->cop2_ops.cfc || !ops->cop2_ops.mtc ||
-           !ops->cop2_ops.ctc || !ops->cop2_ops.op) {
+       if (!ops || !ops->cop2_op || !ops->enable_ram) {
                pr_err("Missing callbacks in lightrec_ops structure\n");
                return NULL;
        }
@@ -1197,18 +1458,18 @@ struct lightrec_state * lightrec_init(char *argv0,
        if (!state->block_cache)
                goto err_free_tinymm;
 
-       state->reg_cache = lightrec_regcache_init(state);
-       if (!state->reg_cache)
-               goto err_free_block_cache;
-
        if (ENABLE_THREADED_COMPILER) {
                state->rec = lightrec_recompiler_init(state);
                if (!state->rec)
-                       goto err_free_reg_cache;
+                       goto err_free_block_cache;
 
                state->reaper = lightrec_reaper_init(state);
                if (!state->reaper)
                        goto err_free_recompiler;
+       } else {
+               state->cstate = lightrec_create_cstate(state);
+               if (!state->cstate)
+                       goto err_free_block_cache;
        }
 
        state->nb_maps = nb;
@@ -1220,50 +1481,19 @@ struct lightrec_state * lightrec_init(char *argv0,
        if (!state->dispatcher)
                goto err_free_reaper;
 
-       state->rw_generic_wrapper = generate_wrapper(state,
-                                                    lightrec_rw_generic_cb,
-                                                    true);
-       if (!state->rw_generic_wrapper)
+       state->c_wrapper_block = generate_wrapper(state);
+       if (!state->c_wrapper_block)
                goto err_free_dispatcher;
 
-       state->rw_wrapper = generate_wrapper(state, lightrec_rw_cb, false);
-       if (!state->rw_wrapper)
-               goto err_free_generic_rw_wrapper;
-
-       state->mfc_wrapper = generate_wrapper(state, lightrec_mfc_cb, false);
-       if (!state->mfc_wrapper)
-               goto err_free_rw_wrapper;
-
-       state->mtc_wrapper = generate_wrapper(state, lightrec_mtc_cb, false);
-       if (!state->mtc_wrapper)
-               goto err_free_mfc_wrapper;
-
-       state->rfe_wrapper = generate_wrapper(state, lightrec_rfe_cb, false);
-       if (!state->rfe_wrapper)
-               goto err_free_mtc_wrapper;
-
-       state->cp_wrapper = generate_wrapper(state, lightrec_cp_cb, false);
-       if (!state->cp_wrapper)
-               goto err_free_rfe_wrapper;
-
-       state->syscall_wrapper = generate_wrapper(state, lightrec_syscall_cb,
-                                                 false);
-       if (!state->syscall_wrapper)
-               goto err_free_cp_wrapper;
-
-       state->break_wrapper = generate_wrapper(state, lightrec_break_cb,
-                                               false);
-       if (!state->break_wrapper)
-               goto err_free_syscall_wrapper;
-
-       state->rw_generic_func = state->rw_generic_wrapper->function;
-       state->rw_func = state->rw_wrapper->function;
-       state->mfc_func = state->mfc_wrapper->function;
-       state->mtc_func = state->mtc_wrapper->function;
-       state->rfe_func = state->rfe_wrapper->function;
-       state->cp_func = state->cp_wrapper->function;
-       state->syscall_func = state->syscall_wrapper->function;
-       state->break_func = state->break_wrapper->function;
+       state->c_wrapper = state->c_wrapper_block->function;
+
+       state->c_wrappers[C_WRAPPER_RW] = lightrec_rw_cb;
+       state->c_wrappers[C_WRAPPER_RW_GENERIC] = lightrec_rw_generic_cb;
+       state->c_wrappers[C_WRAPPER_MFC] = lightrec_mfc_cb;
+       state->c_wrappers[C_WRAPPER_MTC] = lightrec_mtc_cb;
+       state->c_wrappers[C_WRAPPER_CP] = lightrec_cp;
+       state->c_wrappers[C_WRAPPER_SYSCALL] = lightrec_syscall_cb;
+       state->c_wrappers[C_WRAPPER_BREAK] = lightrec_break_cb;
 
        map = &state->maps[PSX_MAP_BIOS];
        state->offset_bios = (uintptr_t)map->address - map->pc;
@@ -1279,32 +1509,27 @@ struct lightrec_state * lightrec_init(char *argv0,
            state->maps[PSX_MAP_MIRROR3].address == map->address + 0x600000)
                state->mirrors_mapped = true;
 
+       if (state->offset_bios == 0 &&
+           state->offset_scratch == 0 &&
+           state->offset_ram == 0 &&
+           state->mirrors_mapped) {
+               pr_info("Memory map is perfect. Emitted code will be best.\n");
+       } else {
+               pr_info("Memory map is sub-par. Emitted code will be slow.\n");
+       }
+
        return state;
 
-err_free_syscall_wrapper:
-       lightrec_free_block(state->syscall_wrapper);
-err_free_cp_wrapper:
-       lightrec_free_block(state->cp_wrapper);
-err_free_rfe_wrapper:
-       lightrec_free_block(state->rfe_wrapper);
-err_free_mtc_wrapper:
-       lightrec_free_block(state->mtc_wrapper);
-err_free_mfc_wrapper:
-       lightrec_free_block(state->mfc_wrapper);
-err_free_rw_wrapper:
-       lightrec_free_block(state->rw_wrapper);
-err_free_generic_rw_wrapper:
-       lightrec_free_block(state->rw_generic_wrapper);
 err_free_dispatcher:
-       lightrec_free_block(state->dispatcher);
+       lightrec_free_block(state, state->dispatcher);
 err_free_reaper:
        if (ENABLE_THREADED_COMPILER)
                lightrec_reaper_destroy(state->reaper);
 err_free_recompiler:
        if (ENABLE_THREADED_COMPILER)
                lightrec_free_recompiler(state->rec);
-err_free_reg_cache:
-       lightrec_free_regcache(state->reg_cache);
+       else
+               lightrec_free_cstate(state->cstate);
 err_free_block_cache:
        lightrec_free_block_cache(state->block_cache);
 err_free_tinymm:
@@ -1322,22 +1547,20 @@ err_finish_jit:
 
 void lightrec_destroy(struct lightrec_state *state)
 {
+       /* Force a print info on destroy*/
+       state->current_cycle = ~state->current_cycle;
+       lightrec_print_info(state);
+
        if (ENABLE_THREADED_COMPILER) {
                lightrec_free_recompiler(state->rec);
                lightrec_reaper_destroy(state->reaper);
+       } else {
+               lightrec_free_cstate(state->cstate);
        }
 
-       lightrec_free_regcache(state->reg_cache);
        lightrec_free_block_cache(state->block_cache);
-       lightrec_free_block(state->dispatcher);
-       lightrec_free_block(state->rw_generic_wrapper);
-       lightrec_free_block(state->rw_wrapper);
-       lightrec_free_block(state->mfc_wrapper);
-       lightrec_free_block(state->mtc_wrapper);
-       lightrec_free_block(state->rfe_wrapper);
-       lightrec_free_block(state->cp_wrapper);
-       lightrec_free_block(state->syscall_wrapper);
-       lightrec_free_block(state->break_wrapper);
+       lightrec_free_block(state, state->dispatcher);
+       lightrec_free_block(state, state->c_wrapper_block);
        finish_jit();
 
 #if ENABLE_TINYMM
@@ -1351,22 +1574,16 @@ void lightrec_destroy(struct lightrec_state *state)
 void lightrec_invalidate(struct lightrec_state *state, u32 addr, u32 len)
 {
        u32 kaddr = kunseg(addr & ~0x3);
-       const struct lightrec_mem_map *map = lightrec_get_map(state, kaddr);
+       const struct lightrec_mem_map *map = lightrec_get_map(state, NULL, kaddr);
 
        if (map) {
-               while (map->mirror_of)
-                       map = map->mirror_of;
-
                if (map != &state->maps[PSX_MAP_KERNEL_USER_RAM])
                        return;
 
                /* Handle mirrors */
                kaddr &= (state->maps[PSX_MAP_KERNEL_USER_RAM].length - 1);
 
-               for (; len > 4; len -= 4, kaddr += 4)
-                       lightrec_invalidate_map(state, map, kaddr);
-
-               lightrec_invalidate_map(state, map, kaddr);
+               lightrec_invalidate_map(state, map, kaddr, len);
        }
 }
 
@@ -1396,16 +1613,6 @@ u32 lightrec_exit_flags(struct lightrec_state *state)
        return state->exit_flags;
 }
 
-void lightrec_dump_registers(struct lightrec_state *state, u32 regs[34])
-{
-       memcpy(regs, state->native_reg_cache, sizeof(state->native_reg_cache));
-}
-
-void lightrec_restore_registers(struct lightrec_state *state, u32 regs[34])
-{
-       memcpy(state->native_reg_cache, regs, sizeof(state->native_reg_cache));
-}
-
 u32 lightrec_current_cycle_count(const struct lightrec_state *state)
 {
        return state->current_cycle;
@@ -1428,3 +1635,8 @@ void lightrec_set_target_cycle_count(struct lightrec_state *state, u32 cycles)
                state->target_cycle = cycles;
        }
 }
+
+struct lightrec_registers * lightrec_get_registers(struct lightrec_state *state)
+{
+       return &state->regs;
+}
index d0793c0..e418c70 100644 (file)
@@ -1,15 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
 /*
- * Copyright (C) 2016-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2016-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #ifndef __LIGHTREC_H__
@@ -52,9 +43,9 @@ struct lightrec_mem_map;
 
 /* Exit flags */
 #define LIGHTREC_EXIT_NORMAL   (0)
-#define LIGHTREC_EXIT_SYSCALL  (1 << 0)
+#define LIGHTREC_EXIT_CHECK_INTERRUPT  (1 << 0)
 #define LIGHTREC_EXIT_BREAK    (1 << 1)
-#define LIGHTREC_EXIT_CHECK_INTERRUPT  (1 << 2)
+#define LIGHTREC_EXIT_SYSCALL  (1 << 2)
 #define LIGHTREC_EXIT_SEGFAULT (1 << 3)
 
 enum psx_map {
@@ -69,14 +60,6 @@ enum psx_map {
        PSX_MAP_MIRROR3,
 };
 
-enum mem_type {
-       MEM_FOR_CODE,
-       MEM_FOR_MIPS_CODE,
-       MEM_FOR_IR,
-       MEM_FOR_LIGHTREC,
-       MEM_TYPE_END,
-};
-
 struct lightrec_mem_map_ops {
        void (*sb)(struct lightrec_state *, u32 opcode,
                   void *host, u32 addr, u8 data);
@@ -97,17 +80,16 @@ struct lightrec_mem_map {
        const struct lightrec_mem_map *mirror_of;
 };
 
-struct lightrec_cop_ops {
-       u32 (*mfc)(struct lightrec_state *state, u32 op, u8 reg);
-       u32 (*cfc)(struct lightrec_state *state, u32 op, u8 reg);
-       void (*mtc)(struct lightrec_state *state, u32 op, u8 reg, u32 value);
-       void (*ctc)(struct lightrec_state *state, u32 op, u8 reg, u32 value);
-       void (*op)(struct lightrec_state *state, u32 op);
+struct lightrec_ops {
+       void (*cop2_op)(struct lightrec_state *state, u32 op);
+       void (*enable_ram)(struct lightrec_state *state, _Bool enable);
 };
 
-struct lightrec_ops {
-       struct lightrec_cop_ops cop0_ops;
-       struct lightrec_cop_ops cop2_ops;
+struct lightrec_registers {
+       u32 gpr[34];
+       u32 cp0[32];
+       u32 cp2d[32];
+       u32 cp2c[32];
 };
 
 __api struct lightrec_state *lightrec_init(char *argv0,
@@ -130,19 +112,13 @@ __api void lightrec_set_invalidate_mode(struct lightrec_state *state,
 __api void lightrec_set_exit_flags(struct lightrec_state *state, u32 flags);
 __api u32 lightrec_exit_flags(struct lightrec_state *state);
 
-__api void lightrec_dump_registers(struct lightrec_state *state, u32 regs[34]);
-__api void lightrec_restore_registers(struct lightrec_state *state,
-                                     u32 regs[34]);
+__api struct lightrec_registers * lightrec_get_registers(struct lightrec_state *state);
 
 __api u32 lightrec_current_cycle_count(const struct lightrec_state *state);
 __api void lightrec_reset_cycle_count(struct lightrec_state *state, u32 cycles);
 __api void lightrec_set_target_cycle_count(struct lightrec_state *state,
                                           u32 cycles);
 
-__api unsigned int lightrec_get_mem_usage(enum mem_type type);
-__api unsigned int lightrec_get_total_mem_usage(void);
-__api float lightrec_get_average_ipi(void);
-
 #ifdef __cplusplus
 };
 #endif
index 2e6b99b..d39b669 100644 (file)
@@ -1,18 +1,9 @@
+// SPDX-License-Identifier: LGPL-2.1-or-later
 /*
- * Copyright (C) 2019-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2019-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
-#include "config.h"
+#include "lightrec-config.h"
 #include "lightrec-private.h"
 #include "memmanager.h"
 
index bd5028d..b14749f 100644 (file)
@@ -1,15 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
 /*
- * Copyright (C) 2019-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2019-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #ifndef __MEMMANAGER_H__
@@ -17,6 +8,14 @@
 
 #include "lightrec.h"
 
+enum mem_type {
+       MEM_FOR_CODE,
+       MEM_FOR_MIPS_CODE,
+       MEM_FOR_IR,
+       MEM_FOR_LIGHTREC,
+       MEM_TYPE_END,
+};
+
 void * lightrec_malloc(struct lightrec_state *state,
                       enum mem_type type, unsigned int len);
 void * lightrec_calloc(struct lightrec_state *state,
@@ -27,4 +26,8 @@ void lightrec_free(struct lightrec_state *state,
 void lightrec_register(enum mem_type type, unsigned int len);
 void lightrec_unregister(enum mem_type type, unsigned int len);
 
+unsigned int lightrec_get_mem_usage(enum mem_type type);
+unsigned int lightrec_get_total_mem_usage(void);
+float lightrec_get_average_ipi(void);
+
 #endif /* __MEMMANAGER_H__ */
index cf431f2..98a26f6 100644 (file)
@@ -1,17 +1,9 @@
+// SPDX-License-Identifier: LGPL-2.1-or-later
 /*
- * Copyright (C) 2014-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2014-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
+#include "lightrec-config.h"
 #include "disassembler.h"
 #include "lightrec.h"
 #include "memmanager.h"
 #include <errno.h>
 #include <stdbool.h>
 #include <stdlib.h>
+#include <string.h>
+
+#define IF_OPT(opt, ptr) ((opt) ? (ptr) : NULL)
 
 struct optimizer_list {
        void (**optimizers)(struct opcode *);
        unsigned int nb_optimizers;
 };
 
-bool opcode_reads_register(union code op, u8 reg)
+static bool is_nop(union code op);
+
+bool is_unconditional_jump(union code c)
+{
+       switch (c.i.op) {
+       case OP_SPECIAL:
+               return c.r.op == OP_SPECIAL_JR || c.r.op == OP_SPECIAL_JALR;
+       case OP_J:
+       case OP_JAL:
+               return true;
+       case OP_BEQ:
+       case OP_BLEZ:
+               return c.i.rs == c.i.rt;
+       case OP_REGIMM:
+               return (c.r.rt == OP_REGIMM_BGEZ ||
+                       c.r.rt == OP_REGIMM_BGEZAL) && c.i.rs == 0;
+       default:
+               return false;
+       }
+}
+
+bool is_syscall(union code c)
+{
+       return (c.i.op == OP_SPECIAL && c.r.op == OP_SPECIAL_SYSCALL) ||
+               (c.i.op == OP_CP0 && (c.r.rs == OP_CP0_MTC0 ||
+                                       c.r.rs == OP_CP0_CTC0) &&
+                (c.r.rd == 12 || c.r.rd == 13));
+}
+
+static u64 opcode_read_mask(union code op)
 {
        switch (op.i.op) {
        case OP_SPECIAL:
                switch (op.r.op) {
                case OP_SPECIAL_SYSCALL:
                case OP_SPECIAL_BREAK:
-                       return false;
+                       return 0;
                case OP_SPECIAL_JR:
                case OP_SPECIAL_JALR:
                case OP_SPECIAL_MTHI:
                case OP_SPECIAL_MTLO:
-                       return op.r.rs == reg;
+                       return BIT(op.r.rs);
                case OP_SPECIAL_MFHI:
-                       return reg == REG_HI;
+                       return BIT(REG_HI);
                case OP_SPECIAL_MFLO:
-                       return reg == REG_LO;
+                       return BIT(REG_LO);
                case OP_SPECIAL_SLL:
                case OP_SPECIAL_SRL:
                case OP_SPECIAL_SRA:
-                       return op.r.rt == reg;
+                       return BIT(op.r.rt);
                default:
-                       return op.r.rs == reg || op.r.rt == reg;
+                       return BIT(op.r.rs) | BIT(op.r.rt);
                }
        case OP_CP0:
                switch (op.r.rs) {
                case OP_CP0_MTC0:
                case OP_CP0_CTC0:
-                       return op.r.rt == reg;
+                       return BIT(op.r.rt);
                default:
-                       return false;
+                       return 0;
                }
        case OP_CP2:
                if (op.r.op == OP_CP2_BASIC) {
                        switch (op.r.rs) {
                        case OP_CP2_BASIC_MTC2:
                        case OP_CP2_BASIC_CTC2:
-                               return op.r.rt == reg;
+                               return BIT(op.r.rt);
                        default:
-                               return false;
+                               break;
                        }
-               } else {
-                       return false;
                }
+               return 0;
        case OP_J:
        case OP_JAL:
        case OP_LUI:
-               return false;
+               return 0;
        case OP_BEQ:
        case OP_BNE:
        case OP_LWL:
@@ -84,33 +107,45 @@ bool opcode_reads_register(union code op, u8 reg)
        case OP_SWL:
        case OP_SW:
        case OP_SWR:
-               return op.i.rs == reg || op.i.rt == reg;
+               return BIT(op.i.rs) | BIT(op.i.rt);
        default:
-               return op.i.rs == reg;
+               return BIT(op.i.rs);
        }
 }
 
-bool opcode_writes_register(union code op, u8 reg)
+static u64 opcode_write_mask(union code op)
 {
+       u64 flags;
+
        switch (op.i.op) {
        case OP_SPECIAL:
                switch (op.r.op) {
                case OP_SPECIAL_JR:
-               case OP_SPECIAL_JALR:
                case OP_SPECIAL_SYSCALL:
                case OP_SPECIAL_BREAK:
-                       return false;
+                       return 0;
                case OP_SPECIAL_MULT:
                case OP_SPECIAL_MULTU:
                case OP_SPECIAL_DIV:
                case OP_SPECIAL_DIVU:
-                       return reg == REG_LO || reg == REG_HI;
+                       if (!OPT_FLAG_MULT_DIV)
+                               return BIT(REG_LO) | BIT(REG_HI);
+
+                       if (op.r.rd)
+                               flags = BIT(op.r.rd);
+                       else
+                               flags = BIT(REG_LO);
+                       if (op.r.imm)
+                               flags |= BIT(op.r.imm);
+                       else
+                               flags |= BIT(REG_HI);
+                       return flags;
                case OP_SPECIAL_MTHI:
-                       return reg == REG_HI;
+                       return BIT(REG_HI);
                case OP_SPECIAL_MTLO:
-                       return reg == REG_LO;
+                       return BIT(REG_LO);
                default:
-                       return op.r.rd == reg;
+                       return BIT(op.r.rd);
                }
        case OP_ADDI:
        case OP_ADDIU:
@@ -127,34 +162,199 @@ bool opcode_writes_register(union code op, u8 reg)
        case OP_LBU:
        case OP_LHU:
        case OP_LWR:
-               return op.i.rt == reg;
+               return BIT(op.i.rt);
+       case OP_JAL:
+               return BIT(31);
        case OP_CP0:
                switch (op.r.rs) {
                case OP_CP0_MFC0:
                case OP_CP0_CFC0:
-                       return op.i.rt == reg;
+                       return BIT(op.i.rt);
                default:
-                       return false;
+                       return 0;
                }
        case OP_CP2:
                if (op.r.op == OP_CP2_BASIC) {
                        switch (op.r.rs) {
                        case OP_CP2_BASIC_MFC2:
                        case OP_CP2_BASIC_CFC2:
-                               return op.i.rt == reg;
+                               return BIT(op.i.rt);
                        default:
-                               return false;
+                               break;
                        }
-               } else {
-                       return false;
+               }
+               return 0;
+       case OP_REGIMM:
+               switch (op.r.rt) {
+               case OP_REGIMM_BLTZAL:
+               case OP_REGIMM_BGEZAL:
+                       return BIT(31);
+               default:
+                       return 0;
                }
        case OP_META_MOV:
-               return op.r.rd == reg;
+               return BIT(op.r.rd);
        default:
+               return 0;
+       }
+}
+
+bool opcode_reads_register(union code op, u8 reg)
+{
+       return opcode_read_mask(op) & BIT(reg);
+}
+
+bool opcode_writes_register(union code op, u8 reg)
+{
+       return opcode_write_mask(op) & BIT(reg);
+}
+
+static int find_prev_writer(const struct opcode *list, unsigned int offset, u8 reg)
+{
+       union code c;
+       unsigned int i;
+
+       if (list[offset].flags & LIGHTREC_SYNC)
+               return -1;
+
+       for (i = offset; i > 0; i--) {
+               c = list[i - 1].c;
+
+               if (opcode_writes_register(c, reg)) {
+                       if (i > 1 && has_delay_slot(list[i - 2].c))
+                               break;
+
+                       return i - 1;
+               }
+
+               if ((list[i - 1].flags & LIGHTREC_SYNC) ||
+                   has_delay_slot(c) ||
+                   opcode_reads_register(c, reg))
+                       break;
+       }
+
+       return -1;
+}
+
+static int find_next_reader(const struct opcode *list, unsigned int offset, u8 reg)
+{
+       unsigned int i;
+       union code c;
+
+       if (list[offset].flags & LIGHTREC_SYNC)
+               return -1;
+
+       for (i = offset; ; i++) {
+               c = list[i].c;
+
+               if (opcode_reads_register(c, reg)) {
+                       if (i > 0 && has_delay_slot(list[i - 1].c))
+                               break;
+
+                       return i;
+               }
+
+               if ((list[i].flags & LIGHTREC_SYNC) ||
+                   has_delay_slot(c) || opcode_writes_register(c, reg))
+                       break;
+       }
+
+       return -1;
+}
+
+static bool reg_is_dead(const struct opcode *list, unsigned int offset, u8 reg)
+{
+       unsigned int i;
+
+       if (list[offset].flags & LIGHTREC_SYNC)
                return false;
+
+       for (i = offset + 1; ; i++) {
+               if (opcode_reads_register(list[i].c, reg))
+                       return false;
+
+               if (opcode_writes_register(list[i].c, reg))
+                       return true;
+
+               if (has_delay_slot(list[i].c)) {
+                       if (list[i].flags & LIGHTREC_NO_DS)
+                               return false;
+
+                       return opcode_writes_register(list[i + 1].c, reg);
+               }
        }
 }
 
+static bool reg_is_read(const struct opcode *list,
+                       unsigned int a, unsigned int b, u8 reg)
+{
+       /* Return true if reg is read in one of the opcodes of the interval
+        * [a, b[ */
+       for (; a < b; a++) {
+               if (!is_nop(list[a].c) && opcode_reads_register(list[a].c, reg))
+                       return true;
+       }
+
+       return false;
+}
+
+static bool reg_is_written(const struct opcode *list,
+                          unsigned int a, unsigned int b, u8 reg)
+{
+       /* Return true if reg is written in one of the opcodes of the interval
+        * [a, b[ */
+
+       for (; a < b; a++) {
+               if (!is_nop(list[a].c) && opcode_writes_register(list[a].c, reg))
+                       return true;
+       }
+
+       return false;
+}
+
+static bool reg_is_read_or_written(const struct opcode *list,
+                                  unsigned int a, unsigned int b, u8 reg)
+{
+       return reg_is_read(list, a, b, reg) || reg_is_written(list, a, b, reg);
+}
+
+static bool opcode_is_load(union code op)
+{
+       switch (op.i.op) {
+       case OP_LB:
+       case OP_LH:
+       case OP_LWL:
+       case OP_LW:
+       case OP_LBU:
+       case OP_LHU:
+       case OP_LWR:
+       case OP_LWC2:
+               return true;
+       default:
+               return false;
+       }
+}
+
+static bool opcode_is_store(union code op)
+{
+       switch (op.i.op) {
+       case OP_SB:
+       case OP_SH:
+       case OP_SW:
+       case OP_SWL:
+       case OP_SWR:
+       case OP_SWC2:
+               return true;
+       default:
+               return false;
+       }
+}
+
+bool opcode_is_io(union code op)
+{
+       return opcode_is_load(op) || opcode_is_store(op);
+}
+
 /* TODO: Complete */
 static bool is_nop(union code op)
 {
@@ -196,6 +396,9 @@ static bool is_nop(union code op)
                case OP_SPECIAL_SRA:
                case OP_SPECIAL_SRL:
                        return op.r.rd == op.r.rt && op.r.imm == 0;
+               case OP_SPECIAL_MFHI:
+               case OP_SPECIAL_MFLO:
+                       return op.r.rd == 0;
                default:
                        return false;
                }
@@ -256,8 +459,13 @@ bool load_in_delay_slot(union code op)
        return false;
 }
 
-static u32 lightrec_propagate_consts(union code c, u32 known, u32 *v)
+static u32 lightrec_propagate_consts(const struct opcode *op, u32 known, u32 *v)
 {
+       union code c = op->c;
+
+       if (op->flags & LIGHTREC_SYNC)
+               return 0;
+
        switch (c.i.op) {
        case OP_SPECIAL:
                switch (c.r.op) {
@@ -478,77 +686,199 @@ static u32 lightrec_propagate_consts(union code c, u32 known, u32 *v)
        return known;
 }
 
-static int lightrec_add_meta(struct block *block,
-                            struct opcode *op, union code code)
+static void lightrec_optimize_sll_sra(struct opcode *list, unsigned int offset)
 {
-       struct opcode *meta;
+       struct opcode *prev, *prev2 = NULL, *curr = &list[offset];
+       struct opcode *to_change, *to_nop;
+       int idx, idx2;
 
-       meta = lightrec_malloc(block->state, MEM_FOR_IR, sizeof(*meta));
-       if (!meta)
-               return -ENOMEM;
+       if (curr->r.imm != 24 && curr->r.imm != 16)
+               return;
+
+       idx = find_prev_writer(list, offset, curr->r.rt);
+       if (idx < 0)
+               return;
+
+       prev = &list[idx];
+
+       if (prev->i.op != OP_SPECIAL || prev->r.op != OP_SPECIAL_SLL ||
+           prev->r.imm != curr->r.imm || prev->r.rd != curr->r.rt)
+               return;
 
-       meta->c = code;
-       meta->flags = 0;
+       if (prev->r.rd != prev->r.rt && curr->r.rd != curr->r.rt) {
+               /* sll rY, rX, 16
+                * ...
+                * srl rZ, rY, 16 */
 
-       if (op) {
-               meta->offset = op->offset;
-               meta->next = op->next;
-               op->next = meta;
+               if (!reg_is_dead(list, offset, curr->r.rt) ||
+                   reg_is_read_or_written(list, idx, offset, curr->r.rd))
+                       return;
+
+               /* If rY is dead after the SRL, and rZ is not used after the SLL,
+                * we can change rY to rZ */
+
+               pr_debug("Detected SLL/SRA with middle temp register\n");
+               prev->r.rd = curr->r.rd;
+               curr->r.rt = prev->r.rd;
+       }
+
+       /* We got a SLL/SRA combo. If imm #16, that's a cast to u16.
+        * If imm #24 that's a cast to u8.
+        *
+        * First of all, make sure that the target register of the SLL is not
+        * read before the SRA. */
+
+       if (prev->r.rd == prev->r.rt) {
+               /* sll rX, rX, 16
+                * ...
+                * srl rY, rX, 16 */
+               to_change = curr;
+               to_nop = prev;
+
+               /* rX is used after the SRA - we cannot convert it. */
+               if (prev->r.rd != curr->r.rd && !reg_is_dead(list, offset, prev->r.rd))
+                       return;
        } else {
-               meta->offset = 0;
-               meta->next = block->opcode_list;
-               block->opcode_list = meta;
+               /* sll rY, rX, 16
+                * ...
+                * srl rY, rY, 16 */
+               to_change = prev;
+               to_nop = curr;
        }
 
-       return 0;
-}
+       idx2 = find_prev_writer(list, idx, prev->r.rt);
+       if (idx2 >= 0) {
+               /* Note that PSX games sometimes do casts after
+                * a LHU or LBU; in this case we can change the
+                * load opcode to a LH or LB, and the cast can
+                * be changed to a MOV or a simple NOP. */
+
+               prev2 = &list[idx2];
+
+               if (curr->r.rd != prev2->i.rt &&
+                   !reg_is_dead(list, offset, prev2->i.rt))
+                       prev2 = NULL;
+               else if (curr->r.imm == 16 && prev2->i.op == OP_LHU)
+                       prev2->i.op = OP_LH;
+               else if (curr->r.imm == 24 && prev2->i.op == OP_LBU)
+                       prev2->i.op = OP_LB;
+               else
+                       prev2 = NULL;
+
+               if (prev2) {
+                       if (curr->r.rd == prev2->i.rt) {
+                               to_change->opcode = 0;
+                       } else if (reg_is_dead(list, offset, prev2->i.rt) &&
+                                  !reg_is_read_or_written(list, idx2 + 1, offset, curr->r.rd)) {
+                               /* The target register of the SRA is dead after the
+                                * LBU/LHU; we can change the target register of the
+                                * LBU/LHU to the one of the SRA. */
+                               prev2->i.rt = curr->r.rd;
+                               to_change->opcode = 0;
+                       } else {
+                               to_change->i.op = OP_META_MOV;
+                               to_change->r.rd = curr->r.rd;
+                               to_change->r.rs = prev2->i.rt;
+                       }
 
-static int lightrec_add_sync(struct block *block, struct opcode *prev)
-{
-       return lightrec_add_meta(block, prev, (union code){
-                                .j.op = OP_META_SYNC,
-                                });
+                       if (to_nop->r.imm == 24)
+                               pr_debug("Convert LBU+SLL+SRA to LB\n");
+                       else
+                               pr_debug("Convert LHU+SLL+SRA to LH\n");
+               }
+       }
+
+       if (!prev2) {
+               pr_debug("Convert SLL/SRA #%u to EXT%c\n",
+                        prev->r.imm,
+                        prev->r.imm == 24 ? 'C' : 'S');
+
+               if (to_change == prev) {
+                       to_change->i.rs = prev->r.rt;
+                       to_change->i.rt = curr->r.rd;
+               } else {
+                       to_change->i.rt = curr->r.rd;
+                       to_change->i.rs = prev->r.rt;
+               }
+
+               if (to_nop->r.imm == 24)
+                       to_change->i.op = OP_META_EXTC;
+               else
+                       to_change->i.op = OP_META_EXTS;
+       }
+
+       to_nop->opcode = 0;
 }
 
-static int lightrec_transform_ops(struct block *block)
+static int lightrec_transform_ops(struct lightrec_state *state, struct block *block)
 {
        struct opcode *list = block->opcode_list;
+       struct opcode *op;
+       u32 known = BIT(0);
+       u32 values[32] = { 0 };
+       unsigned int i;
+       int reader;
 
-       for (; list; list = list->next) {
+       for (i = 0; i < block->nb_ops; i++) {
+               op = &list[i];
 
                /* Transform all opcodes detected as useless to real NOPs
                 * (0x0: SLL r0, r0, #0) */
-               if (list->opcode != 0 && is_nop(list->c)) {
+               if (op->opcode != 0 && is_nop(op->c)) {
                        pr_debug("Converting useless opcode 0x%08x to NOP\n",
-                                       list->opcode);
-                       list->opcode = 0x0;
+                                       op->opcode);
+                       op->opcode = 0x0;
                }
 
-               if (!list->opcode)
+               if (!op->opcode)
                        continue;
 
-               switch (list->i.op) {
-               /* Transform BEQ / BNE to BEQZ / BNEZ meta-opcodes if one of the
-                * two registers is zero. */
+               /* Register $zero is always, well, zero */
+               known |= BIT(0);
+               values[0] = 0;
+
+               switch (op->i.op) {
                case OP_BEQ:
-                       if ((list->i.rs == 0) ^ (list->i.rt == 0)) {
-                               list->i.op = OP_META_BEQZ;
-                               if (list->i.rs == 0) {
-                                       list->i.rs = list->i.rt;
-                                       list->i.rt = 0;
-                               }
-                       } else if (list->i.rs == list->i.rt) {
-                               list->i.rs = 0;
-                               list->i.rt = 0;
+                       if (op->i.rs == op->i.rt) {
+                               op->i.rs = 0;
+                               op->i.rt = 0;
+                       } else if (op->i.rs == 0) {
+                               op->i.rs = op->i.rt;
+                               op->i.rt = 0;
                        }
                        break;
+
                case OP_BNE:
-                       if (list->i.rs == 0) {
-                               list->i.op = OP_META_BNEZ;
-                               list->i.rs = list->i.rt;
-                               list->i.rt = 0;
-                       } else if (list->i.rt == 0) {
-                               list->i.op = OP_META_BNEZ;
+                       if (op->i.rs == 0) {
+                               op->i.rs = op->i.rt;
+                               op->i.rt = 0;
+                       }
+                       break;
+
+               case OP_LUI:
+                       if (!(op->flags & LIGHTREC_SYNC) &&
+                           (known & BIT(op->i.rt)) &&
+                           values[op->i.rt] == op->i.imm << 16) {
+                               pr_debug("Converting duplicated LUI to NOP\n");
+                               op->opcode = 0x0;
+                       }
+
+                       if (op->i.imm != 0 || op->i.rt == 0)
+                               break;
+
+                       reader = find_next_reader(list, i + 1, op->i.rt);
+                       if (reader > 0 &&
+                           (opcode_writes_register(list[reader].c, op->i.rt) ||
+                            reg_is_dead(list, reader, op->i.rt))) {
+
+                               pr_debug("Removing useless LUI 0x0\n");
+
+                               if (list[reader].i.rs == op->i.rt)
+                                       list[reader].i.rs = 0;
+                               if (list[reader].i.op == OP_SPECIAL &&
+                                   list[reader].i.rt == op->i.rt)
+                                       list[reader].i.rt = 0;
+                               op->opcode = 0x0;
                        }
                        break;
 
@@ -557,36 +887,45 @@ static int lightrec_transform_ops(struct block *block)
                case OP_ORI:
                case OP_ADDI:
                case OP_ADDIU:
-                       if (list->i.imm == 0) {
+                       if (op->i.imm == 0) {
                                pr_debug("Convert ORI/ADDI/ADDIU #0 to MOV\n");
-                               list->i.op = OP_META_MOV;
-                               list->r.rd = list->i.rt;
+                               op->i.op = OP_META_MOV;
+                               op->r.rd = op->i.rt;
                        }
                        break;
                case OP_SPECIAL:
-                       switch (list->r.op) {
-                       case OP_SPECIAL_SLL:
+                       switch (op->r.op) {
                        case OP_SPECIAL_SRA:
+                               if (op->r.imm == 0) {
+                                       pr_debug("Convert SRA #0 to MOV\n");
+                                       op->i.op = OP_META_MOV;
+                                       op->r.rs = op->r.rt;
+                                       break;
+                               }
+
+                               lightrec_optimize_sll_sra(block->opcode_list, i);
+                               break;
+                       case OP_SPECIAL_SLL:
                        case OP_SPECIAL_SRL:
-                               if (list->r.imm == 0) {
-                                       pr_debug("Convert SLL/SRL/SRA #0 to MOV\n");
-                                       list->i.op = OP_META_MOV;
-                                       list->r.rs = list->r.rt;
+                               if (op->r.imm == 0) {
+                                       pr_debug("Convert SLL/SRL #0 to MOV\n");
+                                       op->i.op = OP_META_MOV;
+                                       op->r.rs = op->r.rt;
                                }
                                break;
                        case OP_SPECIAL_OR:
                        case OP_SPECIAL_ADD:
                        case OP_SPECIAL_ADDU:
-                               if (list->r.rs == 0) {
+                               if (op->r.rs == 0) {
                                        pr_debug("Convert OR/ADD $zero to MOV\n");
-                                       list->i.op = OP_META_MOV;
-                                       list->r.rs = list->r.rt;
+                                       op->i.op = OP_META_MOV;
+                                       op->r.rs = op->r.rt;
                                }
                        case OP_SPECIAL_SUB: /* fall-through */
                        case OP_SPECIAL_SUBU:
-                               if (list->r.rt == 0) {
+                               if (op->r.rt == 0) {
                                        pr_debug("Convert OR/ADD/SUB $zero to MOV\n");
-                                       list->i.op = OP_META_MOV;
+                                       op->i.op = OP_META_MOV;
                                }
                        default: /* fall-through */
                                break;
@@ -594,27 +933,37 @@ static int lightrec_transform_ops(struct block *block)
                default: /* fall-through */
                        break;
                }
+
+               known = lightrec_propagate_consts(op, known, values);
        }
 
        return 0;
 }
 
-static int lightrec_switch_delay_slots(struct block *block)
+static int lightrec_switch_delay_slots(struct lightrec_state *state, struct block *block)
 {
-       struct opcode *list, *prev;
+       struct opcode *list, *next = &block->opcode_list[0];
+       unsigned int i;
+       union code op, next_op;
        u8 flags;
 
-       for (list = block->opcode_list, prev = NULL; list->next;
-            prev = list, list = list->next) {
-               union code op = list->c;
-               union code next_op = list->next->c;
+       for (i = 0; i < block->nb_ops - 1; i++) {
+               list = next;
+               next = &block->opcode_list[i + 1];
+               next_op = next->c;
+               op = list->c;
 
                if (!has_delay_slot(op) ||
                    list->flags & (LIGHTREC_NO_DS | LIGHTREC_EMULATE_BRANCH) ||
-                   op.opcode == 0)
+                   op.opcode == 0 || next_op.opcode == 0)
+                       continue;
+
+               if (i && has_delay_slot(block->opcode_list[i - 1].c) &&
+                   !(block->opcode_list[i - 1].flags & LIGHTREC_NO_DS))
                        continue;
 
-               if (prev && has_delay_slot(prev->c))
+               if ((list->flags & LIGHTREC_SYNC) ||
+                   (next->flags & LIGHTREC_SYNC))
                        continue;
 
                switch (list->i.op) {
@@ -644,8 +993,6 @@ static int lightrec_switch_delay_slots(struct block *block)
                                continue;
                case OP_BLEZ: /* fall-through */
                case OP_BGTZ:
-               case OP_META_BEQZ:
-               case OP_META_BNEZ:
                        if (op.i.rs && opcode_writes_register(next_op, op.i.rs))
                                continue;
                        break;
@@ -668,27 +1015,60 @@ static int lightrec_switch_delay_slots(struct block *block)
                }
 
                pr_debug("Swap branch and delay slot opcodes "
-                        "at offsets 0x%x / 0x%x\n", list->offset << 2,
-                        list->next->offset << 2);
+                        "at offsets 0x%x / 0x%x\n",
+                        i << 2, (i + 1) << 2);
 
-               flags = list->next->flags;
+               flags = next->flags;
                list->c = next_op;
-               list->next->c = op;
-               list->next->flags = list->flags | LIGHTREC_NO_DS;
+               next->c = op;
+               next->flags = list->flags | LIGHTREC_NO_DS;
                list->flags = flags | LIGHTREC_NO_DS;
-               list->offset++;
-               list->next->offset--;
        }
 
        return 0;
 }
 
-static int lightrec_detect_impossible_branches(struct block *block)
+static int shrink_opcode_list(struct lightrec_state *state, struct block *block, u16 new_size)
+{
+       struct opcode *list;
+
+       if (new_size >= block->nb_ops) {
+               pr_err("Invalid shrink size (%u vs %u)\n",
+                      new_size, block->nb_ops);
+               return -EINVAL;
+       }
+
+
+       list = lightrec_malloc(state, MEM_FOR_IR,
+                              sizeof(*list) * new_size);
+       if (!list) {
+               pr_err("Unable to allocate memory\n");
+               return -ENOMEM;
+       }
+
+       memcpy(list, block->opcode_list, sizeof(*list) * new_size);
+
+       lightrec_free_opcode_list(state, block);
+       block->opcode_list = list;
+       block->nb_ops = new_size;
+
+       pr_debug("Shrunk opcode list of block PC 0x%08x to %u opcodes\n",
+                block->pc, new_size);
+
+       return 0;
+}
+
+static int lightrec_detect_impossible_branches(struct lightrec_state *state,
+                                              struct block *block)
 {
-       struct opcode *op, *next;
+       struct opcode *op, *next = &block->opcode_list[0];
+       unsigned int i;
+       int ret = 0;
+
+       for (i = 0; i < block->nb_ops - 1; i++) {
+               op = next;
+               next = &block->opcode_list[i + 1];
 
-       for (op = block->opcode_list, next = op->next; next;
-            op = next, next = op->next) {
                if (!has_delay_slot(op->c) ||
                    (!load_in_delay_slot(next->c) &&
                     !has_delay_slot(next->c) &&
@@ -702,29 +1082,34 @@ static int lightrec_detect_impossible_branches(struct block *block)
                        continue;
                }
 
+               op->flags |= LIGHTREC_EMULATE_BRANCH;
+
                if (op == block->opcode_list) {
+                       pr_debug("First opcode of block PC 0x%08x is an impossible branch\n",
+                                block->pc);
+
                        /* If the first opcode is an 'impossible' branch, we
                         * only keep the first two opcodes of the block (the
                         * branch itself + its delay slot) */
-                       lightrec_free_opcode_list(block->state, next->next);
-                       next->next = NULL;
-                       block->nb_ops = 2;
+                       if (block->nb_ops > 2)
+                               ret = shrink_opcode_list(state, block, 2);
+                       break;
                }
-
-               op->flags |= LIGHTREC_EMULATE_BRANCH;
        }
 
-       return 0;
+       return ret;
 }
 
-static int lightrec_local_branches(struct block *block)
+static int lightrec_local_branches(struct lightrec_state *state, struct block *block)
 {
-       struct opcode *list, *target, *prev;
+       struct opcode *list;
+       unsigned int i;
        s32 offset;
-       int ret;
 
-       for (list = block->opcode_list; list; list = list->next) {
-               if (list->flags & LIGHTREC_EMULATE_BRANCH)
+       for (i = 0; i < block->nb_ops; i++) {
+               list = &block->opcode_list[i];
+
+               if (should_emulate(list))
                        continue;
 
                switch (list->i.op) {
@@ -733,9 +1118,7 @@ static int lightrec_local_branches(struct block *block)
                case OP_BLEZ:
                case OP_BGTZ:
                case OP_REGIMM:
-               case OP_META_BEQZ:
-               case OP_META_BNEZ:
-                       offset = list->offset + 1 + (s16)list->i.imm;
+                       offset = i + 1 + (s16)list->i.imm;
                        if (offset >= 0 && offset < block->nb_ops)
                                break;
                default: /* fall-through */
@@ -744,37 +1127,20 @@ static int lightrec_local_branches(struct block *block)
 
                pr_debug("Found local branch to offset 0x%x\n", offset << 2);
 
-               for (target = block->opcode_list, prev = NULL;
-                    target; prev = target, target = target->next) {
-                       if (target->offset != offset ||
-                           target->j.op == OP_META_SYNC)
-                               continue;
-
-                       if (target->flags & LIGHTREC_EMULATE_BRANCH) {
-                               pr_debug("Branch target must be emulated"
-                                        " - skip\n");
-                               break;
-                       }
-
-                       if (prev && has_delay_slot(prev->c)) {
-                               pr_debug("Branch target is a delay slot"
-                                        " - skip\n");
-                               break;
-                       }
+               if (should_emulate(&block->opcode_list[offset])) {
+                       pr_debug("Branch target must be emulated - skip\n");
+                       continue;
+               }
 
-                       if (prev && prev->j.op != OP_META_SYNC) {
-                               pr_debug("Adding sync before offset "
-                                        "0x%x\n", offset << 2);
-                               ret = lightrec_add_sync(block, prev);
-                               if (ret)
-                                       return ret;
+               if (offset && has_delay_slot(block->opcode_list[offset - 1].c)) {
+                       pr_debug("Branch target is a delay slot - skip\n");
+                       continue;
+               }
 
-                               prev->next->offset = target->offset;
-                       }
+               pr_debug("Adding sync at offset 0x%x\n", offset << 2);
 
-                       list->flags |= LIGHTREC_LOCAL_BRANCH;
-                       break;
-               }
+               block->opcode_list[offset].flags |= LIGHTREC_SYNC;
+               list->flags |= LIGHTREC_LOCAL_BRANCH;
        }
 
        return 0;
@@ -798,77 +1164,80 @@ bool has_delay_slot(union code op)
        case OP_BLEZ:
        case OP_BGTZ:
        case OP_REGIMM:
-       case OP_META_BEQZ:
-       case OP_META_BNEZ:
                return true;
        default:
                return false;
        }
 }
 
-static int lightrec_add_unload(struct block *block, struct opcode *op, u8 reg)
+bool should_emulate(const struct opcode *list)
 {
-       return lightrec_add_meta(block, op, (union code){
-                                .i.op = OP_META_REG_UNLOAD,
-                                .i.rs = reg,
-                                });
+       return has_delay_slot(list->c) &&
+               (list->flags & LIGHTREC_EMULATE_BRANCH);
 }
 
-static int lightrec_early_unload(struct block *block)
+static void lightrec_add_unload(struct opcode *op, u8 reg)
 {
-       struct opcode *list = block->opcode_list;
-       u8 i;
+       if (op->i.op == OP_SPECIAL && reg == op->r.rd)
+               op->flags |= LIGHTREC_UNLOAD_RD;
 
-       for (i = 1; i < 34; i++) {
-               struct opcode *op, *last_r = NULL, *last_w = NULL;
-               unsigned int last_r_id = 0, last_w_id = 0, id = 0;
-               int ret;
+       if (op->i.rs == reg)
+               op->flags |= LIGHTREC_UNLOAD_RS;
+       if (op->i.rt == reg)
+               op->flags |= LIGHTREC_UNLOAD_RT;
+}
 
-               for (op = list; op->next; op = op->next, id++) {
-                       if (opcode_reads_register(op->c, i)) {
-                               last_r = op;
-                               last_r_id = id;
-                       }
+static int lightrec_early_unload(struct lightrec_state *state, struct block *block)
+{
+       unsigned int i, offset;
+       struct opcode *op;
+       u8 reg;
 
-                       if (opcode_writes_register(op->c, i)) {
-                               last_w = op;
-                               last_w_id = id;
-                       }
+       for (reg = 1; reg < 34; reg++) {
+               int last_r_id = -1, last_w_id = -1;
+
+               for (i = 0; i < block->nb_ops; i++) {
+                       union code c = block->opcode_list[i].c;
+
+                       if (opcode_reads_register(c, reg))
+                               last_r_id = i;
+                       if (opcode_writes_register(c, reg))
+                               last_w_id = i;
                }
 
-               if (last_w_id > last_r_id) {
-                       if (has_delay_slot(last_w->c) &&
-                           !(last_w->flags & LIGHTREC_NO_DS))
-                               last_w = last_w->next;
+               if (last_w_id > last_r_id)
+                       offset = (unsigned int)last_w_id;
+               else if (last_r_id >= 0)
+                       offset = (unsigned int)last_r_id;
+               else
+                       continue;
 
-                       if (last_w->next) {
-                               ret = lightrec_add_unload(block, last_w, i);
-                               if (ret)
-                                       return ret;
-                       }
-               } else if (last_r) {
-                       if (has_delay_slot(last_r->c) &&
-                           !(last_r->flags & LIGHTREC_NO_DS))
-                               last_r = last_r->next;
+               op = &block->opcode_list[offset];
 
-                       if (last_r->next) {
-                               ret = lightrec_add_unload(block, last_r, i);
-                               if (ret)
-                                       return ret;
-                       }
-               }
+               if (has_delay_slot(op->c) && (op->flags & LIGHTREC_NO_DS))
+                       offset++;
+
+               if (offset == block->nb_ops)
+                       continue;
+
+               lightrec_add_unload(&block->opcode_list[offset], reg);
        }
 
        return 0;
 }
 
-static int lightrec_flag_stores(struct block *block)
+static int lightrec_flag_io(struct lightrec_state *state, struct block *block)
 {
+       const struct lightrec_mem_map *map;
        struct opcode *list;
        u32 known = BIT(0);
        u32 values[32] = { 0 };
+       unsigned int i;
+       u32 val;
+
+       for (i = 0; i < block->nb_ops; i++) {
+               list = &block->opcode_list[i];
 
-       for (list = block->opcode_list; list; list = list->next) {
                /* Register $zero is always, well, zero */
                known |= BIT(0);
                values[0] = 0;
@@ -877,145 +1246,464 @@ static int lightrec_flag_stores(struct block *block)
                case OP_SB:
                case OP_SH:
                case OP_SW:
-                       /* Mark all store operations that target $sp or $gp
-                        * as not requiring code invalidation. This is based
-                        * on the heuristic that stores using one of these
-                        * registers as address will never hit a code page. */
-                       if (list->i.rs >= 28 && list->i.rs <= 29 &&
-                           !block->state->maps[PSX_MAP_KERNEL_USER_RAM].ops) {
-                               pr_debug("Flaging opcode 0x%08x as not requiring invalidation\n",
-                                        list->opcode);
-                               list->flags |= LIGHTREC_NO_INVALIDATE;
-                       }
-
-                       /* Detect writes whose destination address is inside the
-                        * current block, using constant propagation. When these
-                        * occur, we mark the blocks as not compilable. */
-                       if ((known & BIT(list->i.rs)) &&
-                           kunseg(values[list->i.rs]) >= kunseg(block->pc) &&
-                           kunseg(values[list->i.rs]) < (kunseg(block->pc) +
-                                                         block->nb_ops * 4)) {
-                               pr_debug("Self-modifying block detected\n");
-                               block->flags |= BLOCK_NEVER_COMPILE;
-                               list->flags |= LIGHTREC_SMC;
+                       if (OPT_FLAG_STORES) {
+                               /* Mark all store operations that target $sp or $gp
+                                * as not requiring code invalidation. This is based
+                                * on the heuristic that stores using one of these
+                                * registers as address will never hit a code page. */
+                               if (list->i.rs >= 28 && list->i.rs <= 29 &&
+                                   !state->maps[PSX_MAP_KERNEL_USER_RAM].ops) {
+                                       pr_debug("Flaging opcode 0x%08x as not "
+                                                "requiring invalidation\n",
+                                                list->opcode);
+                                       list->flags |= LIGHTREC_NO_INVALIDATE;
+                               }
+
+                               /* Detect writes whose destination address is inside the
+                                * current block, using constant propagation. When these
+                                * occur, we mark the blocks as not compilable. */
+                               if ((known & BIT(list->i.rs)) &&
+                                   kunseg(values[list->i.rs]) >= kunseg(block->pc) &&
+                                   kunseg(values[list->i.rs]) < (kunseg(block->pc) +
+                                                                 block->nb_ops * 4)) {
+                                       pr_debug("Self-modifying block detected\n");
+                                       block->flags |= BLOCK_NEVER_COMPILE;
+                                       list->flags |= LIGHTREC_SMC;
+                               }
+                       }
+               case OP_SWL: /* fall-through */
+               case OP_SWR:
+               case OP_SWC2:
+               case OP_LB:
+               case OP_LBU:
+               case OP_LH:
+               case OP_LHU:
+               case OP_LW:
+               case OP_LWL:
+               case OP_LWR:
+               case OP_LWC2:
+                       if (OPT_FLAG_IO && (known & BIT(list->i.rs))) {
+                               val = kunseg(values[list->i.rs] + (s16) list->i.imm);
+                               map = lightrec_get_map(state, NULL, val);
+
+                               if (!map || map->ops ||
+                                   map == &state->maps[PSX_MAP_PARALLEL_PORT]) {
+                                       pr_debug("Flagging opcode %u as accessing I/O registers\n",
+                                                i);
+                                       list->flags |= LIGHTREC_HW_IO;
+                               } else {
+                                       pr_debug("Flaging opcode %u as direct memory access\n", i);
+                                       list->flags |= LIGHTREC_DIRECT_IO;
+                               }
                        }
                default: /* fall-through */
                        break;
                }
 
-               known = lightrec_propagate_consts(list->c, known, values);
+               known = lightrec_propagate_consts(list, known, values);
        }
 
        return 0;
 }
 
-static bool is_mult32(const struct block *block, const struct opcode *op)
+static u8 get_mfhi_mflo_reg(const struct block *block, u16 offset,
+                           const struct opcode *last,
+                           u32 mask, bool sync, bool mflo, bool another)
 {
-       const struct opcode *next, *last = NULL;
-       u32 offset;
+       const struct opcode *op, *next = &block->opcode_list[offset];
+       u32 old_mask;
+       u8 reg2, reg = mflo ? REG_LO : REG_HI;
+       u16 branch_offset;
+       unsigned int i;
+
+       for (i = offset; i < block->nb_ops; i++) {
+               op = next;
+               next = &block->opcode_list[i + 1];
+               old_mask = mask;
+
+               /* If any other opcode writes or reads to the register
+                * we'd use, then we cannot use it anymore. */
+               mask |= opcode_read_mask(op->c);
+               mask |= opcode_write_mask(op->c);
+
+               if (op->flags & LIGHTREC_SYNC)
+                       sync = true;
 
-       for (op = op->next; op != last; op = op->next) {
                switch (op->i.op) {
                case OP_BEQ:
                case OP_BNE:
                case OP_BLEZ:
                case OP_BGTZ:
                case OP_REGIMM:
-               case OP_META_BEQZ:
-               case OP_META_BNEZ:
                        /* TODO: handle backwards branches too */
-                       if ((op->flags & LIGHTREC_LOCAL_BRANCH) &&
+                       if (!last &&
+                           (op->flags & LIGHTREC_LOCAL_BRANCH) &&
                            (s16)op->c.i.imm >= 0) {
-                               offset = op->offset + 1 + (s16)op->c.i.imm;
-
-                               for (next = op; next->offset != offset;
-                                    next = next->next);
-
-                               if (!is_mult32(block, next))
-                                       return false;
-
-                               last = next;
-                               continue;
-                       } else {
-                               return false;
+                               branch_offset = i + 1 + (s16)op->c.i.imm
+                                       - !!(OPT_SWITCH_DELAY_SLOTS && (op->flags & LIGHTREC_NO_DS));
+
+                               reg = get_mfhi_mflo_reg(block, branch_offset, NULL,
+                                                       mask, sync, mflo, false);
+                               reg2 = get_mfhi_mflo_reg(block, offset + 1, next,
+                                                        mask, sync, mflo, false);
+                               if (reg > 0 && reg == reg2)
+                                       return reg;
+                               if (!reg && !reg2)
+                                       return 0;
                        }
+
+                       return mflo ? REG_LO : REG_HI;
                case OP_SPECIAL:
                        switch (op->r.op) {
                        case OP_SPECIAL_MULT:
                        case OP_SPECIAL_MULTU:
                        case OP_SPECIAL_DIV:
                        case OP_SPECIAL_DIVU:
+                               return 0;
                        case OP_SPECIAL_MTHI:
-                               return true;
+                               if (!mflo)
+                                       return 0;
+                               continue;
+                       case OP_SPECIAL_MTLO:
+                               if (mflo)
+                                       return 0;
+                               continue;
                        case OP_SPECIAL_JR:
-                               return op->r.rs == 31 &&
-                                       ((op->flags & LIGHTREC_NO_DS) ||
-                                        !(op->next->i.op == OP_SPECIAL &&
-                                          op->next->r.op == OP_SPECIAL_MFHI));
+                               if (op->r.rs != 31)
+                                       return reg;
+
+                               if (!sync &&
+                                   !(op->flags & LIGHTREC_NO_DS) &&
+                                   (next->i.op == OP_SPECIAL) &&
+                                   ((!mflo && next->r.op == OP_SPECIAL_MFHI) ||
+                                   (mflo && next->r.op == OP_SPECIAL_MFLO)))
+                                       return next->r.rd;
+
+                               return 0;
                        case OP_SPECIAL_JALR:
+                               return reg;
                        case OP_SPECIAL_MFHI:
-                               return false;
-                       default:
+                               if (!mflo) {
+                                       if (another)
+                                               return op->r.rd;
+                                       /* Must use REG_HI if there is another MFHI target*/
+                                       reg2 = get_mfhi_mflo_reg(block, i + 1, next,
+                                                        0, sync, mflo, true);
+                                       if (reg2 > 0 && reg2 != REG_HI)
+                                               return REG_HI;
+
+                                       if (!sync && !(old_mask & BIT(op->r.rd)))
+                                               return op->r.rd;
+                                       else
+                                               return REG_HI;
+                               }
+                               continue;
+                       case OP_SPECIAL_MFLO:
+                               if (mflo) {
+                                       if (another)
+                                               return op->r.rd;
+                                       /* Must use REG_LO if there is another MFLO target*/
+                                       reg2 = get_mfhi_mflo_reg(block, i + 1, next,
+                                                        0, sync, mflo, true);
+                                       if (reg2 > 0 && reg2 != REG_LO)
+                                               return REG_LO;
+
+                                       if (!sync && !(old_mask & BIT(op->r.rd)))
+                                               return op->r.rd;
+                                       else
+                                               return REG_LO;
+                               }
                                continue;
+                       default:
+                               break;
                        }
+
+                       /* fall-through */
                default:
                        continue;
                }
        }
 
-       return last != NULL;
+       return reg;
+}
+
+static void lightrec_replace_lo_hi(struct block *block, u16 offset,
+                                  u16 last, bool lo)
+{
+       unsigned int i;
+       u32 branch_offset;
+
+       /* This function will remove the following MFLO/MFHI. It must be called
+        * only if get_mfhi_mflo_reg() returned a non-zero value. */
+
+       for (i = offset; i < last; i++) {
+               struct opcode *op = &block->opcode_list[i];
+
+               switch (op->i.op) {
+               case OP_BEQ:
+               case OP_BNE:
+               case OP_BLEZ:
+               case OP_BGTZ:
+               case OP_REGIMM:
+                       /* TODO: handle backwards branches too */
+                       if ((op->flags & LIGHTREC_LOCAL_BRANCH) &&
+                           (s16)op->c.i.imm >= 0) {
+                               branch_offset = i + 1 + (s16)op->c.i.imm
+                                       - !!(OPT_SWITCH_DELAY_SLOTS && (op->flags & LIGHTREC_NO_DS));
+
+                               lightrec_replace_lo_hi(block, branch_offset, last, lo);
+                               lightrec_replace_lo_hi(block, i + 1, branch_offset, lo);
+                       }
+                       break;
+
+               case OP_SPECIAL:
+                       if (lo && op->r.op == OP_SPECIAL_MFLO) {
+                               pr_debug("Removing MFLO opcode at offset 0x%x\n",
+                                        i << 2);
+                               op->opcode = 0;
+                               return;
+                       } else if (!lo && op->r.op == OP_SPECIAL_MFHI) {
+                               pr_debug("Removing MFHI opcode at offset 0x%x\n",
+                                        i << 2);
+                               op->opcode = 0;
+                               return;
+                       }
+
+                       /* fall-through */
+               default:
+                       break;
+               }
+       }
 }
 
-static int lightrec_flag_mults(struct block *block)
+/*
+ * Optimization pass over the MULT(U)/DIV(U) opcodes of a block.
+ *
+ * For each such opcode that is not in a delay slot, get_mfhi_mflo_reg() is
+ * called once for LO and once for HI, starting at the following opcode:
+ *  - a zero result sets LIGHTREC_NO_LO / LIGHTREC_NO_HI on the opcode, unless
+ *    both results are zero, in which case both flags are cleared again;
+ *  - a non-zero result different from REG_LO / REG_HI is stored in r.rd (LO)
+ *    or r.imm (HI) after calling lightrec_replace_lo_hi(); otherwise the
+ *    field is set to 0.
+ *
+ * NOTE(review): the exact meaning of the get_mfhi_mflo_reg() return value is
+ * taken from the debug messages below - confirm against its definition.
+ *
+ * 'state' is not used here. Always returns 0.
+ */
+static int lightrec_flag_mults_divs(struct lightrec_state *state, struct block *block)
 {
-       struct opcode *list, *prev;
+       struct opcode *list;
+       u8 reg_hi, reg_lo;
+       unsigned int i;
+
+       /* The last opcode of the block is not examined */
+       for (i = 0; i < block->nb_ops - 1; i++) {
+               list = &block->opcode_list[i];
 
-       for (list = block->opcode_list, prev = NULL; list;
-            prev = list, list = list->next) {
                if (list->i.op != OP_SPECIAL)
                        continue;
 
                switch (list->r.op) {
                case OP_SPECIAL_MULT:
                case OP_SPECIAL_MULTU:
+               case OP_SPECIAL_DIV:
+               case OP_SPECIAL_DIVU:
                        break;
                default:
                        continue;
                }
 
-               /* Don't support MULT(U) opcodes in delay slots */
-               if (prev && has_delay_slot(prev->c))
+               /* Don't support opcodes in delay slots */
+               if ((i && has_delay_slot(block->opcode_list[i - 1].c)) ||
+                   (list->flags & LIGHTREC_NO_DS))
                        continue;
 
-               if (is_mult32(block, list)) {
-                       pr_debug("Mark MULT(U) opcode at offset 0x%x as"
-                                " 32-bit\n", list->offset << 2);
-                       list->flags |= LIGHTREC_MULT32;
+               reg_lo = get_mfhi_mflo_reg(block, i + 1, NULL, 0, false, true, false);
+               if (reg_lo == 0) {
+                       pr_debug("Mark MULT(U)/DIV(U) opcode at offset 0x%x as"
+                                " not writing LO\n", i << 2);
+                       list->flags |= LIGHTREC_NO_LO;
+               }
+
+               reg_hi = get_mfhi_mflo_reg(block, i + 1, NULL, 0, false, false, false);
+               if (reg_hi == 0) {
+                       pr_debug("Mark MULT(U)/DIV(U) opcode at offset 0x%x as"
+                                " not writing HI\n", i << 2);
+                       list->flags |= LIGHTREC_NO_HI;
+               }
+
+               /* Never drop both results: undo the two flags set above */
+               if (!reg_lo && !reg_hi) {
+                       pr_debug("Both LO/HI unused in this block, they will "
+                                "probably be used in parent block - removing "
+                                "flags.\n");
+                       list->flags &= ~(LIGHTREC_NO_LO | LIGHTREC_NO_HI);
+               }
+
+               /* r.rd records the register reported for LO, or 0 */
+               if (reg_lo > 0 && reg_lo != REG_LO) {
+                       pr_debug("Found register %s to hold LO (rs = %u, rt = %u)\n",
+                                lightrec_reg_name(reg_lo), list->r.rs, list->r.rt);
+
+                       lightrec_replace_lo_hi(block, i + 1, block->nb_ops, true);
+                       list->r.rd = reg_lo;
+               } else {
+                       list->r.rd = 0;
+               }
+
+               /* r.imm records the register reported for HI, or 0 */
+               if (reg_hi > 0 && reg_hi != REG_HI) {
+                       pr_debug("Found register %s to hold HI (rs = %u, rt = %u)\n",
+                                lightrec_reg_name(reg_hi), list->r.rs, list->r.rt);
+
+                       lightrec_replace_lo_hi(block, i + 1, block->nb_ops, false);
+                       list->r.imm = reg_hi;
+               } else {
+                       list->r.imm = 0;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Look for the divisor-check sequence that follows a DIV/DIVU opcode and
+ * overwrite it with NOPs (opcode 0).
+ *
+ * @block:  block whose opcode list is scanned and modified
+ * @offset: index of the first opcode to examine (the one following the
+ *          DIV/DIVU opcode)
+ *
+ * Opcodes are skipped until the first "BNE reg, zero, +8"; the scan gives up
+ * if another DIV/DIVU opcode is met before that.
+ *
+ * Returns true if a sequence was found and removed, false otherwise.
+ */
+static bool remove_div_sequence(struct block *block, unsigned int offset)
+{
+       struct opcode *op;
+       unsigned int i, found = 0;
+
+       /*
+        * Scan for the zero-checking sequence that GCC automatically introduced
+        * after most DIV/DIVU opcodes. This sequence checks the value of the
+        * divisor, and if zero, executes a BREAK opcode, causing the BIOS
+        * handler to crash the PS1.
+        *
+        * For DIV opcodes, this sequence additionally checks that the signed
+        * operation does not overflow.
+        *
+        * With the assumption that the games never crashed the PS1, we can
+        * therefore assume that the games never divided by zero or overflowed,
+        * and these sequences can be removed.
+        */
+
+       for (i = offset; i < block->nb_ops; i++) {
+               op = &block->opcode_list[i];
+
+               if (!found) {
+                       if (op->i.op == OP_SPECIAL &&
+                           (op->r.op == OP_SPECIAL_DIV || op->r.op == OP_SPECIAL_DIVU))
+                               break;
+
+                       /* The 0xfc1fffff mask ignores the 'rs' field only */
+                       if ((op->opcode & 0xfc1fffff) == 0x14000002) {
+                               /* BNE ???, zero, +8 */
+                               found++;
+                       } else {
+                               /* Not there yet: move the sequence start forward */
+                               offset++;
+                       }
+               } else if (found == 1 && !op->opcode) {
+                       /* NOP */
+                       found++;
+               } else if (found == 2 && op->opcode == 0x0007000d) {
+                       /* BREAK 0x1c00 */
+                       found++;
+               } else if (found == 3 && op->opcode == 0x2401ffff) {
+                       /* LI at, -1 */
+                       found++;
+               } else if (found == 4 && (op->opcode & 0xfc1fffff) == 0x14010004) {
+                       /* BNE ???, at, +16 */
+                       found++;
+               } else if (found == 5 && op->opcode == 0x3c018000) {
+                       /* LUI at, 0x8000 */
+                       found++;
+               } else if (found == 6 && (op->opcode & 0xfc1fffff) == 0x14010002) {
+                       /* BNE ???, at, +8 */
+                       found++;
+               } else if (found == 7 && !op->opcode) {
+                       /* NOP */
+                       found++;
+               } else if (found == 8 && op->opcode == 0x0006000d) {
+                       /* BREAK 0x1800 */
+                       found++;
+                       break;
+               } else {
+                       break;
+               }
+       }
+
+       if (found >= 3) {
+               /* An incomplete overflow check is left alone: only the three
+                * opcodes of the zero check are removed in that case */
+               if (found != 9)
+                       found = 3;
+
+               pr_debug("Removing DIV%s sequence at offset 0x%x\n",
+                        found == 9 ? "" : "U", offset << 2);
+
+               for (i = 0; i < found; i++)
+                       block->opcode_list[offset + i].opcode = 0;
+
+               return true;
+       }
+
+       return false;
+}
+
+/*
+ * Optimization pass: for every DIV/DIVU opcode of the block, try to remove
+ * the check sequence that follows it, and flag the opcode with
+ * LIGHTREC_NO_DIV_CHECK when a sequence was removed. Always returns 0.
+ */
+static int lightrec_remove_div_by_zero_check_sequence(struct lightrec_state *state,
+                                                     struct block *block)
+{
+       struct opcode *cur;
+       unsigned int idx;
+
+       for (idx = 0; idx < block->nb_ops; idx++) {
+               cur = &block->opcode_list[idx];
+
+               if (cur->i.op != OP_SPECIAL)
+                       continue;
+
+               if (cur->r.op != OP_SPECIAL_DIVU && cur->r.op != OP_SPECIAL_DIV)
+                       continue;
+
+               /* The sequence, if any, starts after the DIV(U) opcode */
+               if (remove_div_sequence(block, idx + 1))
+                       cur->flags |= LIGHTREC_NO_DIV_CHECK;
+       }
+
+       return 0;
+}
+
+/*
+ * Opcodes of the memset routine recognised by lightrec_replace_memset(),
+ * which compares the start of a block against this table word by word.
+ */
+static const u32 memset_code[] = {
+       0x10a00006,     // beqz         a1, 2f
+       0x24a2ffff,     // addiu        v0,a1,-1
+       0x2403ffff,     // li           v1,-1
+       0xac800000,     // 1: sw        zero,0(a0)
+       0x2442ffff,     // addiu        v0,v0,-1
+       0x1443fffd,     // bne          v0,v1, 1b
+       0x24840004,     // addiu        a0,a0,4
+       0x03e00008,     // 2: jr        ra
+       0x00000000,     // nop
+};
+
+/*
+ * Optimization pass: detect a block whose opcodes start with the exact
+ * memset_code sequence, and flag it BLOCK_IS_MEMSET | BLOCK_NEVER_COMPILE.
+ *
+ * Returns 1 when the block matched, which makes lightrec_optimize() stop
+ * running the remaining optimizers; returns 0 otherwise.
+ * 'state' is not used here.
+ */
+static int lightrec_replace_memset(struct lightrec_state *state, struct block *block)
+{
+       unsigned int i;
+       union code c;
+
+       for (i = 0; i < block->nb_ops; i++) {
+               c = block->opcode_list[i].c;
+
+               /* i cannot run past the end of memset_code: the loop returns
+                * as soon as the last entry of the table has been compared */
+               if (c.opcode != memset_code[i])
+                       return 0;
+
+               if (i == ARRAY_SIZE(memset_code) - 1) {
+                       /* success! */
+                       pr_debug("Block at PC 0x%x is a memset\n", block->pc);
+                       block->flags |= BLOCK_IS_MEMSET | BLOCK_NEVER_COMPILE;
+
+                       /* Return non-zero to skip other optimizers. */
+                       return 1;
                }
        }
 
        return 0;
 }
 
-static int (*lightrec_optimizers[])(struct block *) = {
-       &lightrec_detect_impossible_branches,
-       &lightrec_transform_ops,
-       &lightrec_local_branches,
-       &lightrec_switch_delay_slots,
-       &lightrec_flag_stores,
-       &lightrec_flag_mults,
-       &lightrec_early_unload,
+/*
+ * Optimization passes, run by lightrec_optimize() in array order until one
+ * of them returns non-zero. lightrec_optimize() skips NULL entries.
+ * NOTE(review): presumably IF_OPT() yields NULL when its option is disabled -
+ * confirm against the macro definition.
+ */
+static int (*lightrec_optimizers[])(struct lightrec_state *state, struct block *) = {
+       IF_OPT(OPT_REMOVE_DIV_BY_ZERO_SEQ, &lightrec_remove_div_by_zero_check_sequence),
+       IF_OPT(OPT_REPLACE_MEMSET, &lightrec_replace_memset),
+       IF_OPT(OPT_DETECT_IMPOSSIBLE_BRANCHES, &lightrec_detect_impossible_branches),
+       IF_OPT(OPT_LOCAL_BRANCHES, &lightrec_local_branches),
+       IF_OPT(OPT_TRANSFORM_OPS, &lightrec_transform_ops),
+       IF_OPT(OPT_SWITCH_DELAY_SLOTS, &lightrec_switch_delay_slots),
+       IF_OPT(OPT_FLAG_IO || OPT_FLAG_STORES, &lightrec_flag_io),
+       IF_OPT(OPT_FLAG_MULT_DIV, &lightrec_flag_mults_divs),
+       IF_OPT(OPT_EARLY_UNLOAD, &lightrec_early_unload),
 };
 
-int lightrec_optimize(struct block *block)
+int lightrec_optimize(struct lightrec_state *state, struct block *block)
 {
        unsigned int i;
+       int ret;
 
        for (i = 0; i < ARRAY_SIZE(lightrec_optimizers); i++) {
-               int ret = lightrec_optimizers[i](block);
-
-               if (ret)
-                       return ret;
+               if (lightrec_optimizers[i]) {
+                       ret = (*lightrec_optimizers[i])(state, block);
+                       if (ret)
+                               return ret;
+               }
        }
 
        return 0;
index 84a8fc9..c829028 100644 (file)
@@ -1,15 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
 /*
- * Copyright (C) 2014-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2014-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #ifndef __OPTIMIZER_H__
 #include "disassembler.h"
 
 struct block;
+struct opcode;
 
 _Bool opcode_reads_register(union code op, u8 reg);
 _Bool opcode_writes_register(union code op, u8 reg);
 _Bool has_delay_slot(union code op);
 _Bool load_in_delay_slot(union code op);
+_Bool opcode_is_io(union code op);
+_Bool is_unconditional_jump(union code c);
+_Bool is_syscall(union code c);
 
-int lightrec_optimize(struct block *block);
+_Bool should_emulate(const struct opcode *op);
+
+int lightrec_optimize(struct lightrec_state *state, struct block *block);
 
 #endif /* __OPTIMIZER_H__ */
index 377685c..2e32cae 100644 (file)
@@ -1,15 +1,6 @@
+// SPDX-License-Identifier: LGPL-2.1-or-later
 /*
- * Copyright (C) 2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2020-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #include "blockcache.h"
@@ -21,6 +12,7 @@
 
 #include <errno.h>
 #include <pthread.h>
+#include <stdatomic.h>
 #include <stdbool.h>
 
 struct reaper_elm {
@@ -33,6 +25,8 @@ struct reaper {
        struct lightrec_state *state;
        pthread_mutex_t mutex;
        struct slist_elm reap_list;
+
+       atomic_uint sem;
 };
 
 struct reaper *lightrec_reaper_init(struct lightrec_state *state)
@@ -47,6 +41,7 @@ struct reaper *lightrec_reaper_init(struct lightrec_state *state)
        }
 
        reaper->state = state;
+       reaper->sem = 0;
        slist_init(&reaper->reap_list);
 
        ret = pthread_mutex_init(&reaper->mutex, NULL);
@@ -98,6 +93,11 @@ out_unlock:
        return ret;
 }
 
+/* Reaping is allowed only while the pause counter of the reaper is zero. */
+static bool lightrec_reaper_can_reap(struct reaper *reaper)
+{
+       unsigned int paused;
+
+       paused = atomic_load_explicit(&reaper->sem, memory_order_relaxed);
+
+       return paused == 0;
+}
+
 void lightrec_reaper_reap(struct reaper *reaper)
 {
        struct reaper_elm *reaper_elm;
@@ -105,13 +105,14 @@ void lightrec_reaper_reap(struct reaper *reaper)
 
        pthread_mutex_lock(&reaper->mutex);
 
-       while (!!(elm = slist_first(&reaper->reap_list))) {
+       while (lightrec_reaper_can_reap(reaper) &&
+              !!(elm = slist_first(&reaper->reap_list))) {
                slist_remove(&reaper->reap_list, elm);
                pthread_mutex_unlock(&reaper->mutex);
 
                reaper_elm = container_of(elm, struct reaper_elm, slist);
 
-               (*reaper_elm->func)(reaper_elm->data);
+               (*reaper_elm->func)(reaper->state, reaper_elm->data);
 
                lightrec_free(reaper->state, MEM_FOR_LIGHTREC,
                              sizeof(*reaper_elm), reaper_elm);
@@ -121,3 +122,13 @@ void lightrec_reaper_reap(struct reaper *reaper)
 
        pthread_mutex_unlock(&reaper->mutex);
 }
+
+/*
+ * Increment the pause counter of the reaper. While the counter is non-zero,
+ * lightrec_reaper_reap() does not process any element.
+ */
+void lightrec_reaper_pause(struct reaper *reaper)
+{
+       (void)atomic_fetch_add_explicit(&reaper->sem, 1, memory_order_relaxed);
+}
+
+/*
+ * Decrement the pause counter of the reaper; meant to balance a previous
+ * call to lightrec_reaper_pause().
+ */
+void lightrec_reaper_continue(struct reaper *reaper)
+{
+       (void)atomic_fetch_sub_explicit(&reaper->sem, 1, memory_order_relaxed);
+}
index 0309b64..49b6a1a 100644 (file)
@@ -1,15 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
 /*
- * Copyright (C) 2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2020-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #ifndef __LIGHTREC_REAPER_H__
@@ -18,7 +9,7 @@
 struct lightrec_state;
 struct reaper;
 
-typedef void (*reap_func_t)(void *);
+typedef void (*reap_func_t)(struct lightrec_state *state, void *);
 
 struct reaper *lightrec_reaper_init(struct lightrec_state *state);
 void lightrec_reaper_destroy(struct reaper *reaper);
@@ -26,4 +17,7 @@ void lightrec_reaper_destroy(struct reaper *reaper);
 int lightrec_reaper_add(struct reaper *reaper, reap_func_t f, void *data);
 void lightrec_reaper_reap(struct reaper *reaper);
 
+void lightrec_reaper_pause(struct reaper *reaper);
+void lightrec_reaper_continue(struct reaper *reaper);
+
 #endif /* __LIGHTREC_REAPER_H__ */
index 634d3d0..a6c8cd1 100644 (file)
@@ -1,15 +1,6 @@
+// SPDX-License-Identifier: LGPL-2.1-or-later
 /*
- * Copyright (C) 2019-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2019-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #include "debug.h"
 #include <stdbool.h>
 #include <stdlib.h>
 #include <pthread.h>
+#ifdef __linux__
+#include <unistd.h>
+#endif
 
 struct block_rec {
        struct block *block;
        struct slist_elm slist;
+       /* Initialised to false when the entry is queued; set to true by
+        * lightrec_compile_list() once a worker has picked the entry */
+       bool compiling;
+};
+
+/* Per-worker data; one entry of the thds[] array of struct recompiler. */
+struct recompiler_thd {
+       /* Passed to lightrec_compile_block() by this worker */
+       struct lightrec_cstate *cstate;
+       /* Index of this entry within thds[] */
+       unsigned int tid;
+       pthread_t thd;
 };
 
 struct recompiler {
        struct lightrec_state *state;
-       pthread_t thd;
+       /* Broadcast by lightrec_free_recompiler() after 'stop' has been set */
        pthread_cond_t cond;
+       /* Signalled by a worker each time it has removed a block from the
+        * list; lightrec_recompiler_remove() waits on it */
+       pthread_cond_t cond2;
        pthread_mutex_t mutex;
        bool stop;
-       struct block *current_block;
        struct slist_elm slist;
+
+       /* Number of entries in thds[] */
+       unsigned int nb_recs;
+       struct recompiler_thd thds[];
 };
 
-static void lightrec_compile_list(struct recompiler *rec)
+/*
+ * Return the number of processors reported by the platform.
+ *
+ * Falls back to 1 when the platform is not handled below, or when the
+ * platform call fails or reports a value lower than 1.
+ */
+static unsigned int get_processors_count(void)
+{
+       /* Signed on purpose, so that an error value such as the -1 returned
+        * by sysconf() is clamped to 1 below instead of wrapping around.
+        * Defaults to 1 for platforms handled by none of the branches. */
+       int nb = 1;
+
+#if defined(PTW32_VERSION)
+       nb = pthread_num_processors_np();
+#elif defined(__APPLE__) || defined(__FreeBSD__)
+       int count;
+       size_t size = sizeof(count);
+
+       nb = sysctlbyname("hw.ncpu", &count, &size, NULL, 0) ? 1 : count;
+#elif defined(__linux__)
+       nb = sysconf(_SC_NPROCESSORS_ONLN);
+#endif
+
+       return nb < 1 ? 1 : nb;
+}
+
+/*
+ * Return the first element of the list whose block_rec is not being
+ * compiled yet, or NULL if there is none.
+ */
+static struct slist_elm * lightrec_get_first_elm(struct slist_elm *head)
+{
+       struct slist_elm *cur = slist_first(head);
+
+       while (cur) {
+               if (!container_of(cur, struct block_rec, slist)->compiling)
+                       break;
+
+               cur = cur->next;
+       }
+
+       return cur;
+}
+
+static void lightrec_compile_list(struct recompiler *rec,
+                                 struct recompiler_thd *thd)
 {
        struct block_rec *block_rec;
        struct slist_elm *next;
        struct block *block;
        int ret;
 
-       while (!!(next = slist_first(&rec->slist))) {
+       while (!!(next = lightrec_get_first_elm(&rec->slist))) {
                block_rec = container_of(next, struct block_rec, slist);
+               block_rec->compiling = true;
                block = block_rec->block;
-               rec->current_block = block;
 
                pthread_mutex_unlock(&rec->mutex);
 
-               ret = lightrec_compile_block(block);
-               if (ret) {
-                       pr_err("Unable to compile block at PC 0x%x: %d\n",
-                              block->pc, ret);
+               if (likely(!(block->flags & BLOCK_IS_DEAD))) {
+                       ret = lightrec_compile_block(thd->cstate, block);
+                       if (ret) {
+                               pr_err("Unable to compile block at PC 0x%x: %d\n",
+                                      block->pc, ret);
+                       }
                }
 
                pthread_mutex_lock(&rec->mutex);
@@ -64,15 +103,14 @@ static void lightrec_compile_list(struct recompiler *rec)
                slist_remove(&rec->slist, next);
                lightrec_free(rec->state, MEM_FOR_LIGHTREC,
                              sizeof(*block_rec), block_rec);
-               pthread_cond_signal(&rec->cond);
+               pthread_cond_signal(&rec->cond2);
        }
-
-       rec->current_block = NULL;
 }
 
 static void * lightrec_recompiler_thd(void *d)
 {
-       struct recompiler *rec = d;
+       struct recompiler_thd *thd = d;
+       struct recompiler *rec = container_of(thd, struct recompiler, thds[thd->tid]);
 
        pthread_mutex_lock(&rec->mutex);
 
@@ -85,7 +123,7 @@ static void * lightrec_recompiler_thd(void *d)
 
                } while (slist_empty(&rec->slist));
 
-               lightrec_compile_list(rec);
+               lightrec_compile_list(rec, thd);
        }
 
 out_unlock:
@@ -96,60 +134,104 @@ out_unlock:
 struct recompiler *lightrec_recompiler_init(struct lightrec_state *state)
 {
        struct recompiler *rec;
+       unsigned int i, nb_recs, nb_cpus;
        int ret;
 
-       rec = lightrec_malloc(state, MEM_FOR_LIGHTREC, sizeof(*rec));
+       nb_cpus = get_processors_count();
+       nb_recs = nb_cpus < 2 ? 1 : nb_cpus - 1;
+
+       rec = lightrec_malloc(state, MEM_FOR_LIGHTREC, sizeof(*rec)
+                             + nb_recs * sizeof(*rec->thds));
        if (!rec) {
                pr_err("Cannot create recompiler: Out of memory\n");
                return NULL;
        }
 
+       for (i = 0; i < nb_recs; i++) {
+               rec->thds[i].tid = i;
+               rec->thds[i].cstate = NULL;
+       }
+
+       for (i = 0; i < nb_recs; i++) {
+               rec->thds[i].cstate = lightrec_create_cstate(state);
+               if (!rec->state) {
+                       pr_err("Cannot create recompiler: Out of memory\n");
+                       goto err_free_cstates;
+               }
+       }
+
        rec->state = state;
        rec->stop = false;
-       rec->current_block = NULL;
+       rec->nb_recs = nb_recs;
        slist_init(&rec->slist);
 
        ret = pthread_cond_init(&rec->cond, NULL);
        if (ret) {
                pr_err("Cannot init cond variable: %d\n", ret);
-               goto err_free_rec;
+               goto err_free_cstates;
        }
 
-       ret = pthread_mutex_init(&rec->mutex, NULL);
+       ret = pthread_cond_init(&rec->cond2, NULL);
        if (ret) {
-               pr_err("Cannot init mutex variable: %d\n", ret);
+               pr_err("Cannot init cond variable: %d\n", ret);
                goto err_cnd_destroy;
        }
 
-       ret = pthread_create(&rec->thd, NULL, lightrec_recompiler_thd, rec);
+       ret = pthread_mutex_init(&rec->mutex, NULL);
        if (ret) {
-               pr_err("Cannot create recompiler thread: %d\n", ret);
-               goto err_mtx_destroy;
+               pr_err("Cannot init mutex variable: %d\n", ret);
+               goto err_cnd2_destroy;
        }
 
+       for (i = 0; i < nb_recs; i++) {
+               ret = pthread_create(&rec->thds[i].thd, NULL,
+                                    lightrec_recompiler_thd, &rec->thds[i]);
+               if (ret) {
+                       pr_err("Cannot create recompiler thread: %d\n", ret);
+                       /* TODO: Handle cleanup properly */
+                       goto err_mtx_destroy;
+               }
+       }
+
+       pr_info("Threaded recompiler started with %u workers.\n", nb_recs);
+
        return rec;
 
 err_mtx_destroy:
        pthread_mutex_destroy(&rec->mutex);
+err_cnd2_destroy:
+       pthread_cond_destroy(&rec->cond2);
 err_cnd_destroy:
        pthread_cond_destroy(&rec->cond);
-err_free_rec:
+err_free_cstates:
+       for (i = 0; i < nb_recs; i++) {
+               if (rec->thds[i].cstate)
+                       lightrec_free_cstate(rec->thds[i].cstate);
+       }
        lightrec_free(state, MEM_FOR_LIGHTREC, sizeof(*rec), rec);
        return NULL;
 }
 
 void lightrec_free_recompiler(struct recompiler *rec)
 {
+       unsigned int i;
+
        rec->stop = true;
 
        /* Stop the thread */
        pthread_mutex_lock(&rec->mutex);
-       pthread_cond_signal(&rec->cond);
+       pthread_cond_broadcast(&rec->cond);
        pthread_mutex_unlock(&rec->mutex);
-       pthread_join(rec->thd, NULL);
+
+       for (i = 0; i < rec->nb_recs; i++)
+               pthread_join(rec->thds[i].thd, NULL);
+
+       for (i = 0; i < rec->nb_recs; i++)
+               lightrec_free_cstate(rec->thds[i].cstate);
 
        pthread_mutex_destroy(&rec->mutex);
        pthread_cond_destroy(&rec->cond);
+       pthread_cond_destroy(&rec->cond2);
        lightrec_free(rec->state, MEM_FOR_LIGHTREC, sizeof(*rec), rec);
 }
 
@@ -174,7 +256,8 @@ int lightrec_recompiler_add(struct recompiler *rec, struct block *block)
                        /* The block to compile is already in the queue - bump
                         * it to the top of the list, unless the block is being
                         * recompiled. */
-                       if (prev && !(block->flags & BLOCK_SHOULD_RECOMPILE)) {
+                       if (prev && !block_rec->compiling &&
+                           !(block->flags & BLOCK_SHOULD_RECOMPILE)) {
                                slist_remove_next(prev);
                                slist_append(&rec->slist, elm);
                        }
@@ -198,6 +281,7 @@ int lightrec_recompiler_add(struct recompiler *rec, struct block *block)
        pr_debug("Adding block PC 0x%x to recompiler\n", block->pc);
 
        block_rec->block = block;
+       block_rec->compiling = false;
 
        elm = &rec->slist;
 
@@ -213,6 +297,7 @@ int lightrec_recompiler_add(struct recompiler *rec, struct block *block)
 
 out_unlock:
        pthread_mutex_unlock(&rec->mutex);
+
        return ret;
 }
 
@@ -223,36 +308,59 @@ void lightrec_recompiler_remove(struct recompiler *rec, struct block *block)
 
        pthread_mutex_lock(&rec->mutex);
 
-       for (elm = slist_first(&rec->slist); elm; elm = elm->next) {
-               block_rec = container_of(elm, struct block_rec, slist);
+       while (true) {
+               for (elm = slist_first(&rec->slist); elm; elm = elm->next) {
+                       block_rec = container_of(elm, struct block_rec, slist);
 
-               if (block_rec->block == block) {
-                       if (block == rec->current_block) {
+                       if (block_rec->block != block)
+                               continue;
+
+                       if (block_rec->compiling) {
                                /* Block is being recompiled - wait for
                                 * completion */
-                               do {
-                                       pthread_cond_wait(&rec->cond,
-                                                         &rec->mutex);
-                               } while (block == rec->current_block);
+                               pthread_cond_wait(&rec->cond2, &rec->mutex);
+
+                               /* We can't guarantee the signal was for us.
+                                * Since block_rec may have been removed while
+                                * we were waiting on the condition, we cannot
+                                * check block_rec->compiling again. The best
+                                * thing is just to restart the function. */
+                               break;
                        } else {
                                /* Block is not yet being processed - remove it
                                 * from the list */
                                slist_remove(&rec->slist, elm);
                                lightrec_free(rec->state, MEM_FOR_LIGHTREC,
                                              sizeof(*block_rec), block_rec);
+
+                               goto out_unlock;
                        }
+               }
 
+               if (!elm)
                        break;
-               }
        }
 
+out_unlock:
        pthread_mutex_unlock(&rec->mutex);
 }
 
-void * lightrec_recompiler_run_first_pass(struct block *block, u32 *pc)
+void * lightrec_recompiler_run_first_pass(struct lightrec_state *state,
+                                         struct block *block, u32 *pc)
 {
        bool freed;
 
+       /* There's no point in running the first pass if the block will never
+        * be compiled. Let the main loop run the interpreter instead. */
+       if (block->flags & BLOCK_NEVER_COMPILE)
+               return NULL;
+
+       /* If the block is already fully tagged, there is no point in running
+        * the first pass. Request a recompilation of the block, and maybe the
+        * interpreter will run the block in the meantime. */
+       if (block->flags & BLOCK_FULLY_TAGGED)
+               lightrec_recompiler_add(state->rec, block);
+
        if (likely(block->function)) {
                if (block->flags & BLOCK_FULLY_TAGGED) {
                        freed = atomic_flag_test_and_set(&block->op_list_freed);
@@ -263,8 +371,7 @@ void * lightrec_recompiler_run_first_pass(struct block *block, u32 *pc)
 
                                /* The block was already compiled but the opcode list
                                 * didn't get freed yet - do it now */
-                               lightrec_free_opcode_list(block->state,
-                                                         block->opcode_list);
+                               lightrec_free_opcode_list(state, block);
                                block->opcode_list = NULL;
                        }
                }
@@ -277,7 +384,7 @@ void * lightrec_recompiler_run_first_pass(struct block *block, u32 *pc)
        freed = atomic_flag_test_and_set(&block->op_list_freed);
 
        /* Block wasn't compiled yet - run the interpreter */
-       *pc = lightrec_emulate_block(block, *pc);
+       *pc = lightrec_emulate_block(state, block, *pc);
 
        if (!freed)
                atomic_flag_clear(&block->op_list_freed);
@@ -289,7 +396,7 @@ void * lightrec_recompiler_run_first_pass(struct block *block, u32 *pc)
                pr_debug("Block PC 0x%08x is fully tagged"
                         " - free opcode list\n", block->pc);
 
-               lightrec_free_opcode_list(block->state, block->opcode_list);
+               lightrec_free_opcode_list(state, block);
                block->opcode_list = NULL;
        }
 
index 999a49f..9bc522d 100644 (file)
@@ -1,15 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
 /*
- * Copyright (C) 2019-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2019-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #ifndef __LIGHTREC_RECOMPILER_H__
@@ -24,6 +15,7 @@ void lightrec_free_recompiler(struct recompiler *rec);
 int lightrec_recompiler_add(struct recompiler *rec, struct block *block);
 void lightrec_recompiler_remove(struct recompiler *rec, struct block *block);
 
-void * lightrec_recompiler_run_first_pass(struct block *block, u32 *pc);
+void * lightrec_recompiler_run_first_pass(struct lightrec_state *state,
+                                         struct block *block, u32 *pc);
 
 #endif /* __LIGHTREC_RECOMPILER_H__ */
index 0256015..c018870 100644 (file)
@@ -1,27 +1,19 @@
+// SPDX-License-Identifier: LGPL-2.1-or-later
 /*
- * Copyright (C) 2014-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2014-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #include "debug.h"
 #include "memmanager.h"
+#include "lightning-wrapper.h"
 #include "regcache.h"
 
-#include <lightning.h>
 #include <stdbool.h>
 #include <stddef.h>
 
 struct native_register {
-       bool used, loaded, dirty, output, extend, extended, locked;
+       /* extended / zero_extended are reported by lightrec_get_reg_in_flags()
+        * as REG_EXT / REG_ZEXT; extend / zero_extend are set from the same
+        * flags by lightrec_set_reg_out_flags() */
+       bool used, loaded, dirty, output, extend, extended,
+            zero_extend, zero_extended, locked;
        s8 emulated_register;
 };
 
@@ -48,6 +40,24 @@ const char * lightrec_reg_name(u8 reg)
        return mips_regs[reg];
 }
 
+/*
+ * Tell whether the given JIT register is the _ZERO register. Always false
+ * when not building for MIPS, Alpha or RISC-V.
+ */
+static inline bool lightrec_reg_is_zero(u8 jit_reg)
+{
+#if defined(__mips__) || defined(__alpha__) || defined(__riscv)
+       return jit_reg == _ZERO;
+#else
+       return false;
+#endif
+}
+
+/*
+ * Return _ZERO for emulated register 0 when building for MIPS, Alpha or
+ * RISC-V; return -1 in every other case.
+ */
+static inline s8 lightrec_get_hardwired_reg(u8 reg)
+{
+#if defined(__mips__) || defined(__alpha__) || defined(__riscv)
+       return reg == 0 ? _ZERO : -1;
+#else
+       return -1;
+#endif
+}
+
 static inline u8 lightrec_reg_number(const struct regcache *cache,
                const struct native_register *nreg)
 {
@@ -79,6 +89,34 @@ static inline struct native_register * lightning_reg_to_lightrec(
        }
 }
 
+/*
+ * Return the REG_EXT / REG_ZEXT flags describing the given JIT register:
+ * both flags for the zero register, otherwise one flag per 'extended' /
+ * 'zero_extended' field set in the matching native register.
+ */
+u8 lightrec_get_reg_in_flags(struct regcache *cache, u8 jit_reg)
+{
+       const struct native_register *nreg;
+
+       if (lightrec_reg_is_zero(jit_reg))
+               return REG_EXT | REG_ZEXT;
+
+       nreg = lightning_reg_to_lightrec(cache, jit_reg);
+
+       return (nreg->extended ? REG_EXT : 0) |
+              (nreg->zero_extended ? REG_ZEXT : 0);
+}
+
+/*
+ * Set the 'extend' / 'zero_extend' fields of the native register matching
+ * the given JIT register from the REG_EXT / REG_ZEXT bits of 'flags'.
+ * Does nothing for the zero register.
+ */
+void lightrec_set_reg_out_flags(struct regcache *cache, u8 jit_reg, u8 flags)
+{
+       struct native_register *nreg;
+
+       if (lightrec_reg_is_zero(jit_reg))
+               return;
+
+       nreg = lightning_reg_to_lightrec(cache, jit_reg);
+       nreg->extend = flags & REG_EXT;
+       nreg->zero_extend = flags & REG_ZEXT;
+}
+
 static struct native_register * alloc_temp(struct regcache *cache)
 {
        unsigned int i;
@@ -157,6 +195,7 @@ static struct native_register * alloc_in_out(struct regcache *cache,
 static void lightrec_discard_nreg(struct native_register *nreg)
 {
        nreg->extended = false;
+       nreg->zero_extended = false;
        nreg->loaded = false;
        nreg->output = false;
        nreg->dirty = false;
@@ -170,7 +209,7 @@ static void lightrec_unload_nreg(struct regcache *cache, jit_state_t *_jit,
 {
        /* If we get a dirty register, store back the old value */
        if (nreg->dirty) {
-               s16 offset = offsetof(struct lightrec_state, native_reg_cache)
+               s16 offset = offsetof(struct lightrec_state, regs.gpr)
                        + (nreg->emulated_register << 2);
 
                jit_stxi_i(offset, LIGHTREC_REG_STATE, jit_reg);
@@ -181,6 +220,9 @@ static void lightrec_unload_nreg(struct regcache *cache, jit_state_t *_jit,
 
 void lightrec_unload_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg)
 {
+       if (lightrec_reg_is_zero(jit_reg))
+               return;
+
        lightrec_unload_nreg(cache, _jit,
                        lightning_reg_to_lightrec(cache, jit_reg), jit_reg);
 }
@@ -189,8 +231,12 @@ void lightrec_unload_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg)
  * A locked register cannot only be used as input, not output. */
 void lightrec_lock_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg)
 {
-       struct native_register *reg = lightning_reg_to_lightrec(cache, jit_reg);
+       struct native_register *reg;
+
+       if (lightrec_reg_is_zero(jit_reg))
+               return;
 
+       reg = lightning_reg_to_lightrec(cache, jit_reg);
        lightrec_clean_reg(cache, _jit, jit_reg);
 
        reg->locked = true;
@@ -198,8 +244,12 @@ void lightrec_lock_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg)
 
 u8 lightrec_alloc_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg)
 {
-       struct native_register *reg = lightning_reg_to_lightrec(cache, jit_reg);
+       struct native_register *reg;
 
+       if (lightrec_reg_is_zero(jit_reg))
+               return jit_reg;
+
+       reg = lightning_reg_to_lightrec(cache, jit_reg);
        lightrec_unload_nreg(cache, _jit, reg, jit_reg);
 
        reg->used = true;
@@ -223,10 +273,18 @@ u8 lightrec_alloc_reg_temp(struct regcache *cache, jit_state_t *_jit)
        return jit_reg;
 }
 
-u8 lightrec_alloc_reg_out(struct regcache *cache, jit_state_t *_jit, u8 reg)
+u8 lightrec_alloc_reg_out(struct regcache *cache, jit_state_t *_jit,
+                         u8 reg, u8 flags)
 {
+       struct native_register *nreg;
        u8 jit_reg;
-       struct native_register *nreg = alloc_in_out(cache, reg, true);
+       s8 hw_reg;
+
+       hw_reg = lightrec_get_hardwired_reg(reg);
+       if (hw_reg >= 0)
+               return (u8) hw_reg;
+
+       nreg = alloc_in_out(cache, reg, true);
        if (!nreg) {
                /* No free register, no dirty register to free. */
                pr_err("No more registers! Abandon ship!\n");
@@ -240,18 +298,27 @@ u8 lightrec_alloc_reg_out(struct regcache *cache, jit_state_t *_jit, u8 reg)
        if (nreg->emulated_register != reg)
                lightrec_unload_nreg(cache, _jit, nreg, jit_reg);
 
-       nreg->extend = false;
        nreg->used = true;
        nreg->output = true;
        nreg->emulated_register = reg;
+       nreg->extend = flags & REG_EXT;
+       nreg->zero_extend = flags & REG_ZEXT;
        return jit_reg;
 }
 
-u8 lightrec_alloc_reg_in(struct regcache *cache, jit_state_t *_jit, u8 reg)
+u8 lightrec_alloc_reg_in(struct regcache *cache, jit_state_t *_jit,
+                        u8 reg, u8 flags)
 {
+       struct native_register *nreg;
        u8 jit_reg;
        bool reg_changed;
-       struct native_register *nreg = alloc_in_out(cache, reg, false);
+       s8 hw_reg;
+
+       hw_reg = lightrec_get_hardwired_reg(reg);
+       if (hw_reg >= 0)
+               return (u8) hw_reg;
+
+       nreg = alloc_in_out(cache, reg, false);
        if (!nreg) {
                /* No free register, no dirty register to free. */
                pr_err("No more registers! Abandon ship!\n");
@@ -267,55 +334,44 @@ u8 lightrec_alloc_reg_in(struct regcache *cache, jit_state_t *_jit, u8 reg)
                lightrec_unload_nreg(cache, _jit, nreg, jit_reg);
 
        if (!nreg->loaded && !nreg->dirty && reg != 0) {
-               s16 offset = offsetof(struct lightrec_state, native_reg_cache)
+               s16 offset = offsetof(struct lightrec_state, regs.gpr)
                        + (reg << 2);
 
+               nreg->zero_extended = flags & REG_ZEXT;
+               nreg->extended = !nreg->zero_extended;
+
                /* Load previous value from register cache */
-               jit_ldxi_i(jit_reg, LIGHTREC_REG_STATE, offset);
+               if (nreg->zero_extended)
+                       jit_ldxi_ui(jit_reg, LIGHTREC_REG_STATE, offset);
+               else
+                       jit_ldxi_i(jit_reg, LIGHTREC_REG_STATE, offset);
+
                nreg->loaded = true;
-               nreg->extended = true;
        }
 
        /* Clear register r0 before use */
        if (reg == 0 && (!nreg->loaded || nreg->dirty)) {
                jit_movi(jit_reg, 0);
                nreg->extended = true;
+               nreg->zero_extended = true;
                nreg->loaded = true;
        }
 
        nreg->used = true;
        nreg->output = false;
        nreg->emulated_register = reg;
-       return jit_reg;
-}
-
-u8 lightrec_alloc_reg_out_ext(struct regcache *cache, jit_state_t *_jit, u8 reg)
-{
-       struct native_register *nreg;
-       u8 jit_reg;
-
-       jit_reg = lightrec_alloc_reg_out(cache, _jit, reg);
-       nreg = lightning_reg_to_lightrec(cache, jit_reg);
-
-       nreg->extend = true;
 
-       return jit_reg;
-}
-
-u8 lightrec_alloc_reg_in_ext(struct regcache *cache, jit_state_t *_jit, u8 reg)
-{
-       struct native_register *nreg;
-       u8 jit_reg;
-
-       jit_reg = lightrec_alloc_reg_in(cache, _jit, reg);
-       nreg = lightning_reg_to_lightrec(cache, jit_reg);
-
-#if __WORDSIZE == 64
-       if (!nreg->extended) {
+       if ((flags & REG_EXT) && !nreg->extended &&
+           (!nreg->zero_extended || !(flags & REG_ZEXT))) {
                nreg->extended = true;
+               nreg->zero_extended = false;
                jit_extr_i(jit_reg, jit_reg);
+       } else if (!(flags & REG_EXT) && (flags & REG_ZEXT) &&
+                  !nreg->zero_extended) {
+               nreg->zero_extended = true;
+               nreg->extended = false;
+               jit_extr_ui(jit_reg, jit_reg);
        }
-#endif
 
        return jit_reg;
 }
@@ -337,10 +393,11 @@ u8 lightrec_request_reg_in(struct regcache *cache, jit_state_t *_jit,
        lightrec_unload_nreg(cache, _jit, nreg, jit_reg);
 
        /* Load previous value from register cache */
-       offset = offsetof(struct lightrec_state, native_reg_cache) + (reg << 2);
+       offset = offsetof(struct lightrec_state, regs.gpr) + (reg << 2);
        jit_ldxi_i(jit_reg, LIGHTREC_REG_STATE, offset);
 
        nreg->extended = true;
+       nreg->zero_extended = false;
        nreg->used = true;
        nreg->loaded = true;
        nreg->emulated_register = reg;
@@ -353,14 +410,17 @@ static void free_reg(struct native_register *nreg)
        /* Set output registers as dirty */
        if (nreg->used && nreg->output && nreg->emulated_register > 0)
                nreg->dirty = true;
-       if (nreg->output)
+       if (nreg->output) {
                nreg->extended = nreg->extend;
+               nreg->zero_extended = nreg->zero_extend;
+       }
        nreg->used = false;
 }
 
 void lightrec_free_reg(struct regcache *cache, u8 jit_reg)
 {
-       free_reg(lightning_reg_to_lightrec(cache, jit_reg));
+       if (!lightrec_reg_is_zero(jit_reg))
+               free_reg(lightning_reg_to_lightrec(cache, jit_reg));
 }
 
 void lightrec_free_regs(struct regcache *cache)
@@ -375,7 +435,7 @@ static void clean_reg(jit_state_t *_jit,
                struct native_register *nreg, u8 jit_reg, bool clean)
 {
        if (nreg->dirty) {
-               s16 offset = offsetof(struct lightrec_state, native_reg_cache)
+               s16 offset = offsetof(struct lightrec_state, regs.gpr)
                        + (nreg->emulated_register << 2);
 
                jit_stxi_i(offset, LIGHTREC_REG_STATE, jit_reg);
@@ -408,8 +468,12 @@ void lightrec_clean_regs(struct regcache *cache, jit_state_t *_jit)
 
 void lightrec_clean_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg)
 {
-       struct native_register *reg = lightning_reg_to_lightrec(cache, jit_reg);
-       clean_reg(_jit, reg, jit_reg, true);
+       struct native_register *reg;
+
+       if (!lightrec_reg_is_zero(jit_reg)) {
+               reg = lightning_reg_to_lightrec(cache, jit_reg);
+               clean_reg(_jit, reg, jit_reg, true);
+       }
 }
 
 void lightrec_clean_reg_if_loaded(struct regcache *cache, jit_state_t *_jit,
index 8678cc6..835c9c9 100644 (file)
@@ -1,15 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
 /*
- * Copyright (C) 2014-2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2014-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #ifndef __REGCACHE_H__
@@ -22,8 +13,9 @@
 #define LIGHTREC_REG_STATE (JIT_V(JIT_V_NUM - 1))
 #define LIGHTREC_REG_CYCLE (JIT_V(JIT_V_NUM - 2))
 
-#define REG_LO 32
-#define REG_HI 33
+/* Flags for lightrec_alloc_reg_in / lightrec_alloc_reg_out. */
+#define REG_EXT                BIT(0) /* register is sign-extended */
+#define REG_ZEXT       BIT(1) /* register is zero-extended */
 
 struct register_value {
        _Bool known;
@@ -35,15 +27,17 @@ struct regcache;
 
 u8 lightrec_alloc_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg);
 u8 lightrec_alloc_reg_temp(struct regcache *cache, jit_state_t *_jit);
-u8 lightrec_alloc_reg_out(struct regcache *cache, jit_state_t *_jit, u8 reg);
-u8 lightrec_alloc_reg_in(struct regcache *cache, jit_state_t *_jit, u8 reg);
-u8 lightrec_alloc_reg_out_ext(struct regcache *cache,
-                             jit_state_t *_jit, u8 reg);
-u8 lightrec_alloc_reg_in_ext(struct regcache *cache, jit_state_t *_jit, u8 reg);
+u8 lightrec_alloc_reg_out(struct regcache *cache, jit_state_t *_jit,
+                         u8 reg, u8 flags);
+u8 lightrec_alloc_reg_in(struct regcache *cache, jit_state_t *_jit,
+                        u8 reg, u8 flags);
 
 u8 lightrec_request_reg_in(struct regcache *cache, jit_state_t *_jit,
                           u8 reg, u8 jit_reg);
 
+u8 lightrec_get_reg_in_flags(struct regcache *cache, u8 jit_reg);
+void lightrec_set_reg_out_flags(struct regcache *cache, u8 jit_reg, u8 flags);
+
 void lightrec_regcache_reset(struct regcache *cache);
 
 void lightrec_lock_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg);
index 18195e8..ae7e5d3 100644 (file)
@@ -1,15 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
 /*
- * Copyright (C) 2020 Paul Cercueil <paul@crapouillou.net>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * Copyright (C) 2020-2021 Paul Cercueil <paul@crapouillou.net>
  */
 
 #ifndef __LIGHTREC_SLIST_H__