git subrepo pull --force deps/lightrec
author	Paul Cercueil <paul@crapouillou.net>
Sun, 9 Jul 2023 11:56:01 +0000 (13:56 +0200)
committer	Paul Cercueil <paul@crapouillou.net>
Sun, 9 Jul 2023 11:57:44 +0000 (13:57 +0200)
subrepo:
  subdir:   "deps/lightrec"
  merged:   "fcf239e7e9"
upstream:
  origin:   "https://github.com/pcercuei/lightrec.git"
  branch:   "master"
  commit:   "fcf239e7e9"
git-subrepo:
  version:  "0.4.3"
  origin:   "https://github.com/ingydotnet/git-subrepo.git"
  commit:   "2f68596"

21 files changed:
deps/lightrec/.gitrepo
deps/lightrec/CMakeLists.txt
deps/lightrec/README.md
deps/lightrec/constprop.c
deps/lightrec/constprop.h
deps/lightrec/disassembler.c
deps/lightrec/disassembler.h
deps/lightrec/emitter.c
deps/lightrec/emitter.h
deps/lightrec/interpreter.c
deps/lightrec/interpreter.h
deps/lightrec/lightning-wrapper.h
deps/lightrec/lightrec-config.h.cmakein
deps/lightrec/lightrec-private.h
deps/lightrec/lightrec.c
deps/lightrec/lightrec.h
deps/lightrec/memmanager.c
deps/lightrec/optimizer.c
deps/lightrec/optimizer.h
deps/lightrec/regcache.c
deps/lightrec/regcache.h

index 6e8794f..8a344c4 100644 (file)
@@ -6,7 +6,7 @@
 [subrepo]
        remote = https://github.com/pcercuei/lightrec.git
        branch = master
-       commit = 3ff589bcb7d52b3a091fe0b922ba02a0b1a7f095
-       parent = aced3eb3fcaa0fe13c44c4dd196cdab42555fd98
+       commit = fcf239e7e9d42fedb7a8de64057d6895acf3ceee
+       parent = 03ec8a8c606eb87642be336632e1792ab89650d8
        method = merge
        cmdver = 0.4.3
index 12da14e..9518a9a 100644 (file)
@@ -66,11 +66,11 @@ endif (ENABLE_THREADED_COMPILER)
 option(OPT_REMOVE_DIV_BY_ZERO_SEQ "(optimization) Remove div-by-zero check sequence" ON)
 option(OPT_REPLACE_MEMSET "(optimization) Detect and replace memset with host variant" ON)
 option(OPT_DETECT_IMPOSSIBLE_BRANCHES "(optimization) Detect impossible branches" ON)
+option(OPT_HANDLE_LOAD_DELAYS "(optimization) Detect load delays" ON)
 option(OPT_TRANSFORM_OPS "(optimization) Transform opcodes" ON)
 option(OPT_LOCAL_BRANCHES "(optimization) Detect local branches" ON)
 option(OPT_SWITCH_DELAY_SLOTS "(optimization) Switch delay slots" ON)
-option(OPT_FLAG_STORES "(optimization) Flag stores that don't require invalidation" ON)
-option(OPT_FLAG_IO "(optimization) Flag I/O opcodes whose target is known" ON)
+option(OPT_FLAG_IO "(optimization) Flag I/O opcodes when the target can be detected" ON)
 option(OPT_FLAG_MULT_DIV "(optimization) Flag MULT/DIV that only use one of HI/LO" ON)
 option(OPT_EARLY_UNLOAD "(optimization) Unload registers early" ON)
 
index ab2c13b..449e06c 100644 (file)
@@ -17,8 +17,7 @@ a form of Intermediate Representation (IR).
 Basically, just a single-linked list of structures representing the
 instructions. On that list, several optimization steps are performed:
 instructions are modified, reordered, tagged; new meta-instructions
-can be added, for instance to tell the code generator that a certain
-register won't be used anymore.
+can also be added.
 
 * __Lazy compilation__.
 If Lightrec detects a block of code that would be very hard to
@@ -46,10 +45,12 @@ typically happens when a lot of new code is run.
 
 Lightrec has been ported to the following emulators:
 
-* [__PCSX-ReArmed__ (my own fork)](https://github.com/pcercuei/pcsx_rearmed)
+* [__PCSX-ReArmed__ (libretro)](https://github.com/libretro/pcsx_rearmed)
 
 * [__pcsx4all__ (my own fork)](https://github.com/pcercuei/pcsx4all)
 
 * [__Beetle__ (libretro)](https://github.com/libretro/beetle-psx-libretro/)
 
+* [__CubeSX/WiiSX__](https://github.com/emukidid/pcsxgc/)
+
 [![Star History Chart](https://api.star-history.com/svg?repos=pcercuei/lightrec&type=Date)](https://star-history.com/#pcercuei/lightrec&Date)
index 353f42f..8499c6e 100644 (file)
@@ -243,12 +243,13 @@ static void lightrec_propagate_slt(u32 rs, u32 rd, bool is_signed,
        }
 }
 
-void lightrec_consts_propagate(const struct opcode *list,
+void lightrec_consts_propagate(const struct block *block,
                               unsigned int idx,
                               struct constprop_data *v)
 {
+       const struct opcode *list = block->opcode_list;
        union code c;
-       u32 imm;
+       u32 imm, flags;
 
        if (idx == 0)
                return;
@@ -263,8 +264,13 @@ void lightrec_consts_propagate(const struct opcode *list,
                return;
        }
 
-       if (idx > 1 && !op_flag_sync(list[idx - 1].flags)) {
-               c = list[idx - 2].c;
+       flags = list[idx - 1].flags;
+
+       if (idx > 1 && !op_flag_sync(flags)) {
+               if (op_flag_no_ds(flags))
+                       c = list[idx - 1].c;
+               else
+                       c = list[idx - 2].c;
 
                switch (c.i.op) {
                case OP_BNE:
@@ -449,6 +455,13 @@ void lightrec_consts_propagate(const struct opcode *list,
                        v[c.r.rd].known = 0;
                        v[c.r.rd].sign = 0;
                        break;
+
+               case OP_SPECIAL_JALR:
+                       v[c.r.rd].known = 0xffffffff;
+                       v[c.r.rd].sign = 0;
+                       v[c.r.rd].value = block->pc + (idx + 2 << 2);
+                       break;
+
                default:
                        break;
                }
@@ -644,7 +657,7 @@ void lightrec_consts_propagate(const struct opcode *list,
                                imm = imm ? GENMASK(31, 32 - imm) : 0;
                                v[c.i.rt].sign = 0;
                        }
-                       v[c.i.rt].known &= ~imm;
+                       v[c.i.rt].known &= imm;
                        break;
                }
                fallthrough;
@@ -652,30 +665,48 @@ void lightrec_consts_propagate(const struct opcode *list,
                v[c.i.rt].known = 0;
                v[c.i.rt].sign = 0;
                break;
-       case OP_META_MOV:
-               v[c.r.rd] = v[c.r.rs];
-               break;
-       case OP_META_EXTC:
-               v[c.i.rt].value = (s32)(s8)v[c.i.rs].value;
-               if (v[c.i.rs].known & BIT(7)) {
-                       v[c.i.rt].known = v[c.i.rs].known | 0xffffff00;
-                       v[c.i.rt].sign = 0;
-               } else {
-                       v[c.i.rt].known = v[c.i.rs].known & 0x7f;
-                       v[c.i.rt].sign = 0xffffff80;
-               }
-               break;
+       case OP_META:
+               switch (c.m.op) {
+               case OP_META_MOV:
+                       v[c.m.rd] = v[c.m.rs];
+                       break;
 
-       case OP_META_EXTS:
-               v[c.i.rt].value = (s32)(s16)v[c.i.rs].value;
-               if (v[c.i.rs].known & BIT(15)) {
-                       v[c.i.rt].known = v[c.i.rs].known | 0xffff0000;
-                       v[c.i.rt].sign = 0;
-               } else {
-                       v[c.i.rt].known = v[c.i.rs].known & 0x7fff;
-                       v[c.i.rt].sign = 0xffff8000;
+               case OP_META_EXTC:
+                       v[c.m.rd].value = (s32)(s8)v[c.m.rs].value;
+                       if (v[c.m.rs].known & BIT(7)) {
+                               v[c.m.rd].known = v[c.m.rs].known | 0xffffff00;
+                               v[c.m.rd].sign = 0;
+                       } else {
+                               v[c.m.rd].known = v[c.m.rs].known & 0x7f;
+                               v[c.m.rd].sign = 0xffffff80;
+                       }
+                       break;
+
+               case OP_META_EXTS:
+                       v[c.m.rd].value = (s32)(s16)v[c.m.rs].value;
+                       if (v[c.m.rs].known & BIT(15)) {
+                               v[c.m.rd].known = v[c.m.rs].known | 0xffff0000;
+                               v[c.m.rd].sign = 0;
+                       } else {
+                               v[c.m.rd].known = v[c.m.rs].known & 0x7fff;
+                               v[c.m.rd].sign = 0xffff8000;
+                       }
+                       break;
+
+               case OP_META_COM:
+                       v[c.m.rd].known = v[c.m.rs].known;
+                       v[c.m.rd].value = ~v[c.m.rs].value;
+                       v[c.m.rd].sign = v[c.m.rs].sign;
+                       break;
+               default:
+                       break;
                }
                break;
+       case OP_JAL:
+               v[31].known = 0xffffffff;
+               v[31].sign = 0;
+               v[31].value = block->pc + (idx + 2 << 2);
+               break;
 
        default:
                break;
index cebf0b3..9f9ecc3 100644 (file)
@@ -10,7 +10,7 @@
 
 #define LIGHTREC_CONSTPROP_INITIALIZER { { 0, 0xffffffff, 0 }, }
 
-struct opcode;
+struct block;
 
 struct constprop_data {
        u32 value;
@@ -34,7 +34,7 @@ static inline _Bool is_known_zero(const struct constprop_data *v, u8 reg)
        return bits_are_known_zero(v, reg, 0xffffffff);
 }
 
-void lightrec_consts_propagate(const struct opcode *list,
+void lightrec_consts_propagate(const struct block *block,
                               unsigned int idx,
                               struct constprop_data *v);
 
index bef9594..f687d28 100644 (file)
@@ -120,6 +120,13 @@ static const char * const cp2_opcodes[] = {
        [OP_CP2_NCCT]           = "ncct    ",
 };
 
+static const char * const meta_opcodes[] = {
+       [OP_META_MOV]           = "move    ",
+       [OP_META_EXTC]          = "extc    ",
+       [OP_META_EXTS]          = "exts    ",
+       [OP_META_COM]           = "com     ",
+};
+
 static const char * const mult2_opcodes[] = {
        "mult2   ", "multu2  ",
 };
@@ -133,6 +140,7 @@ static const char * const opcode_io_flags[] = {
        "self-modifying code",
        "no invalidation",
        "no mask",
+       "load delay",
 };
 
 static const char * const opcode_io_modes[] = {
@@ -444,18 +452,11 @@ static int print_op(union code c, u32 pc, char *buf, size_t len,
                                lightrec_reg_name(c.i.rt),
                                (s16)c.i.imm,
                                lightrec_reg_name(c.i.rs));
-       case OP_META_MOV:
-               return snprintf(buf, len, "move    %s,%s",
-                               lightrec_reg_name(c.r.rd),
-                               lightrec_reg_name(c.r.rs));
-       case OP_META_EXTC:
-               return snprintf(buf, len, "extc    %s,%s",
-                               lightrec_reg_name(c.i.rt),
-                               lightrec_reg_name(c.i.rs));
-       case OP_META_EXTS:
-               return snprintf(buf, len, "exts    %s,%s",
-                               lightrec_reg_name(c.i.rt),
-                               lightrec_reg_name(c.i.rs));
+       case OP_META:
+               return snprintf(buf, len, "%s%s,%s",
+                               meta_opcodes[c.m.op],
+                               lightrec_reg_name(c.m.rd),
+                               lightrec_reg_name(c.m.rs));
        case OP_META_MULT2:
        case OP_META_MULTU2:
                *flags_ptr = opcode_multdiv_flags;
index e4685a9..9e39484 100644 (file)
 #define LIGHTREC_SMC           BIT(2)
 #define LIGHTREC_NO_INVALIDATE BIT(3)
 #define LIGHTREC_NO_MASK       BIT(4)
+#define LIGHTREC_LOAD_DELAY    BIT(5)
 
 /* I/O mode for load/store opcodes */
-#define LIGHTREC_IO_MODE_LSB   5
+#define LIGHTREC_IO_MODE_LSB   6
 #define LIGHTREC_IO_MODE(x)    ((x) << LIGHTREC_IO_MODE_LSB)
 #define LIGHTREC_IO_UNKNOWN    0x0
 #define LIGHTREC_IO_DIRECT     0x1
@@ -107,10 +108,7 @@ enum standard_opcodes {
        OP_LWC2                 = 0x32,
        OP_SWC2                 = 0x3a,
 
-       OP_META_MOV             = 0x16,
-
-       OP_META_EXTC            = 0x17,
-       OP_META_EXTS            = 0x18,
+       OP_META                 = 0x3b,
 
        OP_META_MULT2           = 0x19,
        OP_META_MULTU2          = 0x1a,
@@ -195,6 +193,15 @@ enum cp2_basic_opcodes {
        OP_CP2_BASIC_CTC2       = 0x06,
 };
 
+enum meta_opcodes {
+       OP_META_MOV             = 0x00,
+
+       OP_META_EXTC            = 0x01,
+       OP_META_EXTS            = 0x02,
+
+       OP_META_COM             = 0x03,
+};
+
 struct opcode_r {
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        u32 zero :6;
@@ -237,12 +244,31 @@ struct opcode_j {
 #endif
 } __packed;
 
+struct opcode_m {
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+       u32 meta :6;
+       u32 rs   :5;
+       u32 rt   :5;
+       u32 rd   :5;
+       u32 imm  :6;
+       u32 op   :5;
+#else
+       u32 op   :5;
+       u32 imm  :6;
+       u32 rd   :5;
+       u32 rt   :5;
+       u32 rs   :5;
+       u32 meta :6;
+#endif
+};
+
 union code {
        /* Keep in sync with struct opcode */
        u32 opcode;
        struct opcode_r r;
        struct opcode_i i;
        struct opcode_j j;
+       struct opcode_m m;
 };
 
 struct opcode {
@@ -255,6 +281,7 @@ struct opcode {
                struct opcode_r r;
                struct opcode_i i;
                struct opcode_j j;
+               struct opcode_m m;
        };
        u32 flags;
 };
@@ -278,13 +305,12 @@ static inline _Bool op_flag_sync(u32 flags)
 
 static inline _Bool op_flag_smc(u32 flags)
 {
-       return OPT_FLAG_STORES && (flags & LIGHTREC_SMC);
+       return OPT_FLAG_IO && (flags & LIGHTREC_SMC);
 }
 
 static inline _Bool op_flag_no_invalidate(u32 flags)
 {
-       return (OPT_FLAG_IO || OPT_FLAG_STORES) &&
-               (flags & LIGHTREC_NO_INVALIDATE);
+       return OPT_FLAG_IO && (flags & LIGHTREC_NO_INVALIDATE);
 }
 
 static inline _Bool op_flag_no_mask(u32 flags)
@@ -292,6 +318,11 @@ static inline _Bool op_flag_no_mask(u32 flags)
        return OPT_FLAG_IO && (flags & LIGHTREC_NO_MASK);
 }
 
+static inline _Bool op_flag_load_delay(u32 flags)
+{
+       return OPT_HANDLE_LOAD_DELAYS && (flags & LIGHTREC_LOAD_DELAY);
+}
+
 static inline _Bool op_flag_emulate_branch(u32 flags)
 {
        return OPT_DETECT_IMPOSSIBLE_BRANCHES &&
index 14820e5..a6d4355 100644 (file)
@@ -21,6 +21,7 @@ static void rec_SPECIAL(struct lightrec_cstate *state, const struct block *block
 static void rec_REGIMM(struct lightrec_cstate *state, const struct block *block, u16 offset);
 static void rec_CP0(struct lightrec_cstate *state, const struct block *block, u16 offset);
 static void rec_CP2(struct lightrec_cstate *state, const struct block *block, u16 offset);
+static void rec_META(struct lightrec_cstate *state, const struct block *block, u16 offset);
 static void rec_cp2_do_mtc2(struct lightrec_cstate *state,
                            const struct block *block, u16 offset, u8 reg, u8 in_reg);
 static void rec_cp2_do_mfc2(struct lightrec_cstate *state,
@@ -35,12 +36,24 @@ static void unknown_opcode(struct lightrec_cstate *state, const struct block *bl
 }
 
 static void
-lightrec_jump_to_eob(struct lightrec_cstate *state, jit_state_t *_jit)
+lightrec_jump_to_fn(jit_state_t *_jit, void (*fn)(void))
 {
        /* Prevent jit_jmpi() from using our cycles register as a temporary */
        jit_live(LIGHTREC_REG_CYCLE);
 
-       jit_patch_abs(jit_jmpi(), state->state->eob_wrapper_func);
+       jit_patch_abs(jit_jmpi(), fn);
+}
+
+static void
+lightrec_jump_to_eob(struct lightrec_cstate *state, jit_state_t *_jit)
+{
+       lightrec_jump_to_fn(_jit, state->state->eob_wrapper_func);
+}
+
+static void
+lightrec_jump_to_ds_check(struct lightrec_cstate *state, jit_state_t *_jit)
+{
+       lightrec_jump_to_fn(_jit, state->state->ds_check_func);
 }
 
 static void update_ra_register(struct regcache *reg_cache, jit_state_t *_jit,
@@ -61,7 +74,7 @@ static void lightrec_emit_end_of_block(struct lightrec_cstate *state,
        struct regcache *reg_cache = state->reg_cache;
        jit_state_t *_jit = block->_jit;
        const struct opcode *op = &block->opcode_list[offset],
-                           *next = &block->opcode_list[offset + 1];
+                           *ds = get_delay_slot(block->opcode_list, offset);
        u32 cycles = state->cycles + lightrec_cycles_of_opcode(op->c);
 
        jit_note(__FILE__, __LINE__);
@@ -83,10 +96,10 @@ static void lightrec_emit_end_of_block(struct lightrec_cstate *state,
 
        if (has_delay_slot(op->c) &&
            !op_flag_no_ds(op->flags) && !op_flag_local_branch(op->flags)) {
-               cycles += lightrec_cycles_of_opcode(next->c);
+               cycles += lightrec_cycles_of_opcode(ds->c);
 
                /* Recompile the delay slot */
-               if (next->c.opcode)
+               if (ds->c.opcode)
                        lightrec_rec_opcode(state, block, offset + 1);
        }
 
@@ -98,11 +111,41 @@ static void lightrec_emit_end_of_block(struct lightrec_cstate *state,
                pr_debug("EOB: %u cycles\n", cycles);
        }
 
-       lightrec_jump_to_eob(state, _jit);
+       if (op_flag_load_delay(ds->flags)
+           && opcode_is_load(ds->c) && !state->no_load_delay) {
+               /* If the delay slot is a load opcode, its target register
+                * will be written after the first opcode of the target is
+                * executed. Handle this by jumping to a special section of
+                * the dispatcher. It expects the loaded value to be in
+                * REG_TEMP, and the target register number to be in JIT_V1.*/
+               jit_movi(JIT_V1, ds->c.i.rt);
+
+               lightrec_jump_to_ds_check(state, _jit);
+       } else {
+               lightrec_jump_to_eob(state, _jit);
+       }
 }
 
-void lightrec_emit_eob(struct lightrec_cstate *state,
-                      const struct block *block, u16 offset)
+void lightrec_emit_jump_to_interpreter(struct lightrec_cstate *state,
+                                      const struct block *block, u16 offset)
+{
+       struct regcache *reg_cache = state->reg_cache;
+       jit_state_t *_jit = block->_jit;
+
+       lightrec_clean_regs(reg_cache, _jit);
+
+       /* Call the interpreter with the block's address in JIT_V1 and the
+        * PC (which might have an offset) in JIT_V0. */
+       lightrec_load_imm(reg_cache, _jit, JIT_V0, block->pc,
+                         block->pc + (offset << 2));
+       jit_movi(JIT_V1, (uintptr_t)block);
+
+       jit_subi(LIGHTREC_REG_CYCLE, LIGHTREC_REG_CYCLE, state->cycles);
+       lightrec_jump_to_fn(_jit, state->state->interpreter_func);
+}
+
+static void lightrec_emit_eob(struct lightrec_cstate *state,
+                             const struct block *block, u16 offset)
 {
        struct regcache *reg_cache = state->reg_cache;
        jit_state_t *_jit = block->_jit;
@@ -198,9 +241,9 @@ static void rec_b(struct lightrec_cstate *state, const struct block *block, u16
        jit_state_t *_jit = block->_jit;
        struct lightrec_branch *branch;
        const struct opcode *op = &block->opcode_list[offset],
-                           *next = &block->opcode_list[offset + 1];
+                           *ds = get_delay_slot(block->opcode_list, offset);
        jit_node_t *addr;
-       bool is_forward = (s16)op->i.imm >= -1;
+       bool is_forward = (s16)op->i.imm >= 0;
        int op_cycles = lightrec_cycles_of_opcode(op->c);
        u32 target_offset, cycles = state->cycles + op_cycles;
        bool no_indirection = false;
@@ -210,7 +253,7 @@ static void rec_b(struct lightrec_cstate *state, const struct block *block, u16
        jit_note(__FILE__, __LINE__);
 
        if (!op_flag_no_ds(op->flags))
-               cycles += lightrec_cycles_of_opcode(next->c);
+               cycles += lightrec_cycles_of_opcode(ds->c);
 
        state->cycles = -op_cycles;
 
@@ -224,7 +267,7 @@ static void rec_b(struct lightrec_cstate *state, const struct block *block, u16
                        lightrec_do_early_unload(state, block, offset);
 
                if (op_flag_local_branch(op->flags) &&
-                   (op_flag_no_ds(op->flags) || !next->opcode) &&
+                   (op_flag_no_ds(op->flags) || !ds->opcode) &&
                    is_forward && !lightrec_has_dirty_regs(reg_cache))
                        no_indirection = true;
 
@@ -246,8 +289,11 @@ static void rec_b(struct lightrec_cstate *state, const struct block *block, u16
 
        if (op_flag_local_branch(op->flags)) {
                /* Recompile the delay slot */
-               if (!op_flag_no_ds(op->flags) && next->opcode)
+               if (!op_flag_no_ds(op->flags) && ds->opcode) {
+                       /* Never handle load delays with local branches. */
+                       state->no_load_delay = true;
                        lightrec_rec_opcode(state, block, offset + 1);
+               }
 
                if (link)
                        update_ra_register(reg_cache, _jit, 31, block->pc, link);
@@ -274,6 +320,7 @@ static void rec_b(struct lightrec_cstate *state, const struct block *block, u16
 
        if (!op_flag_local_branch(op->flags) || !is_forward) {
                next_pc = get_branch_pc(block, offset, 1 + (s16)op->i.imm);
+               state->no_load_delay = op_flag_local_branch(op->flags);
                lightrec_emit_end_of_block(state, block, offset, -1, next_pc,
                                           31, link, false);
        }
@@ -287,8 +334,10 @@ static void rec_b(struct lightrec_cstate *state, const struct block *block, u16
                if (bz && link)
                        update_ra_register(reg_cache, _jit, 31, block->pc, link);
 
-               if (!op_flag_no_ds(op->flags) && next->opcode)
+               if (!op_flag_no_ds(op->flags) && ds->opcode) {
+                       state->no_load_delay = true;
                        lightrec_rec_opcode(state, block, offset + 1);
+               }
        }
 }
 
@@ -1090,6 +1139,7 @@ static void rec_io(struct lightrec_cstate *state,
        u32 flags = block->opcode_list[offset].flags;
        bool is_tagged = LIGHTREC_FLAGS_GET_IO_MODE(flags);
        u32 lut_entry;
+       u8 zero;
 
        jit_note(__FILE__, __LINE__);
 
@@ -1100,6 +1150,16 @@ static void rec_io(struct lightrec_cstate *state,
        else if (load_rt)
                lightrec_clean_reg_if_loaded(reg_cache, _jit, c.i.rt, false);
 
+       if (op_flag_load_delay(flags) && !state->no_load_delay) {
+               /* Clear state->in_delay_slot_n. This notifies the lightrec_rw
+                * wrapper that it should write the REG_TEMP register instead of
+                * the actual output register of the opcode. */
+               zero = lightrec_alloc_reg_in(reg_cache, _jit, 0, 0);
+               jit_stxi_c(offsetof(struct lightrec_state, in_delay_slot_n),
+                           LIGHTREC_REG_STATE, zero);
+               lightrec_free_reg(reg_cache, zero);
+       }
+
        if (is_tagged) {
                call_to_c_wrapper(state, block, c.opcode, C_WRAPPER_RW);
        } else {
@@ -1143,7 +1203,7 @@ static void rec_store_memory(struct lightrec_cstate *cstate,
                ((imm & 0x3) || simm + lut_offt != (s16)(simm + lut_offt))));
        bool need_tmp = !no_mask || addr_offset || add_imm || invalidate;
        bool swc2 = c.i.op == OP_SWC2;
-       u8 in_reg = swc2 ? REG_CP2_TEMP : c.i.rt;
+       u8 in_reg = swc2 ? REG_TEMP : c.i.rt;
 
        rt = lightrec_alloc_reg_in(reg_cache, _jit, in_reg, 0);
        rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0);
@@ -1202,7 +1262,7 @@ static void rec_store_memory(struct lightrec_cstate *cstate,
                if (addr_reg == rs && c.i.rs == 0) {
                        addr_reg = LIGHTREC_REG_STATE;
                } else {
-                       jit_addr(tmp, addr_reg, LIGHTREC_REG_STATE);
+                       jit_add_state(tmp, addr_reg);
                        addr_reg = tmp;
                }
 
@@ -1268,14 +1328,15 @@ static void rec_store_direct_no_invalidate(struct lightrec_cstate *cstate,
        jit_state_t *_jit = block->_jit;
        jit_node_t *to_not_ram, *to_end;
        bool swc2 = c.i.op == OP_SWC2;
-       u8 tmp, tmp2, rs, rt, in_reg = swc2 ? REG_CP2_TEMP : c.i.rt;
+       bool offset_ram_or_scratch = state->offset_ram || state->offset_scratch;
+       u8 tmp, tmp2, rs, rt, in_reg = swc2 ? REG_TEMP : c.i.rt;
        s16 imm;
 
        jit_note(__FILE__, __LINE__);
        rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0);
        tmp = lightrec_alloc_reg_temp(reg_cache, _jit);
 
-       if (state->offset_ram || state->offset_scratch)
+       if (offset_ram_or_scratch)
                tmp2 = lightrec_alloc_reg_temp(reg_cache, _jit);
 
        /* Convert to KUNSEG and avoid RAM mirrors */
@@ -1307,7 +1368,7 @@ static void rec_store_direct_no_invalidate(struct lightrec_cstate *cstate,
                jit_movi(tmp2, state->offset_ram);
        }
 
-       if (state->offset_ram || state->offset_scratch) {
+       if (offset_ram_or_scratch) {
                jit_addr(tmp, tmp, tmp2);
                lightrec_free_reg(reg_cache, tmp2);
        }
@@ -1340,7 +1401,7 @@ static void rec_store_direct(struct lightrec_cstate *cstate, const struct block
        jit_node_t *to_not_ram, *to_end;
        bool swc2 = c.i.op == OP_SWC2;
        u8 tmp, tmp2, tmp3, masked_reg, rs, rt;
-       u8 in_reg = swc2 ? REG_CP2_TEMP : c.i.rt;
+       u8 in_reg = swc2 ? REG_TEMP : c.i.rt;
 
        jit_note(__FILE__, __LINE__);
 
@@ -1376,7 +1437,7 @@ static void rec_store_direct(struct lightrec_cstate *cstate, const struct block
 
        if (!lut_is_32bit(state))
                jit_lshi(tmp, tmp, 1);
-       jit_addr(tmp, LIGHTREC_REG_STATE, tmp);
+       jit_add_state(tmp, tmp);
 
        /* Write NULL to the code LUT to invalidate any block that's there */
        if (lut_is_32bit(state))
@@ -1437,7 +1498,7 @@ static void rec_store(struct lightrec_cstate *state,
                case LIGHTREC_IO_SCRATCH:
                case LIGHTREC_IO_DIRECT:
                case LIGHTREC_IO_DIRECT_HW:
-                       rec_cp2_do_mfc2(state, block, offset, c.i.rt, REG_CP2_TEMP);
+                       rec_cp2_do_mfc2(state, block, offset, c.i.rt, REG_TEMP);
                        break;
                default:
                        break;
@@ -1469,7 +1530,7 @@ static void rec_store(struct lightrec_cstate *state,
        }
 
        if (is_swc2)
-               lightrec_discard_reg_if_loaded(state->reg_cache, REG_CP2_TEMP);
+               lightrec_discard_reg_if_loaded(state->reg_cache, REG_TEMP);
 }
 
 static void rec_SB(struct lightrec_cstate *state,
@@ -1519,14 +1580,15 @@ static void rec_load_memory(struct lightrec_cstate *cstate,
 {
        struct regcache *reg_cache = cstate->reg_cache;
        struct opcode *op = &block->opcode_list[offset];
+       bool load_delay = op_flag_load_delay(op->flags) && !cstate->no_load_delay;
        jit_state_t *_jit = block->_jit;
        u8 rs, rt, out_reg, addr_reg, flags = REG_EXT;
        bool no_mask = op_flag_no_mask(op->flags);
        union code c = op->c;
        s16 imm;
 
-       if (c.i.op == OP_LWC2)
-               out_reg = REG_CP2_TEMP;
+       if (load_delay || c.i.op == OP_LWC2)
+               out_reg = REG_TEMP;
        else if (c.i.rt)
                out_reg = c.i.rt;
        else
@@ -1619,14 +1681,16 @@ static void rec_load_direct(struct lightrec_cstate *cstate,
 {
        struct lightrec_state *state = cstate->state;
        struct regcache *reg_cache = cstate->reg_cache;
-       union code c = block->opcode_list[offset].c;
+       struct opcode *op = &block->opcode_list[offset];
+       bool load_delay = op_flag_load_delay(op->flags) && !cstate->no_load_delay;
        jit_state_t *_jit = block->_jit;
        jit_node_t *to_not_ram, *to_not_bios, *to_end, *to_end2;
        u8 tmp, rs, rt, out_reg, addr_reg, flags = REG_EXT;
+       union code c = op->c;
        s16 imm;
 
-       if (c.i.op == OP_LWC2)
-               out_reg = REG_CP2_TEMP;
+       if (load_delay || c.i.op == OP_LWC2)
+               out_reg = REG_TEMP;
        else if (c.i.rt)
                out_reg = c.i.rt;
        else
@@ -1754,8 +1818,8 @@ static void rec_load(struct lightrec_cstate *state, const struct block *block,
        }
 
        if (op->i.op == OP_LWC2) {
-               rec_cp2_do_mtc2(state, block, offset, op->i.rt, REG_CP2_TEMP);
-               lightrec_discard_reg_if_loaded(state->reg_cache, REG_CP2_TEMP);
+               rec_cp2_do_mtc2(state, block, offset, op->i.rt, REG_TEMP);
+               lightrec_discard_reg_if_loaded(state->reg_cache, REG_TEMP);
        }
 }
 
@@ -1827,6 +1891,15 @@ static void rec_break_syscall(struct lightrec_cstate *state,
        jit_stxi_i(offsetof(struct lightrec_state, exit_flags),
                   LIGHTREC_REG_STATE, tmp);
 
+       jit_ldxi_i(tmp, LIGHTREC_REG_STATE,
+                  offsetof(struct lightrec_state, target_cycle));
+       jit_subr(tmp, tmp, LIGHTREC_REG_CYCLE);
+       jit_movi(LIGHTREC_REG_CYCLE, 0);
+       jit_stxi_i(offsetof(struct lightrec_state, target_cycle),
+                  LIGHTREC_REG_STATE, tmp);
+       jit_stxi_i(offsetof(struct lightrec_state, current_cycle),
+                  LIGHTREC_REG_STATE, tmp);
+
        lightrec_free_reg(reg_cache, tmp);
 
        /* TODO: the return address should be "pc - 4" if we're a delay slot */
@@ -1872,6 +1945,7 @@ static void rec_mtc(struct lightrec_cstate *state, const struct block *block, u1
        jit_note(__FILE__, __LINE__);
        lightrec_clean_reg_if_loaded(reg_cache, _jit, c.i.rs, false);
        lightrec_clean_reg_if_loaded(reg_cache, _jit, c.i.rt, false);
+       lightrec_clean_reg_if_loaded(reg_cache, _jit, REG_TEMP, false);
 
        call_to_c_wrapper(state, block, c.opcode, C_WRAPPER_MTC);
 
@@ -1901,13 +1975,16 @@ rec_mfc0(struct lightrec_cstate *state, const struct block *block, u16 offset)
        lightrec_free_reg(reg_cache, rt);
 }
 
-static bool block_in_bios(const struct lightrec_cstate *state,
-                         const struct block *block)
+static bool block_uses_icache(const struct lightrec_cstate *state,
+                             const struct block *block)
 {
-       const struct lightrec_mem_map *bios = &state->state->maps[PSX_MAP_BIOS];
+       const struct lightrec_mem_map *map = &state->state->maps[PSX_MAP_KERNEL_USER_RAM];
        u32 pc = kunseg(block->pc);
 
-       return pc >= bios->pc && pc < bios->pc + bios->length;
+       if (pc < map->pc || pc >= map->pc + map->length)
+               return false;
+
+       return (block->pc >> 28) < 0xa;
 }
 
 static void
@@ -1933,11 +2010,11 @@ rec_mtc0(struct lightrec_cstate *state, const struct block *block, u16 offset)
                break;
        }
 
-       if (/*block_in_bios(state, block) &&*/ c.r.rd == 12) {
-               /* If we are running code from the BIOS, handle writes to the
-                * Status register in C. BIOS code may toggle bit 16 which will
-                * map/unmap the RAM, while game code cannot do that. */
-               /*  ^ wrong, it can execute from 0xa0000000 with isolated cache */
+       if (!block_uses_icache(state, block) && c.r.rd == 12) {
+               /* If we are not running code from the RAM through kuseg or
+                * kseg0, handle writes to the Status register in C; as the
+                * code may toggle bit 16 which isolates the cache. Code
+                * running from kuseg or kseg0 in RAM cannot do that. */
                rec_mtc(state, block, offset);
                return;
        }
@@ -2193,7 +2270,6 @@ static void rec_cp2_do_mtc2(struct lightrec_cstate *state,
 {
        struct regcache *reg_cache = state->reg_cache;
        jit_state_t *_jit = block->_jit;
-       jit_node_t *loop, *to_loop;
        u8 rt, tmp, tmp2, flags = 0;
 
        _jit_name(block->_jit, __func__);
@@ -2246,30 +2322,20 @@ static void rec_cp2_do_mtc2(struct lightrec_cstate *state,
                break;
        case 30:
                tmp = lightrec_alloc_reg_temp(reg_cache, _jit);
-               tmp2 = lightrec_alloc_reg_temp(reg_cache, _jit);
 
                /* if (rt < 0) rt = ~rt; */
                jit_rshi(tmp, rt, 31);
                jit_xorr(tmp, rt, tmp);
 
-               /* We know the sign bit is 0. Left-shift by 1 to start the algorithm */
-               jit_lshi(tmp, tmp, 1);
-               jit_movi(tmp2, 33);
-
-               /* Decrement tmp2 and right-shift the value by 1 until it equals zero */
-               loop = jit_label();
-               jit_subi(tmp2, tmp2, 1);
-               jit_rshi_u(tmp, tmp, 1);
-               to_loop = jit_bnei(tmp, 0);
-
-               jit_patch_at(to_loop, loop);
+               /* Count leading zeros */
+               jit_clzr(tmp, tmp);
+               if (__WORDSIZE != 32)
+                       jit_subi(tmp, tmp, __WORDSIZE - 32);
 
-               jit_stxi_i(cp2d_i_offset(31), LIGHTREC_REG_STATE, tmp2);
-               jit_stxi_i(cp2d_i_offset(30), LIGHTREC_REG_STATE, rt);
+               jit_stxi_i(cp2d_i_offset(31), LIGHTREC_REG_STATE, tmp);
 
                lightrec_free_reg(reg_cache, tmp);
-               lightrec_free_reg(reg_cache, tmp2);
-               break;
+               fallthrough;
        default:
                jit_stxi_i(cp2d_i_offset(reg), LIGHTREC_REG_STATE, rt);
                break;
@@ -2406,34 +2472,44 @@ static void rec_meta_MOV(struct lightrec_cstate *state,
        unload_rd = OPT_EARLY_UNLOAD
                && LIGHTREC_FLAGS_GET_RD(op->flags) == LIGHTREC_REG_UNLOAD;
 
-       if (c.r.rs || unload_rd)
-               rs = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rs, 0);
+       if (c.m.rs && !lightrec_reg_is_loaded(reg_cache, c.m.rs)) {
+               /* The source register is not yet loaded - we can load its value
+                * from the register cache directly into the target register. */
+               rd = lightrec_alloc_reg_out(reg_cache, _jit, c.m.rd, REG_EXT);
+
+               jit_ldxi_i(rd, LIGHTREC_REG_STATE,
+                          offsetof(struct lightrec_state, regs.gpr) + (c.m.rs << 2));
 
-       if (unload_rd) {
+               lightrec_free_reg(reg_cache, rd);
+       } else if (unload_rd) {
                /* If the destination register will be unloaded right after the
                 * MOV meta-opcode, we don't actually need to write any host
                 * register - we can just store the source register directly to
                 * the register cache, at the offset corresponding to the
                 * destination register. */
-               lightrec_discard_reg_if_loaded(reg_cache, c.r.rd);
+               lightrec_discard_reg_if_loaded(reg_cache, c.m.rd);
+
+               rs = lightrec_alloc_reg_in(reg_cache, _jit, c.m.rs, 0);
 
                jit_stxi_i(offsetof(struct lightrec_state, regs.gpr)
-                          + c.r.rd << 2, LIGHTREC_REG_STATE, rs);
+                          + (c.m.rd << 2), LIGHTREC_REG_STATE, rs);
 
                lightrec_free_reg(reg_cache, rs);
        } else {
-               rd = lightrec_alloc_reg_out(reg_cache, _jit, c.r.rd, REG_EXT);
+               if (c.m.rs)
+                       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.m.rs, 0);
+
+               rd = lightrec_alloc_reg_out(reg_cache, _jit, c.m.rd, REG_EXT);
 
-               if (c.r.rs == 0)
+               if (c.m.rs == 0) {
                        jit_movi(rd, 0);
-               else
+               } else {
                        jit_extr_i(rd, rs);
+                       lightrec_free_reg(reg_cache, rs);
+               }
 
                lightrec_free_reg(reg_cache, rd);
        }
-
-       if (c.r.rs || unload_rd)
-               lightrec_free_reg(reg_cache, rs);
 }
 
 static void rec_meta_EXTC_EXTS(struct lightrec_cstate *state,
@@ -2443,21 +2519,21 @@ static void rec_meta_EXTC_EXTS(struct lightrec_cstate *state,
        struct regcache *reg_cache = state->reg_cache;
        union code c = block->opcode_list[offset].c;
        jit_state_t *_jit = block->_jit;
-       u8 rs, rt;
+       u8 rs, rd;
 
        _jit_name(block->_jit, __func__);
        jit_note(__FILE__, __LINE__);
 
-       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0);
-       rt = lightrec_alloc_reg_out(reg_cache, _jit, c.i.rt, REG_EXT);
+       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.m.rs, 0);
+       rd = lightrec_alloc_reg_out(reg_cache, _jit, c.m.rd, REG_EXT);
 
-       if (c.i.op == OP_META_EXTC)
-               jit_extr_c(rt, rs);
+       if (c.m.op == OP_META_EXTC)
+               jit_extr_c(rd, rs);
        else
-               jit_extr_s(rt, rs);
+               jit_extr_s(rd, rs);
 
        lightrec_free_reg(reg_cache, rs);
-       lightrec_free_reg(reg_cache, rt);
+       lightrec_free_reg(reg_cache, rd);
 }
 
 static void rec_meta_MULT2(struct lightrec_cstate *state,
@@ -2524,6 +2600,29 @@ static void rec_meta_MULT2(struct lightrec_cstate *state,
        jit_note(__FILE__, __LINE__);
 }
 
+static void rec_meta_COM(struct lightrec_cstate *state,
+                        const struct block *block, u16 offset)
+{
+       struct regcache *reg_cache = state->reg_cache;
+       union code c = block->opcode_list[offset].c;
+       jit_state_t *_jit = block->_jit;
+       u8 rd, rs, flags;
+
+       jit_note(__FILE__, __LINE__);
+       rs = lightrec_alloc_reg_in(reg_cache, _jit, c.m.rs, 0);
+       rd = lightrec_alloc_reg_out(reg_cache, _jit, c.m.rd, 0);
+
+       flags = lightrec_get_reg_in_flags(reg_cache, rs);
+
+       lightrec_set_reg_out_flags(reg_cache, rd,
+                                  flags & REG_EXT);
+
+       jit_comr(rd, rs);
+
+       lightrec_free_reg(reg_cache, rs);
+       lightrec_free_reg(reg_cache, rd);
+}
+
 static const lightrec_rec_func_t rec_standard[64] = {
        SET_DEFAULT_ELM(rec_standard, unknown_opcode),
        [OP_SPECIAL]            = rec_SPECIAL,
@@ -2559,9 +2658,7 @@ static const lightrec_rec_func_t rec_standard[64] = {
        [OP_LWC2]               = rec_LW,
        [OP_SWC2]               = rec_SW,
 
-       [OP_META_MOV]           = rec_meta_MOV,
-       [OP_META_EXTC]          = rec_meta_EXTC_EXTS,
-       [OP_META_EXTS]          = rec_meta_EXTC_EXTS,
+       [OP_META]               = rec_META,
        [OP_META_MULT2]         = rec_meta_MULT2,
        [OP_META_MULTU2]        = rec_meta_MULT2,
 };
@@ -2623,6 +2720,14 @@ static const lightrec_rec_func_t rec_cp2_basic[64] = {
        [OP_CP2_BASIC_CTC2]     = rec_cp2_basic_CTC2,
 };
 
+static const lightrec_rec_func_t rec_meta[64] = {
+       SET_DEFAULT_ELM(rec_meta, unknown_opcode),
+       [OP_META_MOV]           = rec_meta_MOV,
+       [OP_META_EXTC]          = rec_meta_EXTC_EXTS,
+       [OP_META_EXTS]          = rec_meta_EXTC_EXTS,
+       [OP_META_COM]           = rec_meta_COM,
+};
+
 static void rec_SPECIAL(struct lightrec_cstate *state,
                        const struct block *block, u16 offset)
 {
@@ -2676,6 +2781,18 @@ static void rec_CP2(struct lightrec_cstate *state,
        rec_CP(state, block, offset);
 }
 
+static void rec_META(struct lightrec_cstate *state,
+                    const struct block *block, u16 offset)
+{
+       union code c = block->opcode_list[offset].c;
+       lightrec_rec_func_t f = rec_meta[c.m.op];
+
+       if (!HAS_DEFAULT_ELM && unlikely(!f))
+               unknown_opcode(state, block, offset);
+       else
+               (*f)(state, block, offset);
+}
+
 void lightrec_rec_opcode(struct lightrec_cstate *state,
                         const struct block *block, u16 offset)
 {
@@ -2715,4 +2832,6 @@ void lightrec_rec_opcode(struct lightrec_cstate *state,
 
                lightrec_do_early_unload(state, block, unload_offset);
        }
+
+       state->no_load_delay = false;
 }
index 4cbe8da..c960a7f 100644 (file)
@@ -13,7 +13,7 @@ struct lightrec_cstate;
 struct opcode;
 
 void lightrec_rec_opcode(struct lightrec_cstate *state, const struct block *block, u16 offset);
-void lightrec_emit_eob(struct lightrec_cstate *state,
-                      const struct block *block, u16 offset);
+void lightrec_emit_jump_to_interpreter(struct lightrec_cstate *state,
+                                      const struct block *block, u16 offset);
 
 #endif /* __EMITTER_H__ */
index ea8098c..80a07f3 100644 (file)
@@ -16,6 +16,7 @@ struct interpreter;
 static u32 int_CP0(struct interpreter *inter);
 static u32 int_CP2(struct interpreter *inter);
 static u32 int_SPECIAL(struct interpreter *inter);
+static u32 int_META(struct interpreter *inter);
 static u32 int_REGIMM(struct interpreter *inter);
 static u32 int_branch(struct interpreter *inter, u32 pc,
                      union code code, bool branch);
@@ -45,7 +46,7 @@ static inline u32 int_get_ds_pc(const struct interpreter *inter, s16 imm)
 
 static inline struct opcode *next_op(const struct interpreter *inter)
 {
-       return &inter->block->opcode_list[inter->offset + 1];
+       return &inter->op[1];
 }
 
 static inline u32 execute(lightrec_int_func_t func, struct interpreter *inter)
@@ -186,7 +187,7 @@ static u32 int_delay_slot(struct interpreter *inter, u32 pc, bool branch)
         * interpreter in that case.
         * Same goes for when we have a branch in a delay slot of another
         * branch. */
-       load_in_ds = load_in_delay_slot(op->c);
+       load_in_ds = opcode_is_load(op->c) || opcode_is_mfc(op->c);
        branch_in_ds = has_delay_slot(op->c);
 
        if (branch) {
@@ -241,6 +242,7 @@ static u32 int_delay_slot(struct interpreter *inter, u32 pc, bool branch)
                        new_op.c = op_next;
                        new_op.flags = 0;
                        inter2.op = &new_op;
+                       inter2.offset = 0;
 
                        /* Execute the first opcode of the next block */
                        lightrec_int_op(&inter2);
@@ -259,6 +261,7 @@ static u32 int_delay_slot(struct interpreter *inter, u32 pc, bool branch)
        inter2.block = inter->block;
        inter2.op = op;
        inter2.cycles = inter->cycles;
+       inter2.offset = inter->offset + 1;
 
        if (dummy_ld)
                new_rt = reg_cache[op->r.rt];
@@ -351,11 +354,6 @@ static u32 int_jumpr(struct interpreter *inter, u8 link_reg)
        u32 old_pc = int_get_branch_pc(inter);
        u32 next_pc = state->regs.gpr[inter->op->r.rs];
 
-       if (op_flag_emulate_branch(inter->op->flags) && inter->offset) {
-               inter->cycles -= lightrec_cycles_of_opcode(inter->op->c);
-               return old_pc;
-       }
-
        if (link_reg)
                state->regs.gpr[link_reg] = old_pc + 8;
 
@@ -391,11 +389,6 @@ static u32 int_branch(struct interpreter *inter, u32 pc,
 {
        u32 next_pc = pc + 4 + ((s16)code.i.imm << 2);
 
-       if (op_flag_emulate_branch(inter->op->flags) && inter->offset) {
-               inter->cycles -= lightrec_cycles_of_opcode(inter->op->c);
-               return pc;
-       }
-
        update_cycles_before_branch(inter);
 
        if (op_flag_no_ds(inter->op->flags)) {
@@ -605,11 +598,14 @@ static u32 int_io(struct interpreter *inter, bool is_load)
 {
        struct opcode_i *op = &inter->op->i;
        u32 *reg_cache = inter->state->regs.gpr;
-       u32 val;
+       u32 val, *flags = NULL;
+
+       if (inter->block)
+               flags = &inter->op->flags;
 
        val = lightrec_rw(inter->state, inter->op->c,
                          reg_cache[op->rs], reg_cache[op->rt],
-                         &inter->op->flags, inter->block);
+                         flags, inter->block, inter->offset);
 
        if (is_load && op->rt)
                reg_cache[op->rt] = val;
@@ -632,7 +628,7 @@ static u32 int_store(struct interpreter *inter)
        lightrec_rw(inter->state, inter->op->c,
                    inter->state->regs.gpr[inter->op->i.rs],
                    inter->state->regs.gpr[inter->op->i.rt],
-                   &inter->op->flags, inter->block);
+                   &inter->op->flags, inter->block, inter->offset);
 
        next_pc = int_get_ds_pc(inter, 1);
 
@@ -717,9 +713,9 @@ static u32 int_syscall_break(struct interpreter *inter)
 {
 
        if (inter->op->r.op == OP_SPECIAL_BREAK)
-               inter->state->exit_flags |= LIGHTREC_EXIT_BREAK;
+               lightrec_set_exit_flags(inter->state, LIGHTREC_EXIT_BREAK);
        else
-               inter->state->exit_flags |= LIGHTREC_EXIT_SYSCALL;
+               lightrec_set_exit_flags(inter->state, LIGHTREC_EXIT_SYSCALL);
 
        return int_get_ds_pc(inter, 0);
 }
@@ -955,7 +951,7 @@ static u32 int_special_SLTU(struct interpreter *inter)
 static u32 int_META_MOV(struct interpreter *inter)
 {
        u32 *reg_cache = inter->state->regs.gpr;
-       struct opcode_r *op = &inter->op->r;
+       struct opcode_m *op = &inter->op->m;
 
        if (likely(op->rd))
                reg_cache[op->rd] = reg_cache[op->rs];
@@ -966,10 +962,10 @@ static u32 int_META_MOV(struct interpreter *inter)
 static u32 int_META_EXTC(struct interpreter *inter)
 {
        u32 *reg_cache = inter->state->regs.gpr;
-       struct opcode_i *op = &inter->op->i;
+       struct opcode_m *op = &inter->op->m;
 
-       if (likely(op->rt))
-               reg_cache[op->rt] = (u32)(s32)(s8)reg_cache[op->rs];
+       if (likely(op->rd))
+               reg_cache[op->rd] = (u32)(s32)(s8)reg_cache[op->rs];
 
        return jump_next(inter);
 }
@@ -977,10 +973,10 @@ static u32 int_META_EXTC(struct interpreter *inter)
 static u32 int_META_EXTS(struct interpreter *inter)
 {
        u32 *reg_cache = inter->state->regs.gpr;
-       struct opcode_i *op = &inter->op->i;
+       struct opcode_m *op = &inter->op->m;
 
-       if (likely(op->rt))
-               reg_cache[op->rt] = (u32)(s32)(s16)reg_cache[op->rs];
+       if (likely(op->rd))
+               reg_cache[op->rd] = (u32)(s32)(s16)reg_cache[op->rs];
 
        return jump_next(inter);
 }
@@ -1012,6 +1008,17 @@ static u32 int_META_MULT2(struct interpreter *inter)
        return jump_next(inter);
 }
 
+static u32 int_META_COM(struct interpreter *inter)
+{
+       u32 *reg_cache = inter->state->regs.gpr;
+       union code c = inter->op->c;
+
+       if (likely(c.m.rd))
+               reg_cache[c.m.rd] = ~reg_cache[c.m.rs];
+
+       return jump_next(inter);
+}
+
 static const lightrec_int_func_t int_standard[64] = {
        SET_DEFAULT_ELM(int_standard, int_unimplemented),
        [OP_SPECIAL]            = int_SPECIAL,
@@ -1047,9 +1054,7 @@ static const lightrec_int_func_t int_standard[64] = {
        [OP_LWC2]               = int_LWC2,
        [OP_SWC2]               = int_store,
 
-       [OP_META_MOV]           = int_META_MOV,
-       [OP_META_EXTC]          = int_META_EXTC,
-       [OP_META_EXTS]          = int_META_EXTS,
+       [OP_META]               = int_META,
        [OP_META_MULT2]         = int_META_MULT2,
        [OP_META_MULTU2]        = int_META_MULT2,
 };
@@ -1111,6 +1116,14 @@ static const lightrec_int_func_t int_cp2_basic[64] = {
        [OP_CP2_BASIC_CTC2]     = int_ctc,
 };
 
+static const lightrec_int_func_t int_meta[64] = {
+       SET_DEFAULT_ELM(int_meta, int_unimplemented),
+       [OP_META_MOV]           = int_META_MOV,
+       [OP_META_EXTC]          = int_META_EXTC,
+       [OP_META_EXTS]          = int_META_EXTS,
+       [OP_META_COM]           = int_META_COM,
+};
+
 static u32 int_SPECIAL(struct interpreter *inter)
 {
        lightrec_int_func_t f = int_special[inter->op->r.op];
@@ -1152,6 +1165,16 @@ static u32 int_CP2(struct interpreter *inter)
        return int_CP(inter);
 }
 
+static u32 int_META(struct interpreter *inter)
+{
+       lightrec_int_func_t f = int_meta[inter->op->m.op];
+
+       if (!HAS_DEFAULT_ELM && unlikely(!f))
+               return int_unimplemented(inter);
+
+       return execute(f, inter);
+}
+
 static u32 lightrec_emulate_block_list(struct lightrec_state *state,
                                       struct block *block, u32 offset)
 {
@@ -1188,3 +1211,75 @@ u32 lightrec_emulate_block(struct lightrec_state *state, struct block *block, u3
 
        return 0;
 }
+
+static u32 branch_get_next_pc(struct lightrec_state *state, union code c, u32 pc)
+{
+       switch (c.i.op) {
+       case OP_SPECIAL:
+               /* JR / JALR */
+               return state->regs.gpr[c.r.rs];
+       case OP_J:
+       case OP_JAL:
+               return (pc & 0xf0000000) | (c.j.imm << 2);
+       default:
+               /* Branch opcodes */
+               return pc + 4 + ((s16)c.i.imm << 2);
+       }
+}
+
+u32 lightrec_handle_load_delay(struct lightrec_state *state,
+                              struct block *block, u32 pc, u32 reg)
+{
+       union code c = lightrec_read_opcode(state, pc);
+       struct opcode op[2] = {
+               {
+                       .c = c,
+                       .flags = 0,
+               },
+               {
+                       .flags = 0,
+               },
+       };
+       struct interpreter inter = {
+               .block = block,
+               .state = state,
+               .offset = 0,
+               .op = op,
+               .cycles = 0,
+       };
+       bool branch_taken;
+       u32 reg_mask, next_pc;
+
+       if (has_delay_slot(c)) {
+               op[1].c = lightrec_read_opcode(state, pc + 4);
+
+               branch_taken = is_branch_taken(state->regs.gpr, c);
+               next_pc = branch_get_next_pc(state, c, pc);
+
+               /* Branch was evaluated, we can write the load opcode's target
+                * register now. */
+               state->regs.gpr[reg] = state->temp_reg;
+
+               /* Handle JALR / regimm opcodes setting $ra (or any other
+                * register in the case of JALR) */
+               reg_mask = (u32)opcode_write_mask(c);
+               if (reg_mask)
+                       state->regs.gpr[ctz32(reg_mask)] = pc + 8;
+
+               /* Handle delay slot of the branch opcode */
+               pc = int_delay_slot(&inter, next_pc, branch_taken);
+       } else {
+               /* Make sure we only run one instruction */
+               inter.delay_slot = true;
+
+               lightrec_int_op(&inter);
+               pc += 4;
+
+               if (!opcode_writes_register(c, reg))
+                       state->regs.gpr[reg] = state->temp_reg;
+       }
+
+       state->current_cycle += inter.cycles;
+
+       return pc;
+}
index 96600bf..51c5390 100644 (file)
@@ -11,5 +11,7 @@
 struct block;
 
 u32 lightrec_emulate_block(struct lightrec_state *state, struct block *block, u32 pc);
+u32 lightrec_handle_load_delay(struct lightrec_state *state,
+                              struct block *block, u32 pc, u32 reg);
 
 #endif /* __LIGHTREC_INTERPRETER_H__ */
index b0e8bf3..4cb97d3 100644 (file)
 
 #define jit_b()                        jit_beqr(0, 0)
 
+#if defined(__sh__)
+#define jit_add_state(u,v)                                             \
+       do {                                                            \
+               jit_new_node_ww(jit_code_movr,_R0,LIGHTREC_REG_STATE);  \
+               jit_new_node_www(jit_code_addr,u,v,_R0);                \
+       } while (0)
+#else
+#define jit_add_state(u,v)     jit_addr(u,v,LIGHTREC_REG_STATE)
+#endif
+
 #endif /* __LIGHTNING_WRAPPER_H__ */
index 1188665..ed29ee4 100644 (file)
 #cmakedefine01 OPT_REMOVE_DIV_BY_ZERO_SEQ
 #cmakedefine01 OPT_REPLACE_MEMSET
 #cmakedefine01 OPT_DETECT_IMPOSSIBLE_BRANCHES
+#cmakedefine01 OPT_HANDLE_LOAD_DELAYS
 #cmakedefine01 OPT_TRANSFORM_OPS
 #cmakedefine01 OPT_LOCAL_BRANCHES
 #cmakedefine01 OPT_SWITCH_DELAY_SLOTS
-#cmakedefine01 OPT_FLAG_STORES
 #cmakedefine01 OPT_FLAG_IO
 #cmakedefine01 OPT_FLAG_MULT_DIV
 #cmakedefine01 OPT_EARLY_UNLOAD
index e67d406..12e953a 100644 (file)
@@ -81,7 +81,7 @@
 
 #define REG_LO 32
 #define REG_HI 33
-#define REG_CP2_TEMP (offsetof(struct lightrec_state, cp2_temp_reg) / sizeof(u32))
+#define REG_TEMP (offsetof(struct lightrec_state, temp_reg) / sizeof(u32))
 
 /* Definition of jit_state_t (avoids inclusion of <lightning.h>) */
 struct jit_node;
@@ -149,13 +149,16 @@ struct lightrec_cstate {
        unsigned int cycles;
 
        struct regcache *reg_cache;
+
+       _Bool no_load_delay;
 };
 
 struct lightrec_state {
        struct lightrec_registers regs;
-       u32 cp2_temp_reg;
+       u32 temp_reg;
        u32 next_pc;
        uintptr_t wrapper_regs[NUM_TEMPS];
+       u8 in_delay_slot_n;
        u32 current_cycle;
        u32 target_cycle;
        u32 exit_flags;
@@ -169,10 +172,13 @@ struct lightrec_state {
        struct reaper *reaper;
        void *tlsf;
        void (*eob_wrapper_func)(void);
+       void (*interpreter_func)(void);
+       void (*ds_check_func)(void);
        void (*memset_func)(void);
        void (*get_next_block)(void);
        struct lightrec_ops ops;
        unsigned int nb_precompile;
+       unsigned int nb_compile;
        unsigned int nb_maps;
        const struct lightrec_mem_map *maps;
        uintptr_t offset_ram, offset_bios, offset_scratch, offset_io;
@@ -182,9 +188,8 @@ struct lightrec_state {
        void *code_lut[];
 };
 
-u32 lightrec_rw(struct lightrec_state *state, union code op,
-               u32 addr, u32 data, u32 *flags,
-               struct block *block);
+u32 lightrec_rw(struct lightrec_state *state, union code op, u32 addr,
+               u32 data, u32 *flags, struct block *block, u16 offset);
 
 void lightrec_free_block(struct lightrec_state *state, struct block *block);
 
@@ -285,7 +290,7 @@ int lightrec_compile_block(struct lightrec_cstate *cstate, struct block *block);
 void lightrec_free_opcode_list(struct lightrec_state *state,
                               struct opcode *list);
 
-unsigned int lightrec_cycles_of_opcode(union code code);
+__cnst unsigned int lightrec_cycles_of_opcode(union code code);
 
 static inline u8 get_mult_div_lo(union code c)
 {
@@ -349,4 +354,10 @@ static inline _Bool can_zero_extend(u32 value, u8 order)
       return (value >> order) == 0;
 }
 
+static inline const struct opcode *
+get_delay_slot(const struct opcode *list, u16 i)
+{
+       return op_flag_no_ds(list[i].flags) ? &list[i - 1] : &list[i + 1];
+}
+
 #endif /* __LIGHTREC_PRIVATE_H__ */
index b9e82fb..d5b1de9 100644 (file)
@@ -237,26 +237,43 @@ lightrec_get_map(struct lightrec_state *state, void **host, u32 kaddr)
        return map;
 }
 
-u32 lightrec_rw(struct lightrec_state *state, union code op,
-               u32 addr, u32 data, u32 *flags, struct block *block)
+u32 lightrec_rw(struct lightrec_state *state, union code op, u32 base,
+               u32 data, u32 *flags, struct block *block, u16 offset)
 {
        const struct lightrec_mem_map *map;
        const struct lightrec_mem_map_ops *ops;
        u32 opcode = op.opcode;
+       bool was_tagged = true;
+       u16 old_flags;
+       u32 addr;
        void *host;
 
-       addr += (s16) op.i.imm;
+       addr = kunseg(base + (s16) op.i.imm);
 
-       map = lightrec_get_map(state, &host, kunseg(addr));
+       map = lightrec_get_map(state, &host, addr);
        if (!map) {
                __segfault_cb(state, addr, block);
                return 0;
        }
 
+       if (flags)
+               was_tagged = LIGHTREC_FLAGS_GET_IO_MODE(*flags);
 
        if (likely(!map->ops)) {
-               if (flags && !LIGHTREC_FLAGS_GET_IO_MODE(*flags))
-                       *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_DIRECT);
+               if (flags && !LIGHTREC_FLAGS_GET_IO_MODE(*flags)) {
+                       /* Force parallel port accesses as HW accesses, because
+                        * the direct-I/O emitters can't differentiate it. */
+                       if (unlikely(map == &state->maps[PSX_MAP_PARALLEL_PORT]))
+                               *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_HW);
+                       /* If the base register is 0x0, be extra suspicious.
+                        * Some games (e.g. Sled Storm) actually do segmentation
+                        * faults by using uninitialized pointers, which are
+                        * later initialized to point to hardware registers. */
+                       else if (op.i.rs && base == 0x0)
+                               *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_HW);
+                       else
+                               *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_DIRECT);
+               }
 
                ops = &lightrec_default_ops;
        } else if (flags &&
@@ -269,6 +286,17 @@ u32 lightrec_rw(struct lightrec_state *state, union code op,
                ops = map->ops;
        }
 
+       if (!was_tagged) {
+               old_flags = block_set_flags(block, BLOCK_SHOULD_RECOMPILE);
+
+               if (!(old_flags & BLOCK_SHOULD_RECOMPILE)) {
+                       pr_debug("Opcode of block at PC 0x%08x has been tagged"
+                                " - flag for recompilation\n", block->pc);
+
+                       lut_write(state, lut_offset(block->pc), NULL);
+               }
+       }
+
        switch (op.i.op) {
        case OP_SB:
                ops->sb(state, opcode, host, addr, (u8) data);
@@ -311,10 +339,10 @@ u32 lightrec_rw(struct lightrec_state *state, union code op,
 
 static void lightrec_rw_helper(struct lightrec_state *state,
                               union code op, u32 *flags,
-                              struct block *block)
+                              struct block *block, u16 offset)
 {
        u32 ret = lightrec_rw(state, op, state->regs.gpr[op.i.rs],
-                             state->regs.gpr[op.i.rt], flags, block);
+                             state->regs.gpr[op.i.rt], flags, block, offset);
 
        switch (op.i.op) {
        case OP_LB:
@@ -324,8 +352,12 @@ static void lightrec_rw_helper(struct lightrec_state *state,
        case OP_LWL:
        case OP_LWR:
        case OP_LW:
-               if (op.i.rt)
+               if (OPT_HANDLE_LOAD_DELAYS && unlikely(!state->in_delay_slot_n)) {
+                       state->temp_reg = ret;
+                       state->in_delay_slot_n = 0xff;
+               } else if (op.i.rt) {
                        state->regs.gpr[op.i.rt] = ret;
+               }
                fallthrough;
        default:
                break;
@@ -334,16 +366,14 @@ static void lightrec_rw_helper(struct lightrec_state *state,
 
 static void lightrec_rw_cb(struct lightrec_state *state, u32 arg)
 {
-       lightrec_rw_helper(state, (union code) arg, NULL, NULL);
+       lightrec_rw_helper(state, (union code) arg, NULL, NULL, 0);
 }
 
 static void lightrec_rw_generic_cb(struct lightrec_state *state, u32 arg)
 {
        struct block *block;
        struct opcode *op;
-       bool was_tagged;
        u16 offset = (u16)arg;
-       u16 old_flags;
 
        block = lightrec_find_block_from_lut(state->block_cache,
                                             arg >> 16, state->next_pc);
@@ -355,20 +385,7 @@ static void lightrec_rw_generic_cb(struct lightrec_state *state, u32 arg)
        }
 
        op = &block->opcode_list[offset];
-       was_tagged = LIGHTREC_FLAGS_GET_IO_MODE(op->flags);
-
-       lightrec_rw_helper(state, op->c, &op->flags, block);
-
-       if (!was_tagged) {
-               old_flags = block_set_flags(block, BLOCK_SHOULD_RECOMPILE);
-
-               if (!(old_flags & BLOCK_SHOULD_RECOMPILE)) {
-                       pr_debug("Opcode of block at PC 0x%08x has been tagged"
-                                " - flag for recompilation\n", block->pc);
-
-                       lut_write(state, lut_offset(block->pc), NULL);
-               }
-       }
+       lightrec_rw_helper(state, op->c, &op->flags, block, offset);
 }
 
 static u32 clamp_s32(s32 val, s32 min, s32 max)
@@ -462,7 +479,7 @@ static void lightrec_mfc_cb(struct lightrec_state *state, union code op)
        u32 rt = lightrec_mfc(state, op);
 
        if (op.i.op == OP_SWC2)
-               state->cp2_temp_reg = rt;
+               state->temp_reg = rt;
        else if (op.r.rt)
                state->regs.gpr[op.r.rt] = rt;
 }
@@ -603,7 +620,7 @@ static void lightrec_mtc_cb(struct lightrec_state *state, u32 arg)
        u8 reg;
 
        if (op.i.op == OP_LWC2) {
-               data = state->cp2_temp_reg;
+               data = state->temp_reg;
                reg = op.i.rt;
        } else {
                data = state->regs.gpr[op.r.rt];
@@ -703,6 +720,7 @@ static void * get_next_block_func(struct lightrec_state *state, u32 pc)
                }
 
                should_recompile = block_has_flag(block, BLOCK_SHOULD_RECOMPILE) &&
+                       !block_has_flag(block, BLOCK_NEVER_COMPILE) &&
                        !block_has_flag(block, BLOCK_IS_DEAD);
 
                if (unlikely(should_recompile)) {
@@ -803,6 +821,8 @@ static void lightrec_free_code(struct lightrec_state *state, void *ptr)
                lightrec_code_alloc_unlock(state);
 }
 
+static char lightning_code_data[0x80000];
+
 static void * lightrec_emit_code(struct lightrec_state *state,
                                 const struct block *block,
                                 jit_state_t *_jit, unsigned int *size)
@@ -813,7 +833,9 @@ static void * lightrec_emit_code(struct lightrec_state *state,
 
        jit_realize();
 
-       if (!ENABLE_DISASSEMBLER)
+       if (ENABLE_DISASSEMBLER)
+               jit_set_data(lightning_code_data, sizeof(lightning_code_data), 0);
+       else
                jit_set_data(NULL, 0, JIT_DISABLE_DATA | JIT_DISABLE_NOTE);
 
        if (has_code_buffer) {
@@ -872,6 +894,15 @@ static struct block * generate_wrapper(struct lightrec_state *state)
        unsigned int i;
        jit_node_t *addr[C_WRAPPERS_COUNT - 1];
        jit_node_t *to_end[C_WRAPPERS_COUNT - 1];
+       u8 tmp = JIT_R1;
+
+#ifdef __sh__
+       /* On SH, GBR-relative loads target the r0 register.
+        * Use it as the temporary register to factorize the move to
+        * JIT_R1. */
+       if (LIGHTREC_REG_STATE == _GBR)
+               tmp = _R0;
+#endif
 
        block = lightrec_malloc(state, MEM_FOR_IR, sizeof(*block));
        if (!block)
@@ -890,17 +921,18 @@ static struct block * generate_wrapper(struct lightrec_state *state)
 
        /* Add entry points */
        for (i = C_WRAPPERS_COUNT - 1; i > 0; i--) {
-               jit_ldxi(JIT_R1, LIGHTREC_REG_STATE,
+               jit_ldxi(tmp, LIGHTREC_REG_STATE,
                         offsetof(struct lightrec_state, c_wrappers[i]));
                to_end[i - 1] = jit_b();
                addr[i - 1] = jit_indirect();
        }
 
-       jit_ldxi(JIT_R1, LIGHTREC_REG_STATE,
+       jit_ldxi(tmp, LIGHTREC_REG_STATE,
                 offsetof(struct lightrec_state, c_wrappers[0]));
 
        for (i = 0; i < C_WRAPPERS_COUNT - 1; i++)
                jit_patch(to_end[i]);
+       jit_movr(JIT_R1, tmp);
 
        jit_epilog();
        jit_prolog();
@@ -1002,11 +1034,54 @@ static u32 lightrec_memset(struct lightrec_state *state)
        return 8 + 5 * (length  + 3 / 4);
 }
 
+static u32 lightrec_check_load_delay(struct lightrec_state *state, u32 pc, u8 reg)
+{
+       struct block *block;
+       union code first_op;
+
+       first_op = lightrec_read_opcode(state, pc);
+
+       if (likely(!opcode_reads_register(first_op, reg))) {
+               state->regs.gpr[reg] = state->temp_reg;
+       } else {
+               block = lightrec_get_block(state, pc);
+               if (unlikely(!block)) {
+                       pr_err("Unable to get block at PC 0x%08x\n", pc);
+                       lightrec_set_exit_flags(state, LIGHTREC_EXIT_SEGFAULT);
+                       pc = 0;
+               } else {
+                       pc = lightrec_handle_load_delay(state, block, pc, reg);
+               }
+       }
+
+       return pc;
+}
+
+static void update_cycle_counter_before_c(jit_state_t *_jit)
+{
+       /* update state->current_cycle */
+       jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE,
+                  offsetof(struct lightrec_state, target_cycle));
+       jit_subr(JIT_R1, JIT_R2, LIGHTREC_REG_CYCLE);
+       jit_stxi_i(offsetof(struct lightrec_state, current_cycle),
+                  LIGHTREC_REG_STATE, JIT_R1);
+}
+
+static void update_cycle_counter_after_c(jit_state_t *_jit)
+{
+       /* Recalc the delta */
+       jit_ldxi_i(JIT_R1, LIGHTREC_REG_STATE,
+                  offsetof(struct lightrec_state, current_cycle));
+       jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE,
+                  offsetof(struct lightrec_state, target_cycle));
+       jit_subr(LIGHTREC_REG_CYCLE, JIT_R2, JIT_R1);
+}
+
 static struct block * generate_dispatcher(struct lightrec_state *state)
 {
        struct block *block;
        jit_state_t *_jit;
-       jit_node_t *to_end, *loop, *addr, *addr2, *addr3;
+       jit_node_t *to_end, *loop, *addr, *addr2, *addr3, *addr4, *addr5, *jmp, *jmp2;
        unsigned int i;
        u32 offset;
 
@@ -1047,13 +1122,70 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
 
                jit_prepare();
                jit_pushargr(LIGHTREC_REG_STATE);
+
                jit_finishi(lightrec_memset);
+               jit_retval(LIGHTREC_REG_CYCLE);
 
                jit_ldxi_ui(JIT_V0, LIGHTREC_REG_STATE,
                            offsetof(struct lightrec_state, regs.gpr[31]));
-
-               jit_retval(LIGHTREC_REG_CYCLE);
                jit_subr(LIGHTREC_REG_CYCLE, JIT_V1, LIGHTREC_REG_CYCLE);
+
+               if (OPT_DETECT_IMPOSSIBLE_BRANCHES || OPT_HANDLE_LOAD_DELAYS)
+                       jmp = jit_b();
+       }
+
+       if (OPT_DETECT_IMPOSSIBLE_BRANCHES) {
+               /* Blocks will jump here when they reach a branch that should
+                * be executed with the interpreter, passing the branch's PC
+                * in JIT_V0 and the address of the block in JIT_V1. */
+               addr4 = jit_indirect();
+
+               update_cycle_counter_before_c(_jit);
+
+               jit_prepare();
+               jit_pushargr(LIGHTREC_REG_STATE);
+               jit_pushargr(JIT_V1);
+               jit_pushargr(JIT_V0);
+               jit_finishi(lightrec_emulate_block);
+
+               jit_retval(JIT_V0);
+
+               update_cycle_counter_after_c(_jit);
+
+               if (OPT_HANDLE_LOAD_DELAYS)
+                       jmp2 = jit_b();
+
+       }
+
+       if (OPT_HANDLE_LOAD_DELAYS) {
+               /* Blocks will jump here when they reach a branch with a load
+                * opcode in its delay slot. The delay slot has already been
+                * executed; the load value is in (state->temp_reg), and the
+                * register number is in JIT_V1.
+                * Jump to a C function which will evaluate the branch target's
+                * first opcode, to make sure that it does not read the register
+                * in question; and if it does, handle it accordingly. */
+               addr5 = jit_indirect();
+
+               update_cycle_counter_before_c(_jit);
+
+               jit_prepare();
+               jit_pushargr(LIGHTREC_REG_STATE);
+               jit_pushargr(JIT_V0);
+               jit_pushargr(JIT_V1);
+               jit_finishi(lightrec_check_load_delay);
+
+               jit_retval(JIT_V0);
+
+               update_cycle_counter_after_c(_jit);
+
+               if (OPT_DETECT_IMPOSSIBLE_BRANCHES)
+                       jit_patch(jmp2);
+       }
+
+       if (OPT_REPLACE_MEMSET
+           && (OPT_DETECT_IMPOSSIBLE_BRANCHES || OPT_HANDLE_LOAD_DELAYS)) {
+               jit_patch(jmp);
        }
 
        /* The block will jump here, with the number of cycles remaining in
@@ -1077,7 +1209,7 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
        /* If possible, use the code LUT */
        if (!lut_is_32bit(state))
                jit_lshi(JIT_V1, JIT_V1, 1);
-       jit_addr(JIT_V1, JIT_V1, LIGHTREC_REG_STATE);
+       jit_add_state(JIT_V1, JIT_V1);
 
        offset = offsetof(struct lightrec_state, code_lut);
        if (lut_is_32bit(state))
@@ -1097,11 +1229,7 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
 
        if (ENABLE_FIRST_PASS || OPT_DETECT_IMPOSSIBLE_BRANCHES) {
                /* We may call the interpreter - update state->current_cycle */
-               jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE,
-                          offsetof(struct lightrec_state, target_cycle));
-               jit_subr(JIT_V1, JIT_R2, LIGHTREC_REG_CYCLE);
-               jit_stxi_i(offsetof(struct lightrec_state, current_cycle),
-                          LIGHTREC_REG_STATE, JIT_V1);
+               update_cycle_counter_before_c(_jit);
        }
 
        jit_prepare();
@@ -1119,11 +1247,7 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
        if (ENABLE_FIRST_PASS || OPT_DETECT_IMPOSSIBLE_BRANCHES) {
                /* The interpreter may have updated state->current_cycle and
                 * state->target_cycle - recalc the delta */
-               jit_ldxi_i(JIT_R1, LIGHTREC_REG_STATE,
-                          offsetof(struct lightrec_state, current_cycle));
-               jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE,
-                          offsetof(struct lightrec_state, target_cycle));
-               jit_subr(LIGHTREC_REG_CYCLE, JIT_R2, JIT_R1);
+               update_cycle_counter_after_c(_jit);
        } else {
                jit_movr(LIGHTREC_REG_CYCLE, JIT_V0);
        }
@@ -1153,6 +1277,10 @@ static struct block * generate_dispatcher(struct lightrec_state *state)
                goto err_free_block;
 
        state->eob_wrapper_func = jit_address(addr2);
+       if (OPT_DETECT_IMPOSSIBLE_BRANCHES)
+               state->interpreter_func = jit_address(addr4);
+       if (OPT_HANDLE_LOAD_DELAYS)
+               state->ds_check_func = jit_address(addr5);
        if (OPT_REPLACE_MEMSET)
                state->memset_func = jit_address(addr3);
        state->get_next_block = jit_address(addr);
@@ -1183,7 +1311,7 @@ union code lightrec_read_opcode(struct lightrec_state *state, u32 pc)
        return (union code) LE32TOH(*code);
 }
 
-unsigned int lightrec_cycles_of_opcode(union code code)
+__cnst unsigned int lightrec_cycles_of_opcode(union code code)
 {
        return 2;
 }
@@ -1291,11 +1419,6 @@ static struct block * lightrec_precompile_block(struct lightrec_state *state,
 
        pr_debug("Block size: %hu opcodes\n", block->nb_ops);
 
-       /* If the first opcode is an 'impossible' branch, never compile the
-        * block */
-       if (should_emulate(block->opcode_list))
-               block_flags |= BLOCK_NEVER_COMPILE;
-
        fully_tagged = lightrec_block_is_fully_tagged(block);
        if (fully_tagged)
                block_flags |= BLOCK_FULLY_TAGGED;
@@ -1311,7 +1434,7 @@ static struct block * lightrec_precompile_block(struct lightrec_state *state,
                addr = state->get_next_block;
        lut_write(state, lut_offset(pc), addr);
 
-       pr_debug("Recompile count: %u\n", state->nb_precompile++);
+       pr_debug("Blocks created: %u\n", ++state->nb_precompile);
 
        return block;
 }
@@ -1324,8 +1447,12 @@ static bool lightrec_block_is_fully_tagged(const struct block *block)
        for (i = 0; i < block->nb_ops; i++) {
                op = &block->opcode_list[i];
 
-               /* Verify that all load/stores of the opcode list
-                * Check all loads/stores of the opcode list and mark the
+               /* If we have one branch that must be emulated, we cannot trash
+                * the opcode list. */
+               if (should_emulate(op))
+                       return false;
+
+               /* Check all loads/stores of the opcode list and mark the
                 * block as fully compiled if they all have been tagged. */
                switch (op->c.i.op) {
                case OP_LB:
@@ -1421,6 +1548,7 @@ int lightrec_compile_block(struct lightrec_cstate *cstate,
        cstate->cycles = 0;
        cstate->nb_local_branches = 0;
        cstate->nb_targets = 0;
+       cstate->no_load_delay = false;
 
        jit_prolog();
        jit_tramp(256);
@@ -1439,7 +1567,7 @@ int lightrec_compile_block(struct lightrec_cstate *cstate,
                        pr_debug("Branch at offset 0x%x will be emulated\n",
                                 i << 2);
 
-                       lightrec_emit_eob(cstate, block, i);
+                       lightrec_emit_jump_to_interpreter(cstate, block, i);
                        skip_next = !op_flag_no_ds(elm->flags);
                } else {
                        lightrec_rec_opcode(cstate, block, i);
@@ -1603,6 +1731,8 @@ int lightrec_compile_block(struct lightrec_cstate *cstate,
                lightrec_unregister(MEM_FOR_CODE, old_code_size);
        }
 
+       pr_debug("Blocks compiled: %u\n", ++state->nb_compile);
+
        return 0;
 }
 
@@ -1775,6 +1905,7 @@ struct lightrec_state * lightrec_init(char *argv0,
 
        state->tlsf = tlsf;
        state->with_32bit_lut = with_32bit_lut;
+       state->in_delay_slot_n = 0xff;
 
        state->block_cache = lightrec_blockcache_init(state);
        if (!state->block_cache)
index 9cd7f47..bd878c8 100644 (file)
@@ -28,6 +28,21 @@ extern "C" {
 #   define __api
 #endif
 
+#ifndef __cnst
+#   ifdef __GNUC__
+#      define __cnst __attribute__((const))
+#   else
+#      define __cnst
+#   endif
+#endif
+#ifndef __pure
+#   ifdef __GNUC__
+#      define __pure __attribute__((pure))
+#   else
+#      define __pure
+#   endif
+#endif
+
 typedef uint64_t u64;
 typedef uint32_t u32;
 typedef uint16_t u16;
@@ -119,7 +134,8 @@ __api void lightrec_set_invalidate_mode(struct lightrec_state *state,
 __api void lightrec_set_exit_flags(struct lightrec_state *state, u32 flags);
 __api u32 lightrec_exit_flags(struct lightrec_state *state);
 
-__api struct lightrec_registers * lightrec_get_registers(struct lightrec_state *state);
+__api __cnst struct lightrec_registers *
+lightrec_get_registers(struct lightrec_state *state);
 
 __api u32 lightrec_current_cycle_count(const struct lightrec_state *state);
 __api void lightrec_reset_cycle_count(struct lightrec_state *state, u32 cycles);
index c7502cd..2934d4c 100644 (file)
@@ -9,7 +9,7 @@
 
 #include <stdlib.h>
 
-#ifdef ENABLE_THREADED_COMPILER
+#if ENABLE_THREADED_COMPILER
 #include <stdatomic.h>
 
 static atomic_uint lightrec_bytes[MEM_TYPE_END];
index 04d9d80..5ce58ad 100644 (file)
@@ -115,6 +115,8 @@ static u64 opcode_read_mask(union code op)
        case OP_SW:
        case OP_SWR:
                return BIT(op.i.rs) | BIT(op.i.rt);
+       case OP_META:
+               return BIT(op.m.rs);
        default:
                return BIT(op.i.rs);
        }
@@ -139,12 +141,14 @@ static u64 mult_div_write_mask(union code op)
        return flags;
 }
 
-static u64 opcode_write_mask(union code op)
+u64 opcode_write_mask(union code op)
 {
        switch (op.i.op) {
        case OP_META_MULT2:
        case OP_META_MULTU2:
                return mult_div_write_mask(op);
+       case OP_META:
+               return BIT(op.m.rd);
        case OP_SPECIAL:
                switch (op.r.op) {
                case OP_SPECIAL_JR:
@@ -182,8 +186,6 @@ static u64 opcode_write_mask(union code op)
        case OP_LBU:
        case OP_LHU:
        case OP_LWR:
-       case OP_META_EXTC:
-       case OP_META_EXTS:
                return BIT(op.i.rt);
        case OP_JAL:
                return BIT(31);
@@ -214,8 +216,6 @@ static u64 opcode_write_mask(union code op)
                default:
                        return 0;
                }
-       case OP_META_MOV:
-               return BIT(op.r.rd);
        default:
                return 0;
        }
@@ -339,7 +339,39 @@ static bool reg_is_read_or_written(const struct opcode *list,
        return reg_is_read(list, a, b, reg) || reg_is_written(list, a, b, reg);
 }
 
-static bool opcode_is_load(union code op)
+bool opcode_is_mfc(union code op)
+{
+       switch (op.i.op) {
+       case OP_CP0:
+               switch (op.r.rs) {
+               case OP_CP0_MFC0:
+               case OP_CP0_CFC0:
+                       return true;
+               default:
+                       break;
+               }
+
+               break;
+       case OP_CP2:
+               if (op.r.op == OP_CP2_BASIC) {
+                       switch (op.r.rs) {
+                       case OP_CP2_BASIC_MFC2:
+                       case OP_CP2_BASIC_CFC2:
+                               return true;
+                       default:
+                               break;
+                       }
+               }
+
+               break;
+       default:
+               break;
+       }
+
+       return false;
+}
+
+bool opcode_is_load(union code op)
 {
        switch (op.i.op) {
        case OP_LB:
@@ -456,46 +488,6 @@ static bool is_nop(union code op)
        }
 }
 
-bool load_in_delay_slot(union code op)
-{
-       switch (op.i.op) {
-       case OP_CP0:
-               switch (op.r.rs) {
-               case OP_CP0_MFC0:
-               case OP_CP0_CFC0:
-                       return true;
-               default:
-                       break;
-               }
-
-               break;
-       case OP_CP2:
-               if (op.r.op == OP_CP2_BASIC) {
-                       switch (op.r.rs) {
-                       case OP_CP2_BASIC_MFC2:
-                       case OP_CP2_BASIC_CFC2:
-                               return true;
-                       default:
-                               break;
-                       }
-               }
-
-               break;
-       case OP_LB:
-       case OP_LH:
-       case OP_LW:
-       case OP_LWL:
-       case OP_LWR:
-       case OP_LBU:
-       case OP_LHU:
-               return true;
-       default:
-               break;
-       }
-
-       return false;
-}
-
 static void lightrec_optimize_sll_sra(struct opcode *list, unsigned int offset,
                                      struct constprop_data *v)
 {
@@ -592,9 +584,10 @@ static void lightrec_optimize_sll_sra(struct opcode *list, unsigned int offset,
                                ldop->i.rt = next->r.rd;
                                to_change->opcode = 0;
                        } else {
-                               to_change->i.op = OP_META_MOV;
-                               to_change->r.rd = next->r.rd;
-                               to_change->r.rs = ldop->i.rt;
+                               to_change->i.op = OP_META;
+                               to_change->m.op = OP_META_MOV;
+                               to_change->m.rd = next->r.rd;
+                               to_change->m.rs = ldop->i.rt;
                        }
 
                        if (to_nop->r.imm == 24)
@@ -611,18 +604,9 @@ static void lightrec_optimize_sll_sra(struct opcode *list, unsigned int offset,
                pr_debug("Convert SLL/SRA #%u to EXT%c\n",
                         curr->r.imm, curr->r.imm == 24 ? 'C' : 'S');
 
-               if (to_change == curr) {
-                       to_change->i.rs = curr->r.rt;
-                       to_change->i.rt = next->r.rd;
-               } else {
-                       to_change->i.rt = next->r.rd;
-                       to_change->i.rs = curr->r.rt;
-               }
-
-               if (to_nop->r.imm == 24)
-                       to_change->i.op = OP_META_EXTC;
-               else
-                       to_change->i.op = OP_META_EXTS;
+               to_change->m.rs = curr->r.rt;
+               to_change->m.op = to_nop->r.imm == 24 ? OP_META_EXTC : OP_META_EXTS;
+               to_change->i.op = OP_META;
        }
 
        to_nop->opcode = 0;
@@ -678,6 +662,12 @@ static void lightrec_modify_lui(struct block *block, unsigned int offset)
                        break;
 
                if (opcode_writes_register(c, lui->i.rt)) {
+                       if (c.i.op == OP_LWL || c.i.op == OP_LWR) {
+                               /* LWL/LWR only partially write their target register;
+                                * therefore the LUI should not write a different value. */
+                               break;
+                       }
+
                        pr_debug("Convert LUI at offset 0x%x to kuseg\n",
                                 i - 1 << 2);
                        lui->i.imm = kunseg(lui->i.imm << 16) >> 16;
@@ -796,13 +786,11 @@ static void lightrec_patch_known_zero(struct opcode *op,
        case OP_ANDI:
        case OP_ORI:
        case OP_XORI:
-       case OP_META_MOV:
-       case OP_META_EXTC:
-       case OP_META_EXTS:
        case OP_META_MULT2:
        case OP_META_MULTU2:
-               if (is_known_zero(v, op->i.rs))
-                       op->i.rs = 0;
+       case OP_META:
+               if (is_known_zero(v, op->m.rs))
+                       op->m.rs = 0;
                break;
        case OP_SB:
        case OP_SH:
@@ -842,9 +830,14 @@ static void lightrec_reset_syncs(struct block *block)
        for (i = 0; i < block->nb_ops; i++) {
                op = &list[i];
 
-               if (op_flag_local_branch(op->flags) && has_delay_slot(op->c)) {
-                       offset = i + 1 + (s16)op->i.imm;
-                       list[offset].flags |= LIGHTREC_SYNC;
+               if (has_delay_slot(op->c)) {
+                       if (op_flag_local_branch(op->flags)) {
+                               offset = i + 1 - op_flag_no_ds(op->flags) + (s16)op->i.imm;
+                               list[offset].flags |= LIGHTREC_SYNC;
+                       }
+
+                       if (op_flag_emulate_branch(op->flags) && i + 2 < block->nb_ops)
+                               list[i + 2].flags |= LIGHTREC_SYNC;
                }
        }
 }
@@ -860,7 +853,7 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl
        for (i = 0; i < block->nb_ops; i++) {
                op = &list[i];
 
-               lightrec_consts_propagate(list, i, v);
+               lightrec_consts_propagate(block, i, v);
 
                lightrec_patch_known_zero(op, v);
 
@@ -963,8 +956,9 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl
                case OP_ADDIU:
                        if (op->i.imm == 0) {
                                pr_debug("Convert ORI/ADDI/ADDIU #0 to MOV\n");
-                               op->i.op = OP_META_MOV;
-                               op->r.rd = op->i.rt;
+                               op->m.rd = op->i.rt;
+                               op->m.op = OP_META_MOV;
+                               op->i.op = OP_META;
                        }
                        break;
                case OP_ANDI:
@@ -974,8 +968,9 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl
                                if (op->i.rs == op->i.rt) {
                                        op->opcode = 0;
                                } else {
-                                       op->i.op = OP_META_MOV;
-                                       op->r.rd = op->i.rt;
+                                       op->m.rd = op->i.rt;
+                                       op->m.op = OP_META_MOV;
+                                       op->i.op = OP_META;
                                }
                        }
                        break;
@@ -1023,8 +1018,9 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl
                        case OP_SPECIAL_SRA:
                                if (op->r.imm == 0) {
                                        pr_debug("Convert SRA #0 to MOV\n");
-                                       op->i.op = OP_META_MOV;
-                                       op->r.rs = op->r.rt;
+                                       op->m.rs = op->r.rt;
+                                       op->m.op = OP_META_MOV;
+                                       op->i.op = OP_META;
                                        break;
                                }
                                break;
@@ -1041,8 +1037,9 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl
                        case OP_SPECIAL_SLL:
                                if (op->r.imm == 0) {
                                        pr_debug("Convert SLL #0 to MOV\n");
-                                       op->i.op = OP_META_MOV;
-                                       op->r.rs = op->r.rt;
+                                       op->m.rs = op->r.rt;
+                                       op->m.op = OP_META_MOV;
+                                       op->i.op = OP_META;
                                }
 
                                lightrec_optimize_sll_sra(block->opcode_list, i, v);
@@ -1060,8 +1057,9 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl
                        case OP_SPECIAL_SRL:
                                if (op->r.imm == 0) {
                                        pr_debug("Convert SRL #0 to MOV\n");
-                                       op->i.op = OP_META_MOV;
-                                       op->r.rs = op->r.rt;
+                                       op->m.rs = op->r.rt;
+                                       op->m.op = OP_META_MOV;
+                                       op->i.op = OP_META;
                                }
                                break;
 
@@ -1087,20 +1085,31 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl
 
                                op->r.op = ctz32(v[op->r.rt].value);
                                break;
+                       case OP_SPECIAL_NOR:
+                               if (op->r.rs == 0 || op->r.rt == 0) {
+                                       pr_debug("Convert NOR $zero to COM\n");
+                                       op->i.op = OP_META;
+                                       op->m.op = OP_META_COM;
+                                       if (!op->m.rs)
+                                               op->m.rs = op->r.rt;
+                               }
+                               break;
                        case OP_SPECIAL_OR:
                        case OP_SPECIAL_ADD:
                        case OP_SPECIAL_ADDU:
                                if (op->r.rs == 0) {
                                        pr_debug("Convert OR/ADD $zero to MOV\n");
-                                       op->i.op = OP_META_MOV;
-                                       op->r.rs = op->r.rt;
+                                       op->m.rs = op->r.rt;
+                                       op->m.op = OP_META_MOV;
+                                       op->i.op = OP_META;
                                }
                                fallthrough;
                        case OP_SPECIAL_SUB:
                        case OP_SPECIAL_SUBU:
                                if (op->r.rt == 0) {
                                        pr_debug("Convert OR/ADD/SUB $zero to MOV\n");
-                                       op->i.op = OP_META_MOV;
+                                       op->m.op = OP_META_MOV;
+                                       op->i.op = OP_META;
                                }
                                fallthrough;
                        default:
@@ -1197,6 +1206,9 @@ static int lightrec_switch_delay_slots(struct lightrec_state *state, struct bloc
                if (op_flag_sync(next->flags))
                        continue;
 
+               if (op_flag_load_delay(next->flags) && opcode_is_load(next_op))
+                       continue;
+
                if (!lightrec_can_switch_delay_slot(list->c, next_op))
                        continue;
 
@@ -1214,52 +1226,20 @@ static int lightrec_switch_delay_slots(struct lightrec_state *state, struct bloc
        return 0;
 }
 
-static int shrink_opcode_list(struct lightrec_state *state, struct block *block, u16 new_size)
-{
-       struct opcode_list *list, *old_list;
-
-       if (new_size >= block->nb_ops) {
-               pr_err("Invalid shrink size (%u vs %u)\n",
-                      new_size, block->nb_ops);
-               return -EINVAL;
-       }
-
-       list = lightrec_malloc(state, MEM_FOR_IR,
-                              sizeof(*list) + sizeof(struct opcode) * new_size);
-       if (!list) {
-               pr_err("Unable to allocate memory\n");
-               return -ENOMEM;
-       }
-
-       old_list = container_of(block->opcode_list, struct opcode_list, ops);
-       memcpy(list->ops, old_list->ops, sizeof(struct opcode) * new_size);
-
-       lightrec_free_opcode_list(state, block->opcode_list);
-       list->nb_ops = new_size;
-       block->nb_ops = new_size;
-       block->opcode_list = list->ops;
-
-       pr_debug("Shrunk opcode list of block PC 0x%08x to %u opcodes\n",
-                block->pc, new_size);
-
-       return 0;
-}
-
 static int lightrec_detect_impossible_branches(struct lightrec_state *state,
                                               struct block *block)
 {
        struct opcode *op, *list = block->opcode_list, *next = &list[0];
        unsigned int i;
        int ret = 0;
-       s16 offset;
 
        for (i = 0; i < block->nb_ops - 1; i++) {
                op = next;
                next = &list[i + 1];
 
                if (!has_delay_slot(op->c) ||
-                   (!load_in_delay_slot(next->c) &&
-                    !has_delay_slot(next->c) &&
+                   (!has_delay_slot(next->c) &&
+                    !opcode_is_mfc(next->c) &&
                     !(next->i.op == OP_CP0 && next->r.rs == OP_CP0_RFE)))
                        continue;
 
@@ -1270,40 +1250,120 @@ static int lightrec_detect_impossible_branches(struct lightrec_state *state,
                        continue;
                }
 
-               offset = i + 1 + (s16)op->i.imm;
-               if (load_in_delay_slot(next->c) &&
-                   (offset >= 0 && offset < block->nb_ops) &&
-                   !opcode_reads_register(list[offset].c, next->c.i.rt)) {
-                       /* The 'impossible' branch is a local branch - we can
-                        * verify here that the first opcode of the target does
-                        * not use the target register of the delay slot */
-
-                       pr_debug("Branch at offset 0x%x has load delay slot, "
-                                "but is local and dest opcode does not read "
-                                "dest register\n", i << 2);
+               op->flags |= LIGHTREC_EMULATE_BRANCH;
+
+               if (OPT_LOCAL_BRANCHES && i + 2 < block->nb_ops) {
+                       /* The interpreter will only emulate the branch, then
+                        * return to the compiled code. Add a SYNC after the
+                        * branch + delay slot in the case where the branch
+                        * was not taken. */
+                       list[i + 2].flags |= LIGHTREC_SYNC;
+               }
+       }
+
+       return ret;
+}
+
+static bool is_local_branch(const struct block *block, unsigned int idx)
+{
+       const struct opcode *op = &block->opcode_list[idx];
+       s32 offset;
+
+       switch (op->c.i.op) {
+       case OP_BEQ:
+       case OP_BNE:
+       case OP_BLEZ:
+       case OP_BGTZ:
+       case OP_REGIMM:
+               offset = idx + 1 + (s16)op->c.i.imm;
+               if (offset >= 0 && offset < block->nb_ops)
+                       return true;
+               fallthrough;
+       default:
+               return false;
+       }
+}
+
+static int lightrec_handle_load_delays(struct lightrec_state *state,
+                                      struct block *block)
+{
+       struct opcode *op, *list = block->opcode_list;
+       unsigned int i;
+       s16 imm;
+
+       for (i = 0; i < block->nb_ops; i++) {
+               op = &list[i];
+
+               if (!opcode_is_load(op->c) || !op->c.i.rt || op->c.i.op == OP_LWC2)
+                       continue;
+
+               if (!is_delay_slot(list, i)) {
+                       /* Only handle load delays in delay slots.
+                        * PSX games never abused load delay slots otherwise. */
                        continue;
                }
 
-               op->flags |= LIGHTREC_EMULATE_BRANCH;
+               if (is_local_branch(block, i - 1)) {
+                       imm = (s16)list[i - 1].c.i.imm;
 
-               if (op == list) {
-                       pr_debug("First opcode of block PC 0x%08x is an impossible branch\n",
-                                block->pc);
+                       if (!opcode_reads_register(list[i + imm].c, op->c.i.rt)) {
+                               /* The target opcode of the branch is inside
+                                * the block, and it does not read the register
+                                * written to by the load opcode; we can ignore
+                                * the load delay. */
+                               continue;
+                       }
+               }
 
-                       /* If the first opcode is an 'impossible' branch, we
-                        * only keep the first two opcodes of the block (the
-                        * branch itself + its delay slot) */
-                       if (block->nb_ops > 2)
-                               ret = shrink_opcode_list(state, block, 2);
-                       break;
+               op->flags |= LIGHTREC_LOAD_DELAY;
+       }
+
+       return 0;
+}
+
+static int lightrec_swap_load_delays(struct lightrec_state *state,
+                                    struct block *block)
+{
+       unsigned int i;
+       union code c, next;
+       bool in_ds = false, skip_next = false;
+       struct opcode op;
+
+       if (block->nb_ops < 2)
+               return 0;
+
+       for (i = 0; i < block->nb_ops - 2; i++) {
+               c = block->opcode_list[i].c;
+
+               if (skip_next) {
+                       skip_next = false;
+               } else if (!in_ds && opcode_is_load(c) && c.i.op != OP_LWC2) {
+                       next = block->opcode_list[i + 1].c;
+
+                       if (c.i.op == OP_LWL && next.i.op == OP_LWR)
+                               continue;
+
+                       if (opcode_reads_register(next, c.i.rt)
+                           && !opcode_writes_register(next, c.i.rs)) {
+                               pr_debug("Swapping opcodes at offset 0x%x to "
+                                        "respect load delay\n", i << 2);
+
+                               op = block->opcode_list[i];
+                               block->opcode_list[i] = block->opcode_list[i + 1];
+                               block->opcode_list[i + 1] = op;
+                               skip_next = true;
+                       }
                }
+
+               in_ds = has_delay_slot(c);
        }
 
-       return ret;
+       return 0;
 }
 
 static int lightrec_local_branches(struct lightrec_state *state, struct block *block)
 {
+       const struct opcode *ds;
        struct opcode *list;
        unsigned int i;
        s32 offset;
@@ -1311,25 +1371,19 @@ static int lightrec_local_branches(struct lightrec_state *state, struct block *b
        for (i = 0; i < block->nb_ops; i++) {
                list = &block->opcode_list[i];
 
-               if (should_emulate(list))
+               if (should_emulate(list) || !is_local_branch(block, i))
                        continue;
 
-               switch (list->i.op) {
-               case OP_BEQ:
-               case OP_BNE:
-               case OP_BLEZ:
-               case OP_BGTZ:
-               case OP_REGIMM:
-                       offset = i + 1 + (s16)list->i.imm;
-                       if (offset >= 0 && offset < block->nb_ops)
-                               break;
-                       fallthrough;
-               default:
-                       continue;
-               }
+               offset = i + 1 + (s16)list->c.i.imm;
 
                pr_debug("Found local branch to offset 0x%x\n", offset << 2);
 
+               ds = get_delay_slot(block->opcode_list, i);
+               if (op_flag_load_delay(ds->flags) && opcode_is_load(ds->c)) {
+                       pr_debug("Branch delay slot has a load delay - skip\n");
+                       continue;
+               }
+
                if (should_emulate(&block->opcode_list[offset])) {
                        pr_debug("Branch target must be emulated - skip\n");
                        continue;
@@ -1388,7 +1442,7 @@ static bool op_writes_rd(union code c)
 {
        switch (c.i.op) {
        case OP_SPECIAL:
-       case OP_META_MOV:
+       case OP_META:
                return true;
        default:
                return false;
@@ -1447,7 +1501,7 @@ static int lightrec_early_unload(struct lightrec_state *state, struct block *blo
        struct opcode *op;
        s16 last_r[34], last_w[34], last_sync = 0, next_sync = 0;
        u64 mask_r, mask_w, dirty = 0, loaded = 0;
-       u8 reg;
+       u8 reg, load_delay_reg = 0;
 
        memset(last_r, 0xff, sizeof(last_r));
        memset(last_w, 0xff, sizeof(last_w));
@@ -1468,6 +1522,13 @@ static int lightrec_early_unload(struct lightrec_state *state, struct block *blo
        for (i = 0; i < block->nb_ops; i++) {
                op = &block->opcode_list[i];
 
+               if (OPT_HANDLE_LOAD_DELAYS && load_delay_reg) {
+                       /* Handle delayed register write from load opcodes in
+                        * delay slots */
+                       last_w[load_delay_reg] = i;
+                       load_delay_reg = 0;
+               }
+
                if (op_flag_sync(op->flags) || should_emulate(op)) {
                        /* The next opcode has the SYNC flag set, or is a branch
                         * that should be emulated: unload all registers. */
@@ -1489,6 +1550,15 @@ static int lightrec_early_unload(struct lightrec_state *state, struct block *blo
                mask_r = opcode_read_mask(op->c);
                mask_w = opcode_write_mask(op->c);
 
+               if (op_flag_load_delay(op->flags) && opcode_is_load(op->c)) {
+                       /* If we have a load opcode in a delay slot, its target
+                        * register is actually not written there but at a
+                        * later point, in the dispatcher. Prevent the algorithm
+                        * from discarding its previous value. */
+                       load_delay_reg = op->c.i.rt;
+                       mask_w &= ~BIT(op->c.i.rt);
+               }
+
                for (reg = 0; reg < 34; reg++) {
                        if (mask_r & BIT(reg)) {
                                if (dirty & BIT(reg) && last_w[reg] < last_sync) {
@@ -1553,37 +1623,32 @@ static int lightrec_flag_io(struct lightrec_state *state, struct block *block)
        for (i = 0; i < block->nb_ops; i++) {
                list = &block->opcode_list[i];
 
-               lightrec_consts_propagate(block->opcode_list, i, v);
+               lightrec_consts_propagate(block, i, v);
 
                switch (list->i.op) {
                case OP_SB:
                case OP_SH:
                case OP_SW:
-                       if (OPT_FLAG_STORES) {
-                               /* Mark all store operations that target $sp or $gp
-                                * as not requiring code invalidation. This is based
-                                * on the heuristic that stores using one of these
-                                * registers as address will never hit a code page. */
-                               if (list->i.rs >= 28 && list->i.rs <= 29 &&
-                                   !state->maps[PSX_MAP_KERNEL_USER_RAM].ops) {
-                                       pr_debug("Flaging opcode 0x%08x as not "
-                                                "requiring invalidation\n",
-                                                list->opcode);
-                                       list->flags |= LIGHTREC_NO_INVALIDATE;
-                                       list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_DIRECT);
-                               }
+                       /* Mark all store operations that target $sp or $gp
+                        * as not requiring code invalidation. This is based
+                        * on the heuristic that stores using one of these
+                        * registers as address will never hit a code page. */
+                       if (list->i.rs >= 28 && list->i.rs <= 29 &&
+                           !state->maps[PSX_MAP_KERNEL_USER_RAM].ops) {
+                               pr_debug("Flaging opcode 0x%08x as not requiring invalidation\n",
+                                        list->opcode);
+                               list->flags |= LIGHTREC_NO_INVALIDATE;
+                       }
 
-                               /* Detect writes whose destination address is inside the
-                                * current block, using constant propagation. When these
-                                * occur, we mark the blocks as not compilable. */
-                               if (is_known(v, list->i.rs) &&
-                                   kunseg(v[list->i.rs].value) >= kunseg(block->pc) &&
-                                   kunseg(v[list->i.rs].value) < (kunseg(block->pc) +
-                                                                  block->nb_ops * 4)) {
-                                       pr_debug("Self-modifying block detected\n");
-                                       block_set_flags(block, BLOCK_NEVER_COMPILE);
-                                       list->flags |= LIGHTREC_SMC;
-                               }
+                       /* Detect writes whose destination address is inside the
+                        * current block, using constant propagation. When these
+                        * occur, we mark the blocks as not compilable. */
+                       if (is_known(v, list->i.rs) &&
+                           kunseg(v[list->i.rs].value) >= kunseg(block->pc) &&
+                           kunseg(v[list->i.rs].value) < (kunseg(block->pc) + block->nb_ops * 4)) {
+                               pr_debug("Self-modifying block detected\n");
+                               block_set_flags(block, BLOCK_NEVER_COMPILE);
+                               list->flags |= LIGHTREC_SMC;
                        }
                        fallthrough;
                case OP_SWL:
@@ -1597,8 +1662,7 @@ static int lightrec_flag_io(struct lightrec_state *state, struct block *block)
                case OP_LWL:
                case OP_LWR:
                case OP_LWC2:
-                       if (OPT_FLAG_IO &&
-                           (v[list->i.rs].known | v[list->i.rs].sign)) {
+                       if (v[list->i.rs].known | v[list->i.rs].sign) {
                                psx_map = lightrec_get_constprop_map(state, v,
                                                                     list->i.rs,
                                                                     (s16) list->i.imm);
@@ -1664,6 +1728,16 @@ static int lightrec_flag_io(struct lightrec_state *state, struct block *block)
                                        break;
                                }
                        }
+
+                       if (!LIGHTREC_FLAGS_GET_IO_MODE(list->flags)
+                           && list->i.rs >= 28 && list->i.rs <= 29
+                           && !state->maps[PSX_MAP_KERNEL_USER_RAM].ops) {
+                               /* Assume that all I/O operations that target
+                                * $sp or $gp will always only target a mapped
+                                * memory (RAM, BIOS, scratchpad). */
+                               list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_DIRECT);
+                       }
+
                        fallthrough;
                default:
                        break;
@@ -1862,7 +1936,7 @@ static int lightrec_flag_mults_divs(struct lightrec_state *state, struct block *
        for (i = 0; i < block->nb_ops - 1; i++) {
                list = &block->opcode_list[i];
 
-               lightrec_consts_propagate(block->opcode_list, i, v);
+               lightrec_consts_propagate(block, i, v);
 
                switch (list->i.op) {
                case OP_SPECIAL:
@@ -2079,11 +2153,13 @@ static int (*lightrec_optimizers[])(struct lightrec_state *state, struct block *
        IF_OPT(OPT_REMOVE_DIV_BY_ZERO_SEQ, &lightrec_remove_div_by_zero_check_sequence),
        IF_OPT(OPT_REPLACE_MEMSET, &lightrec_replace_memset),
        IF_OPT(OPT_DETECT_IMPOSSIBLE_BRANCHES, &lightrec_detect_impossible_branches),
+       IF_OPT(OPT_HANDLE_LOAD_DELAYS, &lightrec_handle_load_delays),
+       IF_OPT(OPT_HANDLE_LOAD_DELAYS, &lightrec_swap_load_delays),
        IF_OPT(OPT_TRANSFORM_OPS, &lightrec_transform_branches),
        IF_OPT(OPT_LOCAL_BRANCHES, &lightrec_local_branches),
        IF_OPT(OPT_TRANSFORM_OPS, &lightrec_transform_ops),
        IF_OPT(OPT_SWITCH_DELAY_SLOTS, &lightrec_switch_delay_slots),
-       IF_OPT(OPT_FLAG_IO || OPT_FLAG_STORES, &lightrec_flag_io),
+       IF_OPT(OPT_FLAG_IO, &lightrec_flag_io),
        IF_OPT(OPT_FLAG_MULT_DIV, &lightrec_flag_mults_divs),
        IF_OPT(OPT_EARLY_UNLOAD, &lightrec_early_unload),
 };
index 825042d..f2b1f30 100644 (file)
 struct block;
 struct opcode;
 
-_Bool opcode_reads_register(union code op, u8 reg);
-_Bool opcode_writes_register(union code op, u8 reg);
-_Bool has_delay_slot(union code op);
+__cnst _Bool opcode_reads_register(union code op, u8 reg);
+__cnst _Bool opcode_writes_register(union code op, u8 reg);
+__cnst u64 opcode_write_mask(union code op);
+__cnst _Bool has_delay_slot(union code op);
 _Bool is_delay_slot(const struct opcode *list, unsigned int offset);
-_Bool load_in_delay_slot(union code op);
-_Bool opcode_is_io(union code op);
-_Bool is_unconditional_jump(union code c);
-_Bool is_syscall(union code c);
+__cnst _Bool opcode_is_mfc(union code op);
+__cnst _Bool opcode_is_load(union code op);
+__cnst _Bool opcode_is_io(union code op);
+__cnst _Bool is_unconditional_jump(union code c);
+__cnst _Bool is_syscall(union code c);
 
 _Bool should_emulate(const struct opcode *op);
 
index c62ba3d..2a7ffe9 100644 (file)
@@ -49,6 +49,10 @@ static const char * mips_regs[] = {
        "lo", "hi",
 };
 
+/* Forward declaration(s) */
+static void clean_reg(jit_state_t *_jit,
+                     struct native_register *nreg, u8 jit_reg, bool clean);
+
 const char * lightrec_reg_name(u8 reg)
 {
        return mips_regs[reg];
@@ -219,14 +223,7 @@ static void lightrec_discard_nreg(struct native_register *nreg)
 static void lightrec_unload_nreg(struct regcache *cache, jit_state_t *_jit,
                struct native_register *nreg, u8 jit_reg)
 {
-       /* If we get a dirty register, store back the old value */
-       if (nreg->prio == REG_IS_DIRTY) {
-               s16 offset = offsetof(struct lightrec_state, regs.gpr)
-                       + (nreg->emulated_register << 2);
-
-               jit_stxi_i(offset, LIGHTREC_REG_STATE, jit_reg);
-       }
-
+       clean_reg(_jit, nreg, jit_reg, false);
        lightrec_discard_nreg(nreg);
 }
 
@@ -519,6 +516,7 @@ void lightrec_free_regs(struct regcache *cache)
 static void clean_reg(jit_state_t *_jit,
                struct native_register *nreg, u8 jit_reg, bool clean)
 {
+       /* If we get a dirty register, store back the old value */
        if (nreg->prio == REG_IS_DIRTY) {
                s16 offset = offsetof(struct lightrec_state, regs.gpr)
                        + (nreg->emulated_register << 2);
@@ -579,6 +577,11 @@ void lightrec_clean_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg)
        }
 }
 
+bool lightrec_reg_is_loaded(struct regcache *cache, u16 reg)
+{
+       return !!find_mapped_reg(cache, reg, false);
+}
+
 void lightrec_clean_reg_if_loaded(struct regcache *cache, jit_state_t *_jit,
                                  u16 reg, bool unload)
 {
index d242c54..55f1cfd 100644 (file)
@@ -8,8 +8,13 @@
 
 #include "lightning-wrapper.h"
 
-#define NUM_REGS (JIT_V_NUM - 1)
-#define LIGHTREC_REG_STATE (JIT_V(JIT_V_NUM - 1))
+#if defined(__sh__)
+#  define NUM_REGS JIT_V_NUM
+#  define LIGHTREC_REG_STATE _GBR
+#else
+#  define NUM_REGS (JIT_V_NUM - 1)
+#  define LIGHTREC_REG_STATE (JIT_V(JIT_V_NUM - 1))
+#endif
 
 #if defined(__powerpc__)
 #  define NUM_TEMPS JIT_R_NUM
@@ -68,6 +73,7 @@ void lightrec_unload_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg);
 void lightrec_storeback_regs(struct regcache *cache, jit_state_t *_jit);
 _Bool lightrec_has_dirty_regs(struct regcache *cache);
 
+_Bool lightrec_reg_is_loaded(struct regcache *cache, u16 reg);
 void lightrec_clean_reg_if_loaded(struct regcache *cache, jit_state_t *_jit,
                                  u16 reg, _Bool unload);
 void lightrec_discard_reg_if_loaded(struct regcache *cache, u16 reg);
@@ -82,7 +88,7 @@ void lightrec_regcache_leave_branch(struct regcache *cache,
 struct regcache * lightrec_regcache_init(struct lightrec_state *state);
 void lightrec_free_regcache(struct regcache *cache);
 
-const char * lightrec_reg_name(u8 reg);
+__cnst const char * lightrec_reg_name(u8 reg);
 
 void lightrec_regcache_mark_live(struct regcache *cache, jit_state_t *_jit);