From cb72ea130a5ef1b2f47691ed586ad48bb0c39269 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Sun, 9 Jul 2023 13:56:01 +0200 Subject: [PATCH] git subrepo pull --force deps/lightrec subrepo: subdir: "deps/lightrec" merged: "fcf239e7e9" upstream: origin: "https://github.com/pcercuei/lightrec.git" branch: "master" commit: "fcf239e7e9" git-subrepo: version: "0.4.3" origin: "https://github.com/ingydotnet/git-subrepo.git" commit: "2f68596" --- deps/lightrec/.gitrepo | 4 +- deps/lightrec/CMakeLists.txt | 4 +- deps/lightrec/README.md | 7 +- deps/lightrec/constprop.c | 83 +++-- deps/lightrec/constprop.h | 4 +- deps/lightrec/disassembler.c | 25 +- deps/lightrec/disassembler.h | 47 ++- deps/lightrec/emitter.c | 273 +++++++++++---- deps/lightrec/emitter.h | 4 +- deps/lightrec/interpreter.c | 149 ++++++-- deps/lightrec/interpreter.h | 2 + deps/lightrec/lightning-wrapper.h | 10 + deps/lightrec/lightrec-config.h.cmakein | 2 +- deps/lightrec/lightrec-private.h | 23 +- deps/lightrec/lightrec.c | 241 ++++++++++--- deps/lightrec/lightrec.h | 18 +- deps/lightrec/memmanager.c | 2 +- deps/lightrec/optimizer.c | 444 ++++++++++++++---------- deps/lightrec/optimizer.h | 16 +- deps/lightrec/regcache.c | 19 +- deps/lightrec/regcache.h | 12 +- 21 files changed, 962 insertions(+), 427 deletions(-) diff --git a/deps/lightrec/.gitrepo b/deps/lightrec/.gitrepo index 6e8794f0..8a344c4f 100644 --- a/deps/lightrec/.gitrepo +++ b/deps/lightrec/.gitrepo @@ -6,7 +6,7 @@ [subrepo] remote = https://github.com/pcercuei/lightrec.git branch = master - commit = 3ff589bcb7d52b3a091fe0b922ba02a0b1a7f095 - parent = aced3eb3fcaa0fe13c44c4dd196cdab42555fd98 + commit = fcf239e7e9d42fedb7a8de64057d6895acf3ceee + parent = 03ec8a8c606eb87642be336632e1792ab89650d8 method = merge cmdver = 0.4.3 diff --git a/deps/lightrec/CMakeLists.txt b/deps/lightrec/CMakeLists.txt index 12da14ea..9518a9ab 100644 --- a/deps/lightrec/CMakeLists.txt +++ b/deps/lightrec/CMakeLists.txt @@ -66,11 +66,11 @@ endif (ENABLE_THREADED_COMPILER) option(OPT_REMOVE_DIV_BY_ZERO_SEQ "(optimization) Remove div-by-zero check sequence" ON) option(OPT_REPLACE_MEMSET "(optimization) Detect and replace memset with host variant" ON) option(OPT_DETECT_IMPOSSIBLE_BRANCHES "(optimization) Detect impossible branches" ON) +option(OPT_HANDLE_LOAD_DELAYS "(optimization) Detect load delays" ON) option(OPT_TRANSFORM_OPS "(optimization) Transform opcodes" ON) option(OPT_LOCAL_BRANCHES "(optimization) Detect local branches" ON) option(OPT_SWITCH_DELAY_SLOTS "(optimization) Switch delay slots" ON) -option(OPT_FLAG_STORES "(optimization) Flag stores that don't require invalidation" ON) -option(OPT_FLAG_IO "(optimization) Flag I/O opcodes whose target is known" ON) +option(OPT_FLAG_IO "(optimization) Flag I/O opcodes when the target can be detected" ON) option(OPT_FLAG_MULT_DIV "(optimization) Flag MULT/DIV that only use one of HI/LO" ON) option(OPT_EARLY_UNLOAD "(optimization) Unload registers early" ON) diff --git a/deps/lightrec/README.md b/deps/lightrec/README.md index ab2c13b5..449e06c1 100644 --- a/deps/lightrec/README.md +++ b/deps/lightrec/README.md @@ -17,8 +17,7 @@ a form of Intermediate Representation (IR). Basically, just a single-linked list of structures representing the instructions. On that list, several optimization steps are performed: instructions are modified, reordered, tagged; new meta-instructions -can be added, for instance to tell the code generator that a certain -register won't be used anymore. +can also be added. * __Lazy compilation__. If Lightrec detects a block of code that would be very hard to @@ -46,10 +45,12 @@ typically happens when a lot of new code is run. Lightrec has been ported to the following emulators: -* [__PCSX-ReArmed__ (my own fork)](https://github.com/pcercuei/pcsx_rearmed) +* [__PCSX-ReArmed__ (libretro)](https://github.com/libretro/pcsx_rearmed) * [__pcsx4all__ (my own fork)](https://github.com/pcercuei/pcsx4all) * [__Beetle__ (libretro)](https://github.com/libretro/beetle-psx-libretro/) +* [__CubeSX/WiiSX__](https://github.com/emukidid/pcsxgc/) + [![Star History Chart](https://api.star-history.com/svg?repos=pcercuei/lightrec&type=Date)](https://star-history.com/#pcercuei/lightrec&Date) diff --git a/deps/lightrec/constprop.c b/deps/lightrec/constprop.c index 353f42f1..8499c6ec 100644 --- a/deps/lightrec/constprop.c +++ b/deps/lightrec/constprop.c @@ -243,12 +243,13 @@ static void lightrec_propagate_slt(u32 rs, u32 rd, bool is_signed, } } -void lightrec_consts_propagate(const struct opcode *list, +void lightrec_consts_propagate(const struct block *block, unsigned int idx, struct constprop_data *v) { + const struct opcode *list = block->opcode_list; union code c; - u32 imm; + u32 imm, flags; if (idx == 0) return; @@ -263,8 +264,13 @@ void lightrec_consts_propagate(const struct opcode *list, return; } - if (idx > 1 && !op_flag_sync(list[idx - 1].flags)) { - c = list[idx - 2].c; + flags = list[idx - 1].flags; + + if (idx > 1 && !op_flag_sync(flags)) { + if (op_flag_no_ds(flags)) + c = list[idx - 1].c; + else + c = list[idx - 2].c; switch (c.i.op) { case OP_BNE: @@ -449,6 +455,13 @@ void lightrec_consts_propagate(const struct opcode *list, v[c.r.rd].known = 0; v[c.r.rd].sign = 0; break; + + case OP_SPECIAL_JALR: + v[c.r.rd].known = 0xffffffff; + v[c.r.rd].sign = 0; + v[c.r.rd].value = block->pc + (idx + 2 << 2); + break; + default: break; } @@ -644,7 +657,7 @@ void lightrec_consts_propagate(const struct opcode *list, imm = imm ? GENMASK(31, 32 - imm) : 0; v[c.i.rt].sign = 0; } - v[c.i.rt].known &= ~imm; + v[c.i.rt].known &= imm; break; } fallthrough; @@ -652,30 +665,48 @@ void lightrec_consts_propagate(const struct opcode *list, v[c.i.rt].known = 0; v[c.i.rt].sign = 0; break; - case OP_META_MOV: - v[c.r.rd] = v[c.r.rs]; - break; - case OP_META_EXTC: - v[c.i.rt].value = (s32)(s8)v[c.i.rs].value; - if (v[c.i.rs].known & BIT(7)) { - v[c.i.rt].known = v[c.i.rs].known | 0xffffff00; - v[c.i.rt].sign = 0; - } else { - v[c.i.rt].known = v[c.i.rs].known & 0x7f; - v[c.i.rt].sign = 0xffffff80; - } - break; + case OP_META: + switch (c.m.op) { + case OP_META_MOV: + v[c.m.rd] = v[c.m.rs]; + break; - case OP_META_EXTS: - v[c.i.rt].value = (s32)(s16)v[c.i.rs].value; - if (v[c.i.rs].known & BIT(15)) { - v[c.i.rt].known = v[c.i.rs].known | 0xffff0000; - v[c.i.rt].sign = 0; - } else { - v[c.i.rt].known = v[c.i.rs].known & 0x7fff; - v[c.i.rt].sign = 0xffff8000; + case OP_META_EXTC: + v[c.m.rd].value = (s32)(s8)v[c.m.rs].value; + if (v[c.m.rs].known & BIT(7)) { + v[c.m.rd].known = v[c.m.rs].known | 0xffffff00; + v[c.m.rd].sign = 0; + } else { + v[c.m.rd].known = v[c.m.rs].known & 0x7f; + v[c.m.rd].sign = 0xffffff80; + } + break; + + case OP_META_EXTS: + v[c.m.rd].value = (s32)(s16)v[c.m.rs].value; + if (v[c.m.rs].known & BIT(15)) { + v[c.m.rd].known = v[c.m.rs].known | 0xffff0000; + v[c.m.rd].sign = 0; + } else { + v[c.m.rd].known = v[c.m.rs].known & 0x7fff; + v[c.m.rd].sign = 0xffff8000; + } + break; + + case OP_META_COM: + v[c.m.rd].known = v[c.m.rs].known; + v[c.m.rd].value = ~v[c.m.rs].value; + v[c.m.rd].sign = v[c.m.rs].sign; + break; + default: + break; } break; + case OP_JAL: + v[31].known = 0xffffffff; + v[31].sign = 0; + v[31].value = block->pc + (idx + 2 << 2); + break; default: break; diff --git a/deps/lightrec/constprop.h b/deps/lightrec/constprop.h index cebf0b38..9f9ecc3c 100644 --- a/deps/lightrec/constprop.h +++ b/deps/lightrec/constprop.h @@ -10,7 +10,7 @@ #define LIGHTREC_CONSTPROP_INITIALIZER { { 0, 0xffffffff, 0 }, } -struct opcode; +struct block; struct constprop_data { u32 value; @@ -34,7 +34,7 @@ static inline _Bool is_known_zero(const struct constprop_data *v, u8 reg) return bits_are_known_zero(v, reg, 0xffffffff); } -void lightrec_consts_propagate(const struct opcode *list, +void lightrec_consts_propagate(const struct block *block, unsigned int idx, struct constprop_data *v); diff --git a/deps/lightrec/disassembler.c b/deps/lightrec/disassembler.c index bef95948..f687d28c 100644 --- a/deps/lightrec/disassembler.c +++ b/deps/lightrec/disassembler.c @@ -120,6 +120,13 @@ static const char * const cp2_opcodes[] = { [OP_CP2_NCCT] = "ncct ", }; +static const char * const meta_opcodes[] = { + [OP_META_MOV] = "move ", + [OP_META_EXTC] = "extc ", + [OP_META_EXTS] = "exts ", + [OP_META_COM] = "com ", +}; + static const char * const mult2_opcodes[] = { "mult2 ", "multu2 ", }; @@ -133,6 +140,7 @@ static const char * const opcode_io_flags[] = { "self-modifying code", "no invalidation", "no mask", + "load delay", }; static const char * const opcode_io_modes[] = { @@ -444,18 +452,11 @@ static int print_op(union code c, u32 pc, char *buf, size_t len, lightrec_reg_name(c.i.rt), (s16)c.i.imm, lightrec_reg_name(c.i.rs)); - case OP_META_MOV: - return snprintf(buf, len, "move %s,%s", - lightrec_reg_name(c.r.rd), - lightrec_reg_name(c.r.rs)); - case OP_META_EXTC: - return snprintf(buf, len, "extc %s,%s", - lightrec_reg_name(c.i.rt), - lightrec_reg_name(c.i.rs)); - case OP_META_EXTS: - return snprintf(buf, len, "exts %s,%s", - lightrec_reg_name(c.i.rt), - lightrec_reg_name(c.i.rs)); + case OP_META: + return snprintf(buf, len, "%s%s,%s", + meta_opcodes[c.m.op], + lightrec_reg_name(c.m.rd), + lightrec_reg_name(c.m.rs)); case OP_META_MULT2: case OP_META_MULTU2: *flags_ptr = opcode_multdiv_flags; diff --git a/deps/lightrec/disassembler.h b/deps/lightrec/disassembler.h index e4685a9d..9e39484c 100644 --- a/deps/lightrec/disassembler.h +++ b/deps/lightrec/disassembler.h @@ -24,9 +24,10 @@ #define LIGHTREC_SMC BIT(2) #define LIGHTREC_NO_INVALIDATE BIT(3) #define LIGHTREC_NO_MASK BIT(4) +#define LIGHTREC_LOAD_DELAY BIT(5) /* I/O mode for load/store opcodes */ -#define LIGHTREC_IO_MODE_LSB 5 +#define LIGHTREC_IO_MODE_LSB 6 #define LIGHTREC_IO_MODE(x) ((x) << LIGHTREC_IO_MODE_LSB) #define LIGHTREC_IO_UNKNOWN 0x0 #define LIGHTREC_IO_DIRECT 0x1 @@ -107,10 +108,7 @@ enum standard_opcodes { OP_LWC2 = 0x32, OP_SWC2 = 0x3a, - OP_META_MOV = 0x16, - - OP_META_EXTC = 0x17, - OP_META_EXTS = 0x18, + OP_META = 0x3b, OP_META_MULT2 = 0x19, OP_META_MULTU2 = 0x1a, @@ -195,6 +193,15 @@ enum cp2_basic_opcodes { OP_CP2_BASIC_CTC2 = 0x06, }; +enum meta_opcodes { + OP_META_MOV = 0x00, + + OP_META_EXTC = 0x01, + OP_META_EXTS = 0x02, + + OP_META_COM = 0x03, +}; + struct opcode_r { #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ u32 zero :6; @@ -237,12 +244,31 @@ struct opcode_j { #endif } __packed; +struct opcode_m { +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + u32 meta :6; + u32 rs :5; + u32 rt :5; + u32 rd :5; + u32 imm :6; + u32 op :5; +#else + u32 op :5; + u32 imm :6; + u32 rd :5; + u32 rt :5; + u32 rs :5; + u32 meta :6; +#endif +}; + union code { /* Keep in sync with struct opcode */ u32 opcode; struct opcode_r r; struct opcode_i i; struct opcode_j j; + struct opcode_m m; }; struct opcode { @@ -255,6 +281,7 @@ struct opcode { struct opcode_r r; struct opcode_i i; struct opcode_j j; + struct opcode_m m; }; u32 flags; }; @@ -278,13 +305,12 @@ static inline _Bool op_flag_sync(u32 flags) static inline _Bool op_flag_smc(u32 flags) { - return OPT_FLAG_STORES && (flags & LIGHTREC_SMC); + return OPT_FLAG_IO && (flags & LIGHTREC_SMC); } static inline _Bool op_flag_no_invalidate(u32 flags) { - return (OPT_FLAG_IO || OPT_FLAG_STORES) && - (flags & LIGHTREC_NO_INVALIDATE); + return OPT_FLAG_IO && (flags & LIGHTREC_NO_INVALIDATE); } static inline _Bool op_flag_no_mask(u32 flags) @@ -292,6 +318,11 @@ static inline _Bool op_flag_no_mask(u32 flags) return OPT_FLAG_IO && (flags & LIGHTREC_NO_MASK); } +static inline _Bool op_flag_load_delay(u32 flags) +{ + return OPT_HANDLE_LOAD_DELAYS && (flags & LIGHTREC_LOAD_DELAY); +} + static inline _Bool op_flag_emulate_branch(u32 flags) { return OPT_DETECT_IMPOSSIBLE_BRANCHES && diff --git a/deps/lightrec/emitter.c b/deps/lightrec/emitter.c index 14820e50..a6d43551 100644 --- a/deps/lightrec/emitter.c +++ b/deps/lightrec/emitter.c @@ -21,6 +21,7 @@ static void rec_SPECIAL(struct lightrec_cstate *state, const struct block *block static void rec_REGIMM(struct lightrec_cstate *state, const struct block *block, u16 offset); static void rec_CP0(struct lightrec_cstate *state, const struct block *block, u16 offset); static void rec_CP2(struct lightrec_cstate *state, const struct block *block, u16 offset); +static void rec_META(struct lightrec_cstate *state, const struct block *block, u16 offset); static void rec_cp2_do_mtc2(struct lightrec_cstate *state, const struct block *block, u16 offset, u8 reg, u8 in_reg); static void rec_cp2_do_mfc2(struct lightrec_cstate *state, @@ -35,12 +36,24 @@ static void unknown_opcode(struct lightrec_cstate *state, const struct block *bl } static void -lightrec_jump_to_eob(struct lightrec_cstate *state, jit_state_t *_jit) +lightrec_jump_to_fn(jit_state_t *_jit, void (*fn)(void)) { /* Prevent jit_jmpi() from using our cycles register as a temporary */ jit_live(LIGHTREC_REG_CYCLE); - jit_patch_abs(jit_jmpi(), state->state->eob_wrapper_func); + jit_patch_abs(jit_jmpi(), fn); +} + +static void +lightrec_jump_to_eob(struct lightrec_cstate *state, jit_state_t *_jit) +{ + lightrec_jump_to_fn(_jit, state->state->eob_wrapper_func); +} + +static void +lightrec_jump_to_ds_check(struct lightrec_cstate *state, jit_state_t *_jit) +{ + lightrec_jump_to_fn(_jit, state->state->ds_check_func); } static void update_ra_register(struct regcache *reg_cache, jit_state_t *_jit, @@ -61,7 +74,7 @@ static void lightrec_emit_end_of_block(struct lightrec_cstate *state, struct regcache *reg_cache = state->reg_cache; jit_state_t *_jit = block->_jit; const struct opcode *op = &block->opcode_list[offset], - *next = &block->opcode_list[offset + 1]; + *ds = get_delay_slot(block->opcode_list, offset); u32 cycles = state->cycles + lightrec_cycles_of_opcode(op->c); jit_note(__FILE__, __LINE__); @@ -83,10 +96,10 @@ static void lightrec_emit_end_of_block(struct lightrec_cstate *state, if (has_delay_slot(op->c) && !op_flag_no_ds(op->flags) && !op_flag_local_branch(op->flags)) { - cycles += lightrec_cycles_of_opcode(next->c); + cycles += lightrec_cycles_of_opcode(ds->c); /* Recompile the delay slot */ - if (next->c.opcode) + if (ds->c.opcode) lightrec_rec_opcode(state, block, offset + 1); } @@ -98,11 +111,41 @@ static void lightrec_emit_end_of_block(struct lightrec_cstate *state, pr_debug("EOB: %u cycles\n", cycles); } - lightrec_jump_to_eob(state, _jit); + if (op_flag_load_delay(ds->flags) + && opcode_is_load(ds->c) && !state->no_load_delay) { + /* If the delay slot is a load opcode, its target register + * will be written after the first opcode of the target is + * executed. Handle this by jumping to a special section of + * the dispatcher. It expects the loaded value to be in + * REG_TEMP, and the target register number to be in JIT_V1.*/ + jit_movi(JIT_V1, ds->c.i.rt); + + lightrec_jump_to_ds_check(state, _jit); + } else { + lightrec_jump_to_eob(state, _jit); + } } -void lightrec_emit_eob(struct lightrec_cstate *state, - const struct block *block, u16 offset) +void lightrec_emit_jump_to_interpreter(struct lightrec_cstate *state, + const struct block *block, u16 offset) +{ + struct regcache *reg_cache = state->reg_cache; + jit_state_t *_jit = block->_jit; + + lightrec_clean_regs(reg_cache, _jit); + + /* Call the interpreter with the block's address in JIT_V1 and the + * PC (which might have an offset) in JIT_V0. */ + lightrec_load_imm(reg_cache, _jit, JIT_V0, block->pc, + block->pc + (offset << 2)); + jit_movi(JIT_V1, (uintptr_t)block); + + jit_subi(LIGHTREC_REG_CYCLE, LIGHTREC_REG_CYCLE, state->cycles); + lightrec_jump_to_fn(_jit, state->state->interpreter_func); +} + +static void lightrec_emit_eob(struct lightrec_cstate *state, + const struct block *block, u16 offset) { struct regcache *reg_cache = state->reg_cache; jit_state_t *_jit = block->_jit; @@ -198,9 +241,9 @@ static void rec_b(struct lightrec_cstate *state, const struct block *block, u16 jit_state_t *_jit = block->_jit; struct lightrec_branch *branch; const struct opcode *op = &block->opcode_list[offset], - *next = &block->opcode_list[offset + 1]; + *ds = get_delay_slot(block->opcode_list, offset); jit_node_t *addr; - bool is_forward = (s16)op->i.imm >= -1; + bool is_forward = (s16)op->i.imm >= 0; int op_cycles = lightrec_cycles_of_opcode(op->c); u32 target_offset, cycles = state->cycles + op_cycles; bool no_indirection = false; @@ -210,7 +253,7 @@ static void rec_b(struct lightrec_cstate *state, const struct block *block, u16 jit_note(__FILE__, __LINE__); if (!op_flag_no_ds(op->flags)) - cycles += lightrec_cycles_of_opcode(next->c); + cycles += lightrec_cycles_of_opcode(ds->c); state->cycles = -op_cycles; @@ -224,7 +267,7 @@ static void rec_b(struct lightrec_cstate *state, const struct block *block, u16 lightrec_do_early_unload(state, block, offset); if (op_flag_local_branch(op->flags) && - (op_flag_no_ds(op->flags) || !next->opcode) && + (op_flag_no_ds(op->flags) || !ds->opcode) && is_forward && !lightrec_has_dirty_regs(reg_cache)) no_indirection = true; @@ -246,8 +289,11 @@ static void rec_b(struct lightrec_cstate *state, const struct block *block, u16 if (op_flag_local_branch(op->flags)) { /* Recompile the delay slot */ - if (!op_flag_no_ds(op->flags) && next->opcode) + if (!op_flag_no_ds(op->flags) && ds->opcode) { + /* Never handle load delays with local branches. */ + state->no_load_delay = true; lightrec_rec_opcode(state, block, offset + 1); + } if (link) update_ra_register(reg_cache, _jit, 31, block->pc, link); @@ -274,6 +320,7 @@ static void rec_b(struct lightrec_cstate *state, const struct block *block, u16 if (!op_flag_local_branch(op->flags) || !is_forward) { next_pc = get_branch_pc(block, offset, 1 + (s16)op->i.imm); + state->no_load_delay = op_flag_local_branch(op->flags); lightrec_emit_end_of_block(state, block, offset, -1, next_pc, 31, link, false); } @@ -287,8 +334,10 @@ static void rec_b(struct lightrec_cstate *state, const struct block *block, u16 if (bz && link) update_ra_register(reg_cache, _jit, 31, block->pc, link); - if (!op_flag_no_ds(op->flags) && next->opcode) + if (!op_flag_no_ds(op->flags) && ds->opcode) { + state->no_load_delay = true; lightrec_rec_opcode(state, block, offset + 1); + } } } @@ -1090,6 +1139,7 @@ static void rec_io(struct lightrec_cstate *state, u32 flags = block->opcode_list[offset].flags; bool is_tagged = LIGHTREC_FLAGS_GET_IO_MODE(flags); u32 lut_entry; + u8 zero; jit_note(__FILE__, __LINE__); @@ -1100,6 +1150,16 @@ static void rec_io(struct lightrec_cstate *state, else if (load_rt) lightrec_clean_reg_if_loaded(reg_cache, _jit, c.i.rt, false); + if (op_flag_load_delay(flags) && !state->no_load_delay) { + /* Clear state->in_delay_slot_n. This notifies the lightrec_rw + * wrapper that it should write the REG_TEMP register instead of + * the actual output register of the opcode. */ + zero = lightrec_alloc_reg_in(reg_cache, _jit, 0, 0); + jit_stxi_c(offsetof(struct lightrec_state, in_delay_slot_n), + LIGHTREC_REG_STATE, zero); + lightrec_free_reg(reg_cache, zero); + } + if (is_tagged) { call_to_c_wrapper(state, block, c.opcode, C_WRAPPER_RW); } else { @@ -1143,7 +1203,7 @@ static void rec_store_memory(struct lightrec_cstate *cstate, ((imm & 0x3) || simm + lut_offt != (s16)(simm + lut_offt)))); bool need_tmp = !no_mask || addr_offset || add_imm || invalidate; bool swc2 = c.i.op == OP_SWC2; - u8 in_reg = swc2 ? REG_CP2_TEMP : c.i.rt; + u8 in_reg = swc2 ? REG_TEMP : c.i.rt; rt = lightrec_alloc_reg_in(reg_cache, _jit, in_reg, 0); rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0); @@ -1202,7 +1262,7 @@ static void rec_store_memory(struct lightrec_cstate *cstate, if (addr_reg == rs && c.i.rs == 0) { addr_reg = LIGHTREC_REG_STATE; } else { - jit_addr(tmp, addr_reg, LIGHTREC_REG_STATE); + jit_add_state(tmp, addr_reg); addr_reg = tmp; } @@ -1268,14 +1328,15 @@ static void rec_store_direct_no_invalidate(struct lightrec_cstate *cstate, jit_state_t *_jit = block->_jit; jit_node_t *to_not_ram, *to_end; bool swc2 = c.i.op == OP_SWC2; - u8 tmp, tmp2, rs, rt, in_reg = swc2 ? REG_CP2_TEMP : c.i.rt; + bool offset_ram_or_scratch = state->offset_ram || state->offset_scratch; + u8 tmp, tmp2, rs, rt, in_reg = swc2 ? REG_TEMP : c.i.rt; s16 imm; jit_note(__FILE__, __LINE__); rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0); tmp = lightrec_alloc_reg_temp(reg_cache, _jit); - if (state->offset_ram || state->offset_scratch) + if (offset_ram_or_scratch) tmp2 = lightrec_alloc_reg_temp(reg_cache, _jit); /* Convert to KUNSEG and avoid RAM mirrors */ @@ -1307,7 +1368,7 @@ static void rec_store_direct_no_invalidate(struct lightrec_cstate *cstate, jit_movi(tmp2, state->offset_ram); } - if (state->offset_ram || state->offset_scratch) { + if (offset_ram_or_scratch) { jit_addr(tmp, tmp, tmp2); lightrec_free_reg(reg_cache, tmp2); } @@ -1340,7 +1401,7 @@ static void rec_store_direct(struct lightrec_cstate *cstate, const struct block jit_node_t *to_not_ram, *to_end; bool swc2 = c.i.op == OP_SWC2; u8 tmp, tmp2, tmp3, masked_reg, rs, rt; - u8 in_reg = swc2 ? REG_CP2_TEMP : c.i.rt; + u8 in_reg = swc2 ? REG_TEMP : c.i.rt; jit_note(__FILE__, __LINE__); @@ -1376,7 +1437,7 @@ static void rec_store_direct(struct lightrec_cstate *cstate, const struct block if (!lut_is_32bit(state)) jit_lshi(tmp, tmp, 1); - jit_addr(tmp, LIGHTREC_REG_STATE, tmp); + jit_add_state(tmp, tmp); /* Write NULL to the code LUT to invalidate any block that's there */ if (lut_is_32bit(state)) @@ -1437,7 +1498,7 @@ static void rec_store(struct lightrec_cstate *state, case LIGHTREC_IO_SCRATCH: case LIGHTREC_IO_DIRECT: case LIGHTREC_IO_DIRECT_HW: - rec_cp2_do_mfc2(state, block, offset, c.i.rt, REG_CP2_TEMP); + rec_cp2_do_mfc2(state, block, offset, c.i.rt, REG_TEMP); break; default: break; @@ -1469,7 +1530,7 @@ static void rec_store(struct lightrec_cstate *state, } if (is_swc2) - lightrec_discard_reg_if_loaded(state->reg_cache, REG_CP2_TEMP); + lightrec_discard_reg_if_loaded(state->reg_cache, REG_TEMP); } static void rec_SB(struct lightrec_cstate *state, @@ -1519,14 +1580,15 @@ static void rec_load_memory(struct lightrec_cstate *cstate, { struct regcache *reg_cache = cstate->reg_cache; struct opcode *op = &block->opcode_list[offset]; + bool load_delay = op_flag_load_delay(op->flags) && !cstate->no_load_delay; jit_state_t *_jit = block->_jit; u8 rs, rt, out_reg, addr_reg, flags = REG_EXT; bool no_mask = op_flag_no_mask(op->flags); union code c = op->c; s16 imm; - if (c.i.op == OP_LWC2) - out_reg = REG_CP2_TEMP; + if (load_delay || c.i.op == OP_LWC2) + out_reg = REG_TEMP; else if (c.i.rt) out_reg = c.i.rt; else @@ -1619,14 +1681,16 @@ static void rec_load_direct(struct lightrec_cstate *cstate, { struct lightrec_state *state = cstate->state; struct regcache *reg_cache = cstate->reg_cache; - union code c = block->opcode_list[offset].c; + struct opcode *op = &block->opcode_list[offset]; + bool load_delay = op_flag_load_delay(op->flags) && !cstate->no_load_delay; jit_state_t *_jit = block->_jit; jit_node_t *to_not_ram, *to_not_bios, *to_end, *to_end2; u8 tmp, rs, rt, out_reg, addr_reg, flags = REG_EXT; + union code c = op->c; s16 imm; - if (c.i.op == OP_LWC2) - out_reg = REG_CP2_TEMP; + if (load_delay || c.i.op == OP_LWC2) + out_reg = REG_TEMP; else if (c.i.rt) out_reg = c.i.rt; else @@ -1754,8 +1818,8 @@ static void rec_load(struct lightrec_cstate *state, const struct block *block, } if (op->i.op == OP_LWC2) { - rec_cp2_do_mtc2(state, block, offset, op->i.rt, REG_CP2_TEMP); - lightrec_discard_reg_if_loaded(state->reg_cache, REG_CP2_TEMP); + rec_cp2_do_mtc2(state, block, offset, op->i.rt, REG_TEMP); + lightrec_discard_reg_if_loaded(state->reg_cache, REG_TEMP); } } @@ -1827,6 +1891,15 @@ static void rec_break_syscall(struct lightrec_cstate *state, jit_stxi_i(offsetof(struct lightrec_state, exit_flags), LIGHTREC_REG_STATE, tmp); + jit_ldxi_i(tmp, LIGHTREC_REG_STATE, + offsetof(struct lightrec_state, target_cycle)); + jit_subr(tmp, tmp, LIGHTREC_REG_CYCLE); + jit_movi(LIGHTREC_REG_CYCLE, 0); + jit_stxi_i(offsetof(struct lightrec_state, target_cycle), + LIGHTREC_REG_STATE, tmp); + jit_stxi_i(offsetof(struct lightrec_state, current_cycle), + LIGHTREC_REG_STATE, tmp); + lightrec_free_reg(reg_cache, tmp); /* TODO: the return address should be "pc - 4" if we're a delay slot */ @@ -1872,6 +1945,7 @@ static void rec_mtc(struct lightrec_cstate *state, const struct block *block, u1 jit_note(__FILE__, __LINE__); lightrec_clean_reg_if_loaded(reg_cache, _jit, c.i.rs, false); lightrec_clean_reg_if_loaded(reg_cache, _jit, c.i.rt, false); + lightrec_clean_reg_if_loaded(reg_cache, _jit, REG_TEMP, false); call_to_c_wrapper(state, block, c.opcode, C_WRAPPER_MTC); @@ -1901,13 +1975,16 @@ rec_mfc0(struct lightrec_cstate *state, const struct block *block, u16 offset) lightrec_free_reg(reg_cache, rt); } -static bool block_in_bios(const struct lightrec_cstate *state, - const struct block *block) +static bool block_uses_icache(const struct lightrec_cstate *state, + const struct block *block) { - const struct lightrec_mem_map *bios = &state->state->maps[PSX_MAP_BIOS]; + const struct lightrec_mem_map *map = &state->state->maps[PSX_MAP_KERNEL_USER_RAM]; u32 pc = kunseg(block->pc); - return pc >= bios->pc && pc < bios->pc + bios->length; + if (pc < map->pc || pc >= map->pc + map->length) + return false; + + return (block->pc >> 28) < 0xa; } static void @@ -1933,11 +2010,11 @@ rec_mtc0(struct lightrec_cstate *state, const struct block *block, u16 offset) break; } - if (/*block_in_bios(state, block) &&*/ c.r.rd == 12) { - /* If we are running code from the BIOS, handle writes to the - * Status register in C. BIOS code may toggle bit 16 which will - * map/unmap the RAM, while game code cannot do that. */ - /* ^ wrong, it can execute from 0xa0000000 with isolated cache */ + if (!block_uses_icache(state, block) && c.r.rd == 12) { + /* If we are not running code from the RAM through kuseg or + * kseg0, handle writes to the Status register in C; as the + * code may toggle bit 16 which isolates the cache. Code + * running from kuseg or kseg0 in RAM cannot do that. */ rec_mtc(state, block, offset); return; } @@ -2193,7 +2270,6 @@ static void rec_cp2_do_mtc2(struct lightrec_cstate *state, { struct regcache *reg_cache = state->reg_cache; jit_state_t *_jit = block->_jit; - jit_node_t *loop, *to_loop; u8 rt, tmp, tmp2, flags = 0; _jit_name(block->_jit, __func__); @@ -2246,30 +2322,20 @@ static void rec_cp2_do_mtc2(struct lightrec_cstate *state, break; case 30: tmp = lightrec_alloc_reg_temp(reg_cache, _jit); - tmp2 = lightrec_alloc_reg_temp(reg_cache, _jit); /* if (rt < 0) rt = ~rt; */ jit_rshi(tmp, rt, 31); jit_xorr(tmp, rt, tmp); - /* We know the sign bit is 0. Left-shift by 1 to start the algorithm */ - jit_lshi(tmp, tmp, 1); - jit_movi(tmp2, 33); - - /* Decrement tmp2 and right-shift the value by 1 until it equals zero */ - loop = jit_label(); - jit_subi(tmp2, tmp2, 1); - jit_rshi_u(tmp, tmp, 1); - to_loop = jit_bnei(tmp, 0); - - jit_patch_at(to_loop, loop); + /* Count leading zeros */ + jit_clzr(tmp, tmp); + if (__WORDSIZE != 32) + jit_subi(tmp, tmp, __WORDSIZE - 32); - jit_stxi_i(cp2d_i_offset(31), LIGHTREC_REG_STATE, tmp2); - jit_stxi_i(cp2d_i_offset(30), LIGHTREC_REG_STATE, rt); + jit_stxi_i(cp2d_i_offset(31), LIGHTREC_REG_STATE, tmp); lightrec_free_reg(reg_cache, tmp); - lightrec_free_reg(reg_cache, tmp2); - break; + fallthrough; default: jit_stxi_i(cp2d_i_offset(reg), LIGHTREC_REG_STATE, rt); break; @@ -2406,34 +2472,44 @@ static void rec_meta_MOV(struct lightrec_cstate *state, unload_rd = OPT_EARLY_UNLOAD && LIGHTREC_FLAGS_GET_RD(op->flags) == LIGHTREC_REG_UNLOAD; - if (c.r.rs || unload_rd) - rs = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rs, 0); + if (c.m.rs && !lightrec_reg_is_loaded(reg_cache, c.m.rs)) { + /* The source register is not yet loaded - we can load its value + * from the register cache directly into the target register. */ + rd = lightrec_alloc_reg_out(reg_cache, _jit, c.m.rd, REG_EXT); + + jit_ldxi_i(rd, LIGHTREC_REG_STATE, + offsetof(struct lightrec_state, regs.gpr) + (c.m.rs << 2)); - if (unload_rd) { + lightrec_free_reg(reg_cache, rd); + } else if (unload_rd) { /* If the destination register will be unloaded right after the * MOV meta-opcode, we don't actually need to write any host * register - we can just store the source register directly to * the register cache, at the offset corresponding to the * destination register. */ - lightrec_discard_reg_if_loaded(reg_cache, c.r.rd); + lightrec_discard_reg_if_loaded(reg_cache, c.m.rd); + + rs = lightrec_alloc_reg_in(reg_cache, _jit, c.m.rs, 0); jit_stxi_i(offsetof(struct lightrec_state, regs.gpr) - + c.r.rd << 2, LIGHTREC_REG_STATE, rs); + + (c.m.rd << 2), LIGHTREC_REG_STATE, rs); lightrec_free_reg(reg_cache, rs); } else { - rd = lightrec_alloc_reg_out(reg_cache, _jit, c.r.rd, REG_EXT); + if (c.m.rs) + rs = lightrec_alloc_reg_in(reg_cache, _jit, c.m.rs, 0); + + rd = lightrec_alloc_reg_out(reg_cache, _jit, c.m.rd, REG_EXT); - if (c.r.rs == 0) + if (c.m.rs == 0) { jit_movi(rd, 0); - else + } else { jit_extr_i(rd, rs); + lightrec_free_reg(reg_cache, rs); + } lightrec_free_reg(reg_cache, rd); } - - if (c.r.rs || unload_rd) - lightrec_free_reg(reg_cache, rs); } static void rec_meta_EXTC_EXTS(struct lightrec_cstate *state, @@ -2443,21 +2519,21 @@ static void rec_meta_EXTC_EXTS(struct lightrec_cstate *state, struct regcache *reg_cache = state->reg_cache; union code c = block->opcode_list[offset].c; jit_state_t *_jit = block->_jit; - u8 rs, rt; + u8 rs, rd; _jit_name(block->_jit, __func__); jit_note(__FILE__, __LINE__); - rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0); - rt = lightrec_alloc_reg_out(reg_cache, _jit, c.i.rt, REG_EXT); + rs = lightrec_alloc_reg_in(reg_cache, _jit, c.m.rs, 0); + rd = lightrec_alloc_reg_out(reg_cache, _jit, c.m.rd, REG_EXT); - if (c.i.op == OP_META_EXTC) - jit_extr_c(rt, rs); + if (c.m.op == OP_META_EXTC) + jit_extr_c(rd, rs); else - jit_extr_s(rt, rs); + jit_extr_s(rd, rs); lightrec_free_reg(reg_cache, rs); - lightrec_free_reg(reg_cache, rt); + lightrec_free_reg(reg_cache, rd); } static void rec_meta_MULT2(struct lightrec_cstate *state, @@ -2524,6 +2600,29 @@ static void rec_meta_MULT2(struct lightrec_cstate *state, jit_note(__FILE__, __LINE__); } +static void rec_meta_COM(struct lightrec_cstate *state, + const struct block *block, u16 offset) +{ + struct regcache *reg_cache = state->reg_cache; + union code c = block->opcode_list[offset].c; + jit_state_t *_jit = block->_jit; + u8 rd, rs, flags; + + jit_note(__FILE__, __LINE__); + rs = lightrec_alloc_reg_in(reg_cache, _jit, c.m.rs, 0); + rd = lightrec_alloc_reg_out(reg_cache, _jit, c.m.rd, 0); + + flags = lightrec_get_reg_in_flags(reg_cache, rs); + + lightrec_set_reg_out_flags(reg_cache, rd, + flags & REG_EXT); + + jit_comr(rd, rs); + + lightrec_free_reg(reg_cache, rs); + lightrec_free_reg(reg_cache, rd); +} + static const lightrec_rec_func_t rec_standard[64] = { SET_DEFAULT_ELM(rec_standard, unknown_opcode), [OP_SPECIAL] = rec_SPECIAL, @@ -2559,9 +2658,7 @@ static const lightrec_rec_func_t rec_standard[64] = { [OP_LWC2] = rec_LW, [OP_SWC2] = rec_SW, - [OP_META_MOV] = rec_meta_MOV, - [OP_META_EXTC] = rec_meta_EXTC_EXTS, - [OP_META_EXTS] = rec_meta_EXTC_EXTS, + [OP_META] = rec_META, [OP_META_MULT2] = rec_meta_MULT2, [OP_META_MULTU2] = rec_meta_MULT2, }; @@ -2623,6 +2720,14 @@ static const lightrec_rec_func_t rec_cp2_basic[64] = { [OP_CP2_BASIC_CTC2] = rec_cp2_basic_CTC2, }; +static const lightrec_rec_func_t rec_meta[64] = { + SET_DEFAULT_ELM(rec_meta, unknown_opcode), + [OP_META_MOV] = rec_meta_MOV, + [OP_META_EXTC] = rec_meta_EXTC_EXTS, + [OP_META_EXTS] = rec_meta_EXTC_EXTS, + [OP_META_COM] = rec_meta_COM, +}; + static void rec_SPECIAL(struct lightrec_cstate *state, const struct block *block, u16 offset) { @@ -2676,6 +2781,18 @@ static void rec_CP2(struct lightrec_cstate *state, rec_CP(state, block, offset); } +static void rec_META(struct lightrec_cstate *state, + const struct block *block, u16 offset) +{ + union code c = block->opcode_list[offset].c; + lightrec_rec_func_t f = rec_meta[c.m.op]; + + if (!HAS_DEFAULT_ELM && unlikely(!f)) + unknown_opcode(state, block, offset); + else + (*f)(state, block, offset); +} + void lightrec_rec_opcode(struct lightrec_cstate *state, const struct block *block, u16 offset) { @@ -2715,4 +2832,6 @@ void lightrec_rec_opcode(struct lightrec_cstate *state, lightrec_do_early_unload(state, block, unload_offset); } + + state->no_load_delay = false; } diff --git a/deps/lightrec/emitter.h b/deps/lightrec/emitter.h index 4cbe8da6..c960a7fb 100644 --- a/deps/lightrec/emitter.h +++ b/deps/lightrec/emitter.h @@ -13,7 +13,7 @@ struct lightrec_cstate; struct opcode; void lightrec_rec_opcode(struct lightrec_cstate *state, const struct block *block, u16 offset); -void lightrec_emit_eob(struct lightrec_cstate *state, - const struct block *block, u16 offset); +void lightrec_emit_jump_to_interpreter(struct lightrec_cstate *state, + const struct block *block, u16 offset); #endif /* __EMITTER_H__ */ diff --git a/deps/lightrec/interpreter.c b/deps/lightrec/interpreter.c index ea8098cd..80a07f32 100644 --- a/deps/lightrec/interpreter.c +++ b/deps/lightrec/interpreter.c @@ -16,6 +16,7 @@ struct interpreter; static u32 int_CP0(struct interpreter *inter); static u32 int_CP2(struct interpreter *inter); static u32 int_SPECIAL(struct interpreter *inter); +static u32 int_META(struct interpreter *inter); static u32 int_REGIMM(struct interpreter *inter); static u32 int_branch(struct interpreter *inter, u32 pc, union code code, bool branch); @@ -45,7 +46,7 @@ static inline u32 int_get_ds_pc(const struct interpreter *inter, s16 imm) static inline struct opcode *next_op(const struct interpreter *inter) { - return &inter->block->opcode_list[inter->offset + 1]; + return &inter->op[1]; } static inline u32 execute(lightrec_int_func_t func, struct interpreter *inter) @@ -186,7 +187,7 @@ static u32 int_delay_slot(struct interpreter *inter, u32 pc, bool branch) * interpreter in that case. * Same goes for when we have a branch in a delay slot of another * branch. */ - load_in_ds = load_in_delay_slot(op->c); + load_in_ds = opcode_is_load(op->c) || opcode_is_mfc(op->c); branch_in_ds = has_delay_slot(op->c); if (branch) { @@ -241,6 +242,7 @@ static u32 int_delay_slot(struct interpreter *inter, u32 pc, bool branch) new_op.c = op_next; new_op.flags = 0; inter2.op = &new_op; + inter2.offset = 0; /* Execute the first opcode of the next block */ lightrec_int_op(&inter2); @@ -259,6 +261,7 @@ static u32 int_delay_slot(struct interpreter *inter, u32 pc, bool branch) inter2.block = inter->block; inter2.op = op; inter2.cycles = inter->cycles; + inter2.offset = inter->offset + 1; if (dummy_ld) new_rt = reg_cache[op->r.rt]; @@ -351,11 +354,6 @@ static u32 int_jumpr(struct interpreter *inter, u8 link_reg) u32 old_pc = int_get_branch_pc(inter); u32 next_pc = state->regs.gpr[inter->op->r.rs]; - if (op_flag_emulate_branch(inter->op->flags) && inter->offset) { - inter->cycles -= lightrec_cycles_of_opcode(inter->op->c); - return old_pc; - } - if (link_reg) state->regs.gpr[link_reg] = old_pc + 8; @@ -391,11 +389,6 @@ static u32 int_branch(struct interpreter *inter, u32 pc, { u32 next_pc = pc + 4 + ((s16)code.i.imm << 2); - if (op_flag_emulate_branch(inter->op->flags) && inter->offset) { - inter->cycles -= lightrec_cycles_of_opcode(inter->op->c); - return pc; - } - update_cycles_before_branch(inter); if (op_flag_no_ds(inter->op->flags)) { @@ -605,11 +598,14 @@ static u32 int_io(struct interpreter *inter, bool is_load) { struct opcode_i *op = &inter->op->i; u32 *reg_cache = inter->state->regs.gpr; - u32 val; + u32 val, *flags = NULL; + + if (inter->block) + flags = &inter->op->flags; val = lightrec_rw(inter->state, inter->op->c, reg_cache[op->rs], reg_cache[op->rt], - &inter->op->flags, inter->block); + flags, inter->block, inter->offset); if (is_load && op->rt) reg_cache[op->rt] = val; @@ -632,7 +628,7 @@ static u32 int_store(struct interpreter *inter) lightrec_rw(inter->state, inter->op->c, inter->state->regs.gpr[inter->op->i.rs], inter->state->regs.gpr[inter->op->i.rt], - &inter->op->flags, inter->block); + &inter->op->flags, inter->block, inter->offset); next_pc = int_get_ds_pc(inter, 1); @@ -717,9 +713,9 @@ static u32 int_syscall_break(struct interpreter *inter) { if (inter->op->r.op == OP_SPECIAL_BREAK) - inter->state->exit_flags |= LIGHTREC_EXIT_BREAK; + lightrec_set_exit_flags(inter->state, LIGHTREC_EXIT_BREAK); else - inter->state->exit_flags |= LIGHTREC_EXIT_SYSCALL; + lightrec_set_exit_flags(inter->state, LIGHTREC_EXIT_SYSCALL); return int_get_ds_pc(inter, 0); } @@ -955,7 +951,7 @@ static u32 int_special_SLTU(struct interpreter *inter) static u32 int_META_MOV(struct interpreter *inter) { u32 *reg_cache = inter->state->regs.gpr; - struct opcode_r *op = &inter->op->r; + struct opcode_m *op = &inter->op->m; if (likely(op->rd)) reg_cache[op->rd] = reg_cache[op->rs]; @@ -966,10 +962,10 @@ static u32 int_META_MOV(struct interpreter *inter) static u32 int_META_EXTC(struct interpreter *inter) { u32 *reg_cache = inter->state->regs.gpr; - struct opcode_i *op = &inter->op->i; + struct opcode_m *op = &inter->op->m; - if (likely(op->rt)) - reg_cache[op->rt] = (u32)(s32)(s8)reg_cache[op->rs]; + if (likely(op->rd)) + reg_cache[op->rd] = (u32)(s32)(s8)reg_cache[op->rs]; return jump_next(inter); } @@ -977,10 +973,10 @@ static u32 int_META_EXTC(struct interpreter *inter) static u32 int_META_EXTS(struct interpreter *inter) { u32 *reg_cache = inter->state->regs.gpr; - struct opcode_i *op = &inter->op->i; + struct opcode_m *op = &inter->op->m; - if (likely(op->rt)) - reg_cache[op->rt] = (u32)(s32)(s16)reg_cache[op->rs]; + if (likely(op->rd)) + reg_cache[op->rd] = (u32)(s32)(s16)reg_cache[op->rs]; return jump_next(inter); } @@ -1012,6 +1008,17 @@ static u32 int_META_MULT2(struct interpreter *inter) return jump_next(inter); } +static u32 int_META_COM(struct interpreter *inter) +{ + u32 *reg_cache = inter->state->regs.gpr; + union code c = inter->op->c; + + if (likely(c.m.rd)) + reg_cache[c.m.rd] = ~reg_cache[c.m.rs]; + + return jump_next(inter); +} + static const lightrec_int_func_t int_standard[64] = { SET_DEFAULT_ELM(int_standard, int_unimplemented), [OP_SPECIAL] = int_SPECIAL, @@ -1047,9 +1054,7 @@ static const lightrec_int_func_t int_standard[64] = { [OP_LWC2] = int_LWC2, [OP_SWC2] = int_store, - [OP_META_MOV] = int_META_MOV, - [OP_META_EXTC] = int_META_EXTC, - [OP_META_EXTS] = int_META_EXTS, + [OP_META] = int_META, [OP_META_MULT2] = int_META_MULT2, [OP_META_MULTU2] = int_META_MULT2, }; @@ -1111,6 +1116,14 @@ static const lightrec_int_func_t int_cp2_basic[64] = { [OP_CP2_BASIC_CTC2] = int_ctc, }; +static const lightrec_int_func_t int_meta[64] = { + SET_DEFAULT_ELM(int_meta, int_unimplemented), + [OP_META_MOV] = int_META_MOV, + [OP_META_EXTC] = int_META_EXTC, + [OP_META_EXTS] = int_META_EXTS, + [OP_META_COM] = int_META_COM, +}; + static u32 int_SPECIAL(struct interpreter *inter) { lightrec_int_func_t f = int_special[inter->op->r.op]; @@ -1152,6 +1165,16 @@ static u32 int_CP2(struct interpreter *inter) return int_CP(inter); } +static u32 int_META(struct interpreter *inter) +{ + lightrec_int_func_t f = int_meta[inter->op->m.op]; + + if (!HAS_DEFAULT_ELM && unlikely(!f)) + return int_unimplemented(inter); + + return execute(f, inter); +} + static u32 lightrec_emulate_block_list(struct lightrec_state *state, struct block *block, u32 offset) { @@ -1188,3 +1211,75 @@ u32 lightrec_emulate_block(struct lightrec_state *state, struct block *block, u3 return 0; } + +static u32 branch_get_next_pc(struct lightrec_state *state, union code c, u32 pc) +{ + switch (c.i.op) { + case OP_SPECIAL: + /* JR / JALR */ + return state->regs.gpr[c.r.rs]; + case OP_J: + case OP_JAL: + return (pc & 0xf0000000) | (c.j.imm << 2); + default: + /* Branch opcodes */ + return pc + 4 + ((s16)c.i.imm << 2); + } +} + +u32 lightrec_handle_load_delay(struct lightrec_state *state, + struct block *block, u32 pc, u32 reg) +{ + union code c = lightrec_read_opcode(state, pc); + struct opcode op[2] = { + { + .c = c, + .flags = 0, + }, + { + .flags = 0, + }, + }; + struct interpreter inter = { + .block = block, + .state = state, + .offset = 0, + .op = op, + .cycles = 0, + }; + bool branch_taken; + u32 reg_mask, next_pc; + + if (has_delay_slot(c)) { + op[1].c = lightrec_read_opcode(state, pc + 4); + + branch_taken = is_branch_taken(state->regs.gpr, c); + next_pc = branch_get_next_pc(state, c, pc); + + /* Branch was evaluated, we can write the load opcode's target + * register now. */ + state->regs.gpr[reg] = state->temp_reg; + + /* Handle JALR / regimm opcodes setting $ra (or any other + * register in the case of JALR) */ + reg_mask = (u32)opcode_write_mask(c); + if (reg_mask) + state->regs.gpr[ctz32(reg_mask)] = pc + 8; + + /* Handle delay slot of the branch opcode */ + pc = int_delay_slot(&inter, next_pc, branch_taken); + } else { + /* Make sure we only run one instruction */ + inter.delay_slot = true; + + lightrec_int_op(&inter); + pc += 4; + + if (!opcode_writes_register(c, reg)) + state->regs.gpr[reg] = state->temp_reg; + } + + state->current_cycle += inter.cycles; + + return pc; +} diff --git a/deps/lightrec/interpreter.h b/deps/lightrec/interpreter.h index 96600bfc..51c53906 100644 --- a/deps/lightrec/interpreter.h +++ b/deps/lightrec/interpreter.h @@ -11,5 +11,7 @@ struct block; u32 lightrec_emulate_block(struct lightrec_state *state, struct block *block, u32 pc); +u32 lightrec_handle_load_delay(struct lightrec_state *state, + struct block *block, u32 pc, u32 reg); #endif /* __LIGHTREC_INTERPRETER_H__ */ diff --git a/deps/lightrec/lightning-wrapper.h b/deps/lightrec/lightning-wrapper.h index b0e8bf3b..4cb97d3a 100644 --- a/deps/lightrec/lightning-wrapper.h +++ b/deps/lightrec/lightning-wrapper.h @@ -21,4 +21,14 @@ #define jit_b() jit_beqr(0, 0) +#if defined(__sh__) +#define jit_add_state(u,v) \ + do { \ + jit_new_node_ww(jit_code_movr,_R0,LIGHTREC_REG_STATE); \ + jit_new_node_www(jit_code_addr,u,v,_R0); \ + } while (0) +#else +#define jit_add_state(u,v) jit_addr(u,v,LIGHTREC_REG_STATE) +#endif + #endif /* __LIGHTNING_WRAPPER_H__ */ diff --git a/deps/lightrec/lightrec-config.h.cmakein b/deps/lightrec/lightrec-config.h.cmakein index 11886653..ed29ee4d 100644 --- a/deps/lightrec/lightrec-config.h.cmakein +++ b/deps/lightrec/lightrec-config.h.cmakein @@ -16,10 +16,10 @@ #cmakedefine01 OPT_REMOVE_DIV_BY_ZERO_SEQ #cmakedefine01 OPT_REPLACE_MEMSET #cmakedefine01 OPT_DETECT_IMPOSSIBLE_BRANCHES +#cmakedefine01 OPT_HANDLE_LOAD_DELAYS #cmakedefine01 OPT_TRANSFORM_OPS #cmakedefine01 OPT_LOCAL_BRANCHES #cmakedefine01 OPT_SWITCH_DELAY_SLOTS -#cmakedefine01 OPT_FLAG_STORES #cmakedefine01 OPT_FLAG_IO #cmakedefine01 OPT_FLAG_MULT_DIV #cmakedefine01 OPT_EARLY_UNLOAD diff --git a/deps/lightrec/lightrec-private.h b/deps/lightrec/lightrec-private.h index e67d406f..12e953a2 100644 --- a/deps/lightrec/lightrec-private.h +++ b/deps/lightrec/lightrec-private.h @@ -81,7 +81,7 @@ #define REG_LO 32 #define REG_HI 33 -#define REG_CP2_TEMP (offsetof(struct lightrec_state, cp2_temp_reg) / sizeof(u32)) +#define REG_TEMP (offsetof(struct lightrec_state, temp_reg) / sizeof(u32)) /* Definition of jit_state_t (avoids inclusion of ) */ struct jit_node; @@ -149,13 +149,16 @@ struct lightrec_cstate { unsigned int cycles; struct regcache *reg_cache; + + _Bool no_load_delay; }; struct lightrec_state { struct lightrec_registers regs; - u32 cp2_temp_reg; + u32 temp_reg; u32 next_pc; uintptr_t wrapper_regs[NUM_TEMPS]; + u8 in_delay_slot_n; u32 current_cycle; u32 target_cycle; u32 exit_flags; @@ -169,10 +172,13 @@ struct lightrec_state { struct reaper *reaper; void *tlsf; void (*eob_wrapper_func)(void); + void (*interpreter_func)(void); + void (*ds_check_func)(void); void (*memset_func)(void); void (*get_next_block)(void); struct lightrec_ops ops; unsigned int nb_precompile; + unsigned int nb_compile; unsigned int nb_maps; const struct lightrec_mem_map *maps; uintptr_t offset_ram, offset_bios, offset_scratch, offset_io; @@ -182,9 +188,8 @@ struct lightrec_state { void *code_lut[]; }; -u32 lightrec_rw(struct lightrec_state *state, union code op, - u32 addr, u32 data, u32 *flags, - struct block *block); +u32 lightrec_rw(struct lightrec_state *state, union code op, u32 addr, + u32 data, u32 *flags, struct block *block, u16 offset); void lightrec_free_block(struct lightrec_state *state, struct block *block); @@ -285,7 +290,7 @@ int lightrec_compile_block(struct lightrec_cstate *cstate, struct block *block); void lightrec_free_opcode_list(struct lightrec_state *state, struct opcode *list); -unsigned int lightrec_cycles_of_opcode(union code code); +__cnst unsigned int lightrec_cycles_of_opcode(union code code); static inline u8 get_mult_div_lo(union code c) { @@ -349,4 +354,10 @@ static inline _Bool can_zero_extend(u32 value, u8 order) return (value >> order) == 0; } +static inline const struct opcode * +get_delay_slot(const struct opcode *list, u16 i) +{ + return op_flag_no_ds(list[i].flags) ? &list[i - 1] : &list[i + 1]; +} + #endif /* __LIGHTREC_PRIVATE_H__ */ diff --git a/deps/lightrec/lightrec.c b/deps/lightrec/lightrec.c index b9e82fb2..d5b1de96 100644 --- a/deps/lightrec/lightrec.c +++ b/deps/lightrec/lightrec.c @@ -237,26 +237,43 @@ lightrec_get_map(struct lightrec_state *state, void **host, u32 kaddr) return map; } -u32 lightrec_rw(struct lightrec_state *state, union code op, - u32 addr, u32 data, u32 *flags, struct block *block) +u32 lightrec_rw(struct lightrec_state *state, union code op, u32 base, + u32 data, u32 *flags, struct block *block, u16 offset) { const struct lightrec_mem_map *map; const struct lightrec_mem_map_ops *ops; u32 opcode = op.opcode; + bool was_tagged = true; + u16 old_flags; + u32 addr; void *host; - addr += (s16) op.i.imm; + addr = kunseg(base + (s16) op.i.imm); - map = lightrec_get_map(state, &host, kunseg(addr)); + map = lightrec_get_map(state, &host, addr); if (!map) { __segfault_cb(state, addr, block); return 0; } + if (flags) + was_tagged = LIGHTREC_FLAGS_GET_IO_MODE(*flags); if (likely(!map->ops)) { - if (flags && !LIGHTREC_FLAGS_GET_IO_MODE(*flags)) - *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_DIRECT); + if (flags && !LIGHTREC_FLAGS_GET_IO_MODE(*flags)) { + /* Force parallel port accesses as HW accesses, because + * the direct-I/O emitters can't differenciate it. */ + if (unlikely(map == &state->maps[PSX_MAP_PARALLEL_PORT])) + *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_HW); + /* If the base register is 0x0, be extra suspicious. + * Some games (e.g. Sled Storm) actually do segmentation + * faults by using uninitialized pointers, which are + * later initialized to point to hardware registers. */ + else if (op.i.rs && base == 0x0) + *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_HW); + else + *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_DIRECT); + } ops = &lightrec_default_ops; } else if (flags && @@ -269,6 +286,17 @@ u32 lightrec_rw(struct lightrec_state *state, union code op, ops = map->ops; } + if (!was_tagged) { + old_flags = block_set_flags(block, BLOCK_SHOULD_RECOMPILE); + + if (!(old_flags & BLOCK_SHOULD_RECOMPILE)) { + pr_debug("Opcode of block at PC 0x%08x has been tagged" + " - flag for recompilation\n", block->pc); + + lut_write(state, lut_offset(block->pc), NULL); + } + } + switch (op.i.op) { case OP_SB: ops->sb(state, opcode, host, addr, (u8) data); @@ -311,10 +339,10 @@ u32 lightrec_rw(struct lightrec_state *state, union code op, static void lightrec_rw_helper(struct lightrec_state *state, union code op, u32 *flags, - struct block *block) + struct block *block, u16 offset) { u32 ret = lightrec_rw(state, op, state->regs.gpr[op.i.rs], - state->regs.gpr[op.i.rt], flags, block); + state->regs.gpr[op.i.rt], flags, block, offset); switch (op.i.op) { case OP_LB: @@ -324,8 +352,12 @@ static void lightrec_rw_helper(struct lightrec_state *state, case OP_LWL: case OP_LWR: case OP_LW: - if (op.i.rt) + if (OPT_HANDLE_LOAD_DELAYS && unlikely(!state->in_delay_slot_n)) { + state->temp_reg = ret; + state->in_delay_slot_n = 0xff; + } else if (op.i.rt) { state->regs.gpr[op.i.rt] = ret; + } fallthrough; default: break; @@ -334,16 +366,14 @@ static void lightrec_rw_helper(struct lightrec_state *state, static void lightrec_rw_cb(struct lightrec_state *state, u32 arg) { - lightrec_rw_helper(state, (union code) arg, NULL, NULL); + lightrec_rw_helper(state, (union code) arg, NULL, NULL, 0); } static void lightrec_rw_generic_cb(struct lightrec_state *state, u32 arg) { struct block *block; struct opcode *op; - bool was_tagged; u16 offset = (u16)arg; - u16 old_flags; block = lightrec_find_block_from_lut(state->block_cache, arg >> 16, state->next_pc); @@ -355,20 +385,7 @@ static void lightrec_rw_generic_cb(struct lightrec_state *state, u32 arg) } op = &block->opcode_list[offset]; - was_tagged = LIGHTREC_FLAGS_GET_IO_MODE(op->flags); - - lightrec_rw_helper(state, op->c, &op->flags, block); - - if (!was_tagged) { - old_flags = block_set_flags(block, BLOCK_SHOULD_RECOMPILE); - - if (!(old_flags & BLOCK_SHOULD_RECOMPILE)) { - pr_debug("Opcode of block at PC 0x%08x has been tagged" - " - flag for recompilation\n", block->pc); - - lut_write(state, lut_offset(block->pc), NULL); - } - } + lightrec_rw_helper(state, op->c, &op->flags, block, offset); } static u32 clamp_s32(s32 val, s32 min, s32 max) @@ -462,7 +479,7 @@ static void lightrec_mfc_cb(struct lightrec_state *state, union code op) u32 rt = lightrec_mfc(state, op); if (op.i.op == OP_SWC2) - state->cp2_temp_reg = rt; + state->temp_reg = rt; else if (op.r.rt) state->regs.gpr[op.r.rt] = rt; } @@ -603,7 +620,7 @@ static void lightrec_mtc_cb(struct lightrec_state *state, u32 arg) u8 reg; if (op.i.op == OP_LWC2) { - data = state->cp2_temp_reg; + data = state->temp_reg; reg = op.i.rt; } else { data = state->regs.gpr[op.r.rt]; @@ -703,6 +720,7 @@ static void * get_next_block_func(struct lightrec_state *state, u32 pc) } should_recompile = block_has_flag(block, BLOCK_SHOULD_RECOMPILE) && + !block_has_flag(block, BLOCK_NEVER_COMPILE) && !block_has_flag(block, BLOCK_IS_DEAD); if (unlikely(should_recompile)) { @@ -803,6 +821,8 @@ static void lightrec_free_code(struct lightrec_state *state, void *ptr) lightrec_code_alloc_unlock(state); } +static char lightning_code_data[0x80000]; + static void * lightrec_emit_code(struct lightrec_state *state, const struct block *block, jit_state_t *_jit, unsigned int *size) @@ -813,7 +833,9 @@ static void * lightrec_emit_code(struct lightrec_state *state, jit_realize(); - if (!ENABLE_DISASSEMBLER) + if (ENABLE_DISASSEMBLER) + jit_set_data(lightning_code_data, sizeof(lightning_code_data), 0); + else jit_set_data(NULL, 0, JIT_DISABLE_DATA | JIT_DISABLE_NOTE); if (has_code_buffer) { @@ -872,6 +894,15 @@ static struct block * generate_wrapper(struct lightrec_state *state) unsigned int i; jit_node_t *addr[C_WRAPPERS_COUNT - 1]; jit_node_t *to_end[C_WRAPPERS_COUNT - 1]; + u8 tmp = JIT_R1; + +#ifdef __sh__ + /* On SH, GBR-relative loads target the r0 register. + * Use it as the temporary register to factorize the move to + * JIT_R1. */ + if (LIGHTREC_REG_STATE == _GBR) + tmp = _R0; +#endif block = lightrec_malloc(state, MEM_FOR_IR, sizeof(*block)); if (!block) @@ -890,17 +921,18 @@ static struct block * generate_wrapper(struct lightrec_state *state) /* Add entry points */ for (i = C_WRAPPERS_COUNT - 1; i > 0; i--) { - jit_ldxi(JIT_R1, LIGHTREC_REG_STATE, + jit_ldxi(tmp, LIGHTREC_REG_STATE, offsetof(struct lightrec_state, c_wrappers[i])); to_end[i - 1] = jit_b(); addr[i - 1] = jit_indirect(); } - jit_ldxi(JIT_R1, LIGHTREC_REG_STATE, + jit_ldxi(tmp, LIGHTREC_REG_STATE, offsetof(struct lightrec_state, c_wrappers[0])); for (i = 0; i < C_WRAPPERS_COUNT - 1; i++) jit_patch(to_end[i]); + jit_movr(JIT_R1, tmp); jit_epilog(); jit_prolog(); @@ -1002,11 +1034,54 @@ static u32 lightrec_memset(struct lightrec_state *state) return 8 + 5 * (length + 3 / 4); } +static u32 lightrec_check_load_delay(struct lightrec_state *state, u32 pc, u8 reg) +{ + struct block *block; + union code first_op; + + first_op = lightrec_read_opcode(state, pc); + + if (likely(!opcode_reads_register(first_op, reg))) { + state->regs.gpr[reg] = state->temp_reg; + } else { + block = lightrec_get_block(state, pc); + if (unlikely(!block)) { + pr_err("Unable to get block at PC 0x%08x\n", pc); + lightrec_set_exit_flags(state, LIGHTREC_EXIT_SEGFAULT); + pc = 0; + } else { + pc = lightrec_handle_load_delay(state, block, pc, reg); + } + } + + return pc; +} + +static void update_cycle_counter_before_c(jit_state_t *_jit) +{ + /* update state->current_cycle */ + jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE, + offsetof(struct lightrec_state, target_cycle)); + jit_subr(JIT_R1, JIT_R2, LIGHTREC_REG_CYCLE); + jit_stxi_i(offsetof(struct lightrec_state, current_cycle), + LIGHTREC_REG_STATE, JIT_R1); +} + +static void update_cycle_counter_after_c(jit_state_t *_jit) +{ + /* Recalc the delta */ + jit_ldxi_i(JIT_R1, LIGHTREC_REG_STATE, + offsetof(struct lightrec_state, current_cycle)); + jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE, + offsetof(struct lightrec_state, target_cycle)); + jit_subr(LIGHTREC_REG_CYCLE, JIT_R2, JIT_R1); +} + static struct block * generate_dispatcher(struct lightrec_state *state) { struct block *block; jit_state_t *_jit; - jit_node_t *to_end, *loop, *addr, *addr2, *addr3; + jit_node_t *to_end, *loop, *addr, *addr2, *addr3, *addr4, *addr5, *jmp, *jmp2; unsigned int i; u32 offset; @@ -1047,13 +1122,70 @@ static struct block * generate_dispatcher(struct lightrec_state *state) jit_prepare(); jit_pushargr(LIGHTREC_REG_STATE); + jit_finishi(lightrec_memset); + jit_retval(LIGHTREC_REG_CYCLE); jit_ldxi_ui(JIT_V0, LIGHTREC_REG_STATE, offsetof(struct lightrec_state, regs.gpr[31])); - - jit_retval(LIGHTREC_REG_CYCLE); jit_subr(LIGHTREC_REG_CYCLE, JIT_V1, LIGHTREC_REG_CYCLE); + + if (OPT_DETECT_IMPOSSIBLE_BRANCHES || OPT_HANDLE_LOAD_DELAYS) + jmp = jit_b(); + } + + if (OPT_DETECT_IMPOSSIBLE_BRANCHES) { + /* Blocks will jump here when they reach a branch that should + * be executed with the interpreter, passing the branch's PC + * in JIT_V0 and the address of the block in JIT_V1. */ + addr4 = jit_indirect(); + + update_cycle_counter_before_c(_jit); + + jit_prepare(); + jit_pushargr(LIGHTREC_REG_STATE); + jit_pushargr(JIT_V1); + jit_pushargr(JIT_V0); + jit_finishi(lightrec_emulate_block); + + jit_retval(JIT_V0); + + update_cycle_counter_after_c(_jit); + + if (OPT_HANDLE_LOAD_DELAYS) + jmp2 = jit_b(); + + } + + if (OPT_HANDLE_LOAD_DELAYS) { + /* Blocks will jump here when they reach a branch with a load + * opcode in its delay slot. The delay slot has already been + * executed; the load value is in (state->temp_reg), and the + * register number is in JIT_V1. + * Jump to a C function which will evaluate the branch target's + * first opcode, to make sure that it does not read the register + * in question; and if it does, handle it accordingly. */ + addr5 = jit_indirect(); + + update_cycle_counter_before_c(_jit); + + jit_prepare(); + jit_pushargr(LIGHTREC_REG_STATE); + jit_pushargr(JIT_V0); + jit_pushargr(JIT_V1); + jit_finishi(lightrec_check_load_delay); + + jit_retval(JIT_V0); + + update_cycle_counter_after_c(_jit); + + if (OPT_DETECT_IMPOSSIBLE_BRANCHES) + jit_patch(jmp2); + } + + if (OPT_REPLACE_MEMSET + && (OPT_DETECT_IMPOSSIBLE_BRANCHES || OPT_HANDLE_LOAD_DELAYS)) { + jit_patch(jmp); } /* The block will jump here, with the number of cycles remaining in @@ -1077,7 +1209,7 @@ static struct block * generate_dispatcher(struct lightrec_state *state) /* If possible, use the code LUT */ if (!lut_is_32bit(state)) jit_lshi(JIT_V1, JIT_V1, 1); - jit_addr(JIT_V1, JIT_V1, LIGHTREC_REG_STATE); + jit_add_state(JIT_V1, JIT_V1); offset = offsetof(struct lightrec_state, code_lut); if (lut_is_32bit(state)) @@ -1097,11 +1229,7 @@ static struct block * generate_dispatcher(struct lightrec_state *state) if (ENABLE_FIRST_PASS || OPT_DETECT_IMPOSSIBLE_BRANCHES) { /* We may call the interpreter - update state->current_cycle */ - jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE, - offsetof(struct lightrec_state, target_cycle)); - jit_subr(JIT_V1, JIT_R2, LIGHTREC_REG_CYCLE); - jit_stxi_i(offsetof(struct lightrec_state, current_cycle), - LIGHTREC_REG_STATE, JIT_V1); + update_cycle_counter_before_c(_jit); } jit_prepare(); @@ -1119,11 +1247,7 @@ static struct block * generate_dispatcher(struct lightrec_state *state) if (ENABLE_FIRST_PASS || OPT_DETECT_IMPOSSIBLE_BRANCHES) { /* The interpreter may have updated state->current_cycle and * state->target_cycle - recalc the delta */ - jit_ldxi_i(JIT_R1, LIGHTREC_REG_STATE, - offsetof(struct lightrec_state, current_cycle)); - jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE, - offsetof(struct lightrec_state, target_cycle)); - jit_subr(LIGHTREC_REG_CYCLE, JIT_R2, JIT_R1); + update_cycle_counter_after_c(_jit); } else { jit_movr(LIGHTREC_REG_CYCLE, JIT_V0); } @@ -1153,6 +1277,10 @@ static struct block * generate_dispatcher(struct lightrec_state *state) goto err_free_block; state->eob_wrapper_func = jit_address(addr2); + if (OPT_DETECT_IMPOSSIBLE_BRANCHES) + state->interpreter_func = jit_address(addr4); + if (OPT_HANDLE_LOAD_DELAYS) + state->ds_check_func = jit_address(addr5); if (OPT_REPLACE_MEMSET) state->memset_func = jit_address(addr3); state->get_next_block = jit_address(addr); @@ -1183,7 +1311,7 @@ union code lightrec_read_opcode(struct lightrec_state *state, u32 pc) return (union code) LE32TOH(*code); } -unsigned int lightrec_cycles_of_opcode(union code code) +__cnst unsigned int lightrec_cycles_of_opcode(union code code) { return 2; } @@ -1291,11 +1419,6 @@ static struct block * lightrec_precompile_block(struct lightrec_state *state, pr_debug("Block size: %hu opcodes\n", block->nb_ops); - /* If the first opcode is an 'impossible' branch, never compile the - * block */ - if (should_emulate(block->opcode_list)) - block_flags |= BLOCK_NEVER_COMPILE; - fully_tagged = lightrec_block_is_fully_tagged(block); if (fully_tagged) block_flags |= BLOCK_FULLY_TAGGED; @@ -1311,7 +1434,7 @@ static struct block * lightrec_precompile_block(struct lightrec_state *state, addr = state->get_next_block; lut_write(state, lut_offset(pc), addr); - pr_debug("Recompile count: %u\n", state->nb_precompile++); + pr_debug("Blocks created: %u\n", ++state->nb_precompile); return block; } @@ -1324,8 +1447,12 @@ static bool lightrec_block_is_fully_tagged(const struct block *block) for (i = 0; i < block->nb_ops; i++) { op = &block->opcode_list[i]; - /* Verify that all load/stores of the opcode list - * Check all loads/stores of the opcode list and mark the + /* If we have one branch that must be emulated, we cannot trash + * the opcode list. */ + if (should_emulate(op)) + return false; + + /* Check all loads/stores of the opcode list and mark the * block as fully compiled if they all have been tagged. */ switch (op->c.i.op) { case OP_LB: @@ -1421,6 +1548,7 @@ int lightrec_compile_block(struct lightrec_cstate *cstate, cstate->cycles = 0; cstate->nb_local_branches = 0; cstate->nb_targets = 0; + cstate->no_load_delay = false; jit_prolog(); jit_tramp(256); @@ -1439,7 +1567,7 @@ int lightrec_compile_block(struct lightrec_cstate *cstate, pr_debug("Branch at offset 0x%x will be emulated\n", i << 2); - lightrec_emit_eob(cstate, block, i); + lightrec_emit_jump_to_interpreter(cstate, block, i); skip_next = !op_flag_no_ds(elm->flags); } else { lightrec_rec_opcode(cstate, block, i); @@ -1603,6 +1731,8 @@ int lightrec_compile_block(struct lightrec_cstate *cstate, lightrec_unregister(MEM_FOR_CODE, old_code_size); } + pr_debug("Blocks compiled: %u\n", ++state->nb_compile); + return 0; } @@ -1775,6 +1905,7 @@ struct lightrec_state * lightrec_init(char *argv0, state->tlsf = tlsf; state->with_32bit_lut = with_32bit_lut; + state->in_delay_slot_n = 0xff; state->block_cache = lightrec_blockcache_init(state); if (!state->block_cache) diff --git a/deps/lightrec/lightrec.h b/deps/lightrec/lightrec.h index 9cd7f478..bd878c86 100644 --- a/deps/lightrec/lightrec.h +++ b/deps/lightrec/lightrec.h @@ -28,6 +28,21 @@ extern "C" { # define __api #endif +#ifndef __cnst +# ifdef __GNUC__ +# define __cnst __attribute__((const)) +# else +# define __cnst +# endif +#endif +#ifndef __pure +# ifdef __GNUC__ +# define __pure __attribute__((pure)) +# else +# define __pure +# endif +#endif + typedef uint64_t u64; typedef uint32_t u32; typedef uint16_t u16; @@ -119,7 +134,8 @@ __api void lightrec_set_invalidate_mode(struct lightrec_state *state, __api void lightrec_set_exit_flags(struct lightrec_state *state, u32 flags); __api u32 lightrec_exit_flags(struct lightrec_state *state); -__api struct lightrec_registers * lightrec_get_registers(struct lightrec_state *state); +__api __cnst struct lightrec_registers * +lightrec_get_registers(struct lightrec_state *state); __api u32 lightrec_current_cycle_count(const struct lightrec_state *state); __api void lightrec_reset_cycle_count(struct lightrec_state *state, u32 cycles); diff --git a/deps/lightrec/memmanager.c b/deps/lightrec/memmanager.c index c7502cdb..2934d4c7 100644 --- a/deps/lightrec/memmanager.c +++ b/deps/lightrec/memmanager.c @@ -9,7 +9,7 @@ #include -#ifdef ENABLE_THREADED_COMPILER +#if ENABLE_THREADED_COMPILER #include static atomic_uint lightrec_bytes[MEM_TYPE_END]; diff --git a/deps/lightrec/optimizer.c b/deps/lightrec/optimizer.c index 04d9d809..5ce58ada 100644 --- a/deps/lightrec/optimizer.c +++ b/deps/lightrec/optimizer.c @@ -115,6 +115,8 @@ static u64 opcode_read_mask(union code op) case OP_SW: case OP_SWR: return BIT(op.i.rs) | BIT(op.i.rt); + case OP_META: + return BIT(op.m.rs); default: return BIT(op.i.rs); } @@ -139,12 +141,14 @@ static u64 mult_div_write_mask(union code op) return flags; } -static u64 opcode_write_mask(union code op) +u64 opcode_write_mask(union code op) { switch (op.i.op) { case OP_META_MULT2: case OP_META_MULTU2: return mult_div_write_mask(op); + case OP_META: + return BIT(op.m.rd); case OP_SPECIAL: switch (op.r.op) { case OP_SPECIAL_JR: @@ -182,8 +186,6 @@ static u64 opcode_write_mask(union code op) case OP_LBU: case OP_LHU: case OP_LWR: - case OP_META_EXTC: - case OP_META_EXTS: return BIT(op.i.rt); case OP_JAL: return BIT(31); @@ -214,8 +216,6 @@ static u64 opcode_write_mask(union code op) default: return 0; } - case OP_META_MOV: - return BIT(op.r.rd); default: return 0; } @@ -339,7 +339,39 @@ static bool reg_is_read_or_written(const struct opcode *list, return reg_is_read(list, a, b, reg) || reg_is_written(list, a, b, reg); } -static bool opcode_is_load(union code op) +bool opcode_is_mfc(union code op) +{ + switch (op.i.op) { + case OP_CP0: + switch (op.r.rs) { + case OP_CP0_MFC0: + case OP_CP0_CFC0: + return true; + default: + break; + } + + break; + case OP_CP2: + if (op.r.op == OP_CP2_BASIC) { + switch (op.r.rs) { + case OP_CP2_BASIC_MFC2: + case OP_CP2_BASIC_CFC2: + return true; + default: + break; + } + } + + break; + default: + break; + } + + return false; +} + +bool opcode_is_load(union code op) { switch (op.i.op) { case OP_LB: @@ -456,46 +488,6 @@ static bool is_nop(union code op) } } -bool load_in_delay_slot(union code op) -{ - switch (op.i.op) { - case OP_CP0: - switch (op.r.rs) { - case OP_CP0_MFC0: - case OP_CP0_CFC0: - return true; - default: - break; - } - - break; - case OP_CP2: - if (op.r.op == OP_CP2_BASIC) { - switch (op.r.rs) { - case OP_CP2_BASIC_MFC2: - case OP_CP2_BASIC_CFC2: - return true; - default: - break; - } - } - - break; - case OP_LB: - case OP_LH: - case OP_LW: - case OP_LWL: - case OP_LWR: - case OP_LBU: - case OP_LHU: - return true; - default: - break; - } - - return false; -} - static void lightrec_optimize_sll_sra(struct opcode *list, unsigned int offset, struct constprop_data *v) { @@ -592,9 +584,10 @@ static void lightrec_optimize_sll_sra(struct opcode *list, unsigned int offset, ldop->i.rt = next->r.rd; to_change->opcode = 0; } else { - to_change->i.op = OP_META_MOV; - to_change->r.rd = next->r.rd; - to_change->r.rs = ldop->i.rt; + to_change->i.op = OP_META; + to_change->m.op = OP_META_MOV; + to_change->m.rd = next->r.rd; + to_change->m.rs = ldop->i.rt; } if (to_nop->r.imm == 24) @@ -611,18 +604,9 @@ static void lightrec_optimize_sll_sra(struct opcode *list, unsigned int offset, pr_debug("Convert SLL/SRA #%u to EXT%c\n", curr->r.imm, curr->r.imm == 24 ? 'C' : 'S'); - if (to_change == curr) { - to_change->i.rs = curr->r.rt; - to_change->i.rt = next->r.rd; - } else { - to_change->i.rt = next->r.rd; - to_change->i.rs = curr->r.rt; - } - - if (to_nop->r.imm == 24) - to_change->i.op = OP_META_EXTC; - else - to_change->i.op = OP_META_EXTS; + to_change->m.rs = curr->r.rt; + to_change->m.op = to_nop->r.imm == 24 ? OP_META_EXTC : OP_META_EXTS; + to_change->i.op = OP_META; } to_nop->opcode = 0; @@ -678,6 +662,12 @@ static void lightrec_modify_lui(struct block *block, unsigned int offset) break; if (opcode_writes_register(c, lui->i.rt)) { + if (c.i.op == OP_LWL || c.i.op == OP_LWR) { + /* LWL/LWR only partially write their target register; + * therefore the LUI should not write a different value. */ + break; + } + pr_debug("Convert LUI at offset 0x%x to kuseg\n", i - 1 << 2); lui->i.imm = kunseg(lui->i.imm << 16) >> 16; @@ -796,13 +786,11 @@ static void lightrec_patch_known_zero(struct opcode *op, case OP_ANDI: case OP_ORI: case OP_XORI: - case OP_META_MOV: - case OP_META_EXTC: - case OP_META_EXTS: case OP_META_MULT2: case OP_META_MULTU2: - if (is_known_zero(v, op->i.rs)) - op->i.rs = 0; + case OP_META: + if (is_known_zero(v, op->m.rs)) + op->m.rs = 0; break; case OP_SB: case OP_SH: @@ -842,9 +830,14 @@ static void lightrec_reset_syncs(struct block *block) for (i = 0; i < block->nb_ops; i++) { op = &list[i]; - if (op_flag_local_branch(op->flags) && has_delay_slot(op->c)) { - offset = i + 1 + (s16)op->i.imm; - list[offset].flags |= LIGHTREC_SYNC; + if (has_delay_slot(op->c)) { + if (op_flag_local_branch(op->flags)) { + offset = i + 1 - op_flag_no_ds(op->flags) + (s16)op->i.imm; + list[offset].flags |= LIGHTREC_SYNC; + } + + if (op_flag_emulate_branch(op->flags) && i + 2 < block->nb_ops) + list[i + 2].flags |= LIGHTREC_SYNC; } } } @@ -860,7 +853,7 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl for (i = 0; i < block->nb_ops; i++) { op = &list[i]; - lightrec_consts_propagate(list, i, v); + lightrec_consts_propagate(block, i, v); lightrec_patch_known_zero(op, v); @@ -963,8 +956,9 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl case OP_ADDIU: if (op->i.imm == 0) { pr_debug("Convert ORI/ADDI/ADDIU #0 to MOV\n"); - op->i.op = OP_META_MOV; - op->r.rd = op->i.rt; + op->m.rd = op->i.rt; + op->m.op = OP_META_MOV; + op->i.op = OP_META; } break; case OP_ANDI: @@ -974,8 +968,9 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl if (op->i.rs == op->i.rt) { op->opcode = 0; } else { - op->i.op = OP_META_MOV; - op->r.rd = op->i.rt; + op->m.rd = op->i.rt; + op->m.op = OP_META_MOV; + op->i.op = OP_META; } } break; @@ -1023,8 +1018,9 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl case OP_SPECIAL_SRA: if (op->r.imm == 0) { pr_debug("Convert SRA #0 to MOV\n"); - op->i.op = OP_META_MOV; - op->r.rs = op->r.rt; + op->m.rs = op->r.rt; + op->m.op = OP_META_MOV; + op->i.op = OP_META; break; } break; @@ -1041,8 +1037,9 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl case OP_SPECIAL_SLL: if (op->r.imm == 0) { pr_debug("Convert SLL #0 to MOV\n"); - op->i.op = OP_META_MOV; - op->r.rs = op->r.rt; + op->m.rs = op->r.rt; + op->m.op = OP_META_MOV; + op->i.op = OP_META; } lightrec_optimize_sll_sra(block->opcode_list, i, v); @@ -1060,8 +1057,9 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl case OP_SPECIAL_SRL: if (op->r.imm == 0) { pr_debug("Convert SRL #0 to MOV\n"); - op->i.op = OP_META_MOV; - op->r.rs = op->r.rt; + op->m.rs = op->r.rt; + op->m.op = OP_META_MOV; + op->i.op = OP_META; } break; @@ -1087,20 +1085,31 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl op->r.op = ctz32(v[op->r.rt].value); break; + case OP_SPECIAL_NOR: + if (op->r.rs == 0 || op->r.rt == 0) { + pr_debug("Convert NOR $zero to COM\n"); + op->i.op = OP_META; + op->m.op = OP_META_COM; + if (!op->m.rs) + op->m.rs = op->r.rt; + } + break; case OP_SPECIAL_OR: case OP_SPECIAL_ADD: case OP_SPECIAL_ADDU: if (op->r.rs == 0) { pr_debug("Convert OR/ADD $zero to MOV\n"); - op->i.op = OP_META_MOV; - op->r.rs = op->r.rt; + op->m.rs = op->r.rt; + op->m.op = OP_META_MOV; + op->i.op = OP_META; } fallthrough; case OP_SPECIAL_SUB: case OP_SPECIAL_SUBU: if (op->r.rt == 0) { pr_debug("Convert OR/ADD/SUB $zero to MOV\n"); - op->i.op = OP_META_MOV; + op->m.op = OP_META_MOV; + op->i.op = OP_META; } fallthrough; default: @@ -1197,6 +1206,9 @@ static int lightrec_switch_delay_slots(struct lightrec_state *state, struct bloc if (op_flag_sync(next->flags)) continue; + if (op_flag_load_delay(next->flags) && opcode_is_load(next_op)) + continue; + if (!lightrec_can_switch_delay_slot(list->c, next_op)) continue; @@ -1214,52 +1226,20 @@ static int lightrec_switch_delay_slots(struct lightrec_state *state, struct bloc return 0; } -static int shrink_opcode_list(struct lightrec_state *state, struct block *block, u16 new_size) -{ - struct opcode_list *list, *old_list; - - if (new_size >= block->nb_ops) { - pr_err("Invalid shrink size (%u vs %u)\n", - new_size, block->nb_ops); - return -EINVAL; - } - - list = lightrec_malloc(state, MEM_FOR_IR, - sizeof(*list) + sizeof(struct opcode) * new_size); - if (!list) { - pr_err("Unable to allocate memory\n"); - return -ENOMEM; - } - - old_list = container_of(block->opcode_list, struct opcode_list, ops); - memcpy(list->ops, old_list->ops, sizeof(struct opcode) * new_size); - - lightrec_free_opcode_list(state, block->opcode_list); - list->nb_ops = new_size; - block->nb_ops = new_size; - block->opcode_list = list->ops; - - pr_debug("Shrunk opcode list of block PC 0x%08x to %u opcodes\n", - block->pc, new_size); - - return 0; -} - static int lightrec_detect_impossible_branches(struct lightrec_state *state, struct block *block) { struct opcode *op, *list = block->opcode_list, *next = &list[0]; unsigned int i; int ret = 0; - s16 offset; for (i = 0; i < block->nb_ops - 1; i++) { op = next; next = &list[i + 1]; if (!has_delay_slot(op->c) || - (!load_in_delay_slot(next->c) && - !has_delay_slot(next->c) && + (!has_delay_slot(next->c) && + !opcode_is_mfc(next->c) && !(next->i.op == OP_CP0 && next->r.rs == OP_CP0_RFE))) continue; @@ -1270,40 +1250,120 @@ static int lightrec_detect_impossible_branches(struct lightrec_state *state, continue; } - offset = i + 1 + (s16)op->i.imm; - if (load_in_delay_slot(next->c) && - (offset >= 0 && offset < block->nb_ops) && - !opcode_reads_register(list[offset].c, next->c.i.rt)) { - /* The 'impossible' branch is a local branch - we can - * verify here that the first opcode of the target does - * not use the target register of the delay slot */ - - pr_debug("Branch at offset 0x%x has load delay slot, " - "but is local and dest opcode does not read " - "dest register\n", i << 2); + op->flags |= LIGHTREC_EMULATE_BRANCH; + + if (OPT_LOCAL_BRANCHES && i + 2 < block->nb_ops) { + /* The interpreter will only emulate the branch, then + * return to the compiled code. Add a SYNC after the + * branch + delay slot in the case where the branch + * was not taken. */ + list[i + 2].flags |= LIGHTREC_SYNC; + } + } + + return ret; +} + +static bool is_local_branch(const struct block *block, unsigned int idx) +{ + const struct opcode *op = &block->opcode_list[idx]; + s32 offset; + + switch (op->c.i.op) { + case OP_BEQ: + case OP_BNE: + case OP_BLEZ: + case OP_BGTZ: + case OP_REGIMM: + offset = idx + 1 + (s16)op->c.i.imm; + if (offset >= 0 && offset < block->nb_ops) + return true; + fallthrough; + default: + return false; + } +} + +static int lightrec_handle_load_delays(struct lightrec_state *state, + struct block *block) +{ + struct opcode *op, *list = block->opcode_list; + unsigned int i; + s16 imm; + + for (i = 0; i < block->nb_ops; i++) { + op = &list[i]; + + if (!opcode_is_load(op->c) || !op->c.i.rt || op->c.i.op == OP_LWC2) + continue; + + if (!is_delay_slot(list, i)) { + /* Only handle load delays in delay slots. + * PSX games never abused load delay slots otherwise. */ continue; } - op->flags |= LIGHTREC_EMULATE_BRANCH; + if (is_local_branch(block, i - 1)) { + imm = (s16)list[i - 1].c.i.imm; - if (op == list) { - pr_debug("First opcode of block PC 0x%08x is an impossible branch\n", - block->pc); + if (!opcode_reads_register(list[i + imm].c, op->c.i.rt)) { + /* The target opcode of the branch is inside + * the block, and it does not read the register + * written to by the load opcode; we can ignore + * the load delay. */ + continue; + } + } - /* If the first opcode is an 'impossible' branch, we - * only keep the first two opcodes of the block (the - * branch itself + its delay slot) */ - if (block->nb_ops > 2) - ret = shrink_opcode_list(state, block, 2); - break; + op->flags |= LIGHTREC_LOAD_DELAY; + } + + return 0; +} + +static int lightrec_swap_load_delays(struct lightrec_state *state, + struct block *block) +{ + unsigned int i; + union code c, next; + bool in_ds = false, skip_next = false; + struct opcode op; + + if (block->nb_ops < 2) + return 0; + + for (i = 0; i < block->nb_ops - 2; i++) { + c = block->opcode_list[i].c; + + if (skip_next) { + skip_next = false; + } else if (!in_ds && opcode_is_load(c) && c.i.op != OP_LWC2) { + next = block->opcode_list[i + 1].c; + + if (c.i.op == OP_LWL && next.i.op == OP_LWR) + continue; + + if (opcode_reads_register(next, c.i.rt) + && !opcode_writes_register(next, c.i.rs)) { + pr_debug("Swapping opcodes at offset 0x%x to " + "respect load delay\n", i << 2); + + op = block->opcode_list[i]; + block->opcode_list[i] = block->opcode_list[i + 1]; + block->opcode_list[i + 1] = op; + skip_next = true; + } } + + in_ds = has_delay_slot(c); } - return ret; + return 0; } static int lightrec_local_branches(struct lightrec_state *state, struct block *block) { + const struct opcode *ds; struct opcode *list; unsigned int i; s32 offset; @@ -1311,25 +1371,19 @@ static int lightrec_local_branches(struct lightrec_state *state, struct block *b for (i = 0; i < block->nb_ops; i++) { list = &block->opcode_list[i]; - if (should_emulate(list)) + if (should_emulate(list) || !is_local_branch(block, i)) continue; - switch (list->i.op) { - case OP_BEQ: - case OP_BNE: - case OP_BLEZ: - case OP_BGTZ: - case OP_REGIMM: - offset = i + 1 + (s16)list->i.imm; - if (offset >= 0 && offset < block->nb_ops) - break; - fallthrough; - default: - continue; - } + offset = i + 1 + (s16)list->c.i.imm; pr_debug("Found local branch to offset 0x%x\n", offset << 2); + ds = get_delay_slot(block->opcode_list, i); + if (op_flag_load_delay(ds->flags) && opcode_is_load(ds->c)) { + pr_debug("Branch delay slot has a load delay - skip\n"); + continue; + } + if (should_emulate(&block->opcode_list[offset])) { pr_debug("Branch target must be emulated - skip\n"); continue; @@ -1388,7 +1442,7 @@ static bool op_writes_rd(union code c) { switch (c.i.op) { case OP_SPECIAL: - case OP_META_MOV: + case OP_META: return true; default: return false; @@ -1447,7 +1501,7 @@ static int lightrec_early_unload(struct lightrec_state *state, struct block *blo struct opcode *op; s16 last_r[34], last_w[34], last_sync = 0, next_sync = 0; u64 mask_r, mask_w, dirty = 0, loaded = 0; - u8 reg; + u8 reg, load_delay_reg = 0; memset(last_r, 0xff, sizeof(last_r)); memset(last_w, 0xff, sizeof(last_w)); @@ -1468,6 +1522,13 @@ static int lightrec_early_unload(struct lightrec_state *state, struct block *blo for (i = 0; i < block->nb_ops; i++) { op = &block->opcode_list[i]; + if (OPT_HANDLE_LOAD_DELAYS && load_delay_reg) { + /* Handle delayed register write from load opcodes in + * delay slots */ + last_w[load_delay_reg] = i; + load_delay_reg = 0; + } + if (op_flag_sync(op->flags) || should_emulate(op)) { /* The next opcode has the SYNC flag set, or is a branch * that should be emulated: unload all registers. */ @@ -1489,6 +1550,15 @@ static int lightrec_early_unload(struct lightrec_state *state, struct block *blo mask_r = opcode_read_mask(op->c); mask_w = opcode_write_mask(op->c); + if (op_flag_load_delay(op->flags) && opcode_is_load(op->c)) { + /* If we have a load opcode in a delay slot, its target + * register is actually not written there but at a + * later point, in the dispatcher. Prevent the algorithm + * from discarding its previous value. */ + load_delay_reg = op->c.i.rt; + mask_w &= ~BIT(op->c.i.rt); + } + for (reg = 0; reg < 34; reg++) { if (mask_r & BIT(reg)) { if (dirty & BIT(reg) && last_w[reg] < last_sync) { @@ -1553,37 +1623,32 @@ static int lightrec_flag_io(struct lightrec_state *state, struct block *block) for (i = 0; i < block->nb_ops; i++) { list = &block->opcode_list[i]; - lightrec_consts_propagate(block->opcode_list, i, v); + lightrec_consts_propagate(block, i, v); switch (list->i.op) { case OP_SB: case OP_SH: case OP_SW: - if (OPT_FLAG_STORES) { - /* Mark all store operations that target $sp or $gp - * as not requiring code invalidation. This is based - * on the heuristic that stores using one of these - * registers as address will never hit a code page. */ - if (list->i.rs >= 28 && list->i.rs <= 29 && - !state->maps[PSX_MAP_KERNEL_USER_RAM].ops) { - pr_debug("Flaging opcode 0x%08x as not " - "requiring invalidation\n", - list->opcode); - list->flags |= LIGHTREC_NO_INVALIDATE; - list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_DIRECT); - } + /* Mark all store operations that target $sp or $gp + * as not requiring code invalidation. This is based + * on the heuristic that stores using one of these + * registers as address will never hit a code page. */ + if (list->i.rs >= 28 && list->i.rs <= 29 && + !state->maps[PSX_MAP_KERNEL_USER_RAM].ops) { + pr_debug("Flaging opcode 0x%08x as not requiring invalidation\n", + list->opcode); + list->flags |= LIGHTREC_NO_INVALIDATE; + } - /* Detect writes whose destination address is inside the - * current block, using constant propagation. When these - * occur, we mark the blocks as not compilable. */ - if (is_known(v, list->i.rs) && - kunseg(v[list->i.rs].value) >= kunseg(block->pc) && - kunseg(v[list->i.rs].value) < (kunseg(block->pc) + - block->nb_ops * 4)) { - pr_debug("Self-modifying block detected\n"); - block_set_flags(block, BLOCK_NEVER_COMPILE); - list->flags |= LIGHTREC_SMC; - } + /* Detect writes whose destination address is inside the + * current block, using constant propagation. When these + * occur, we mark the blocks as not compilable. */ + if (is_known(v, list->i.rs) && + kunseg(v[list->i.rs].value) >= kunseg(block->pc) && + kunseg(v[list->i.rs].value) < (kunseg(block->pc) + block->nb_ops * 4)) { + pr_debug("Self-modifying block detected\n"); + block_set_flags(block, BLOCK_NEVER_COMPILE); + list->flags |= LIGHTREC_SMC; } fallthrough; case OP_SWL: @@ -1597,8 +1662,7 @@ static int lightrec_flag_io(struct lightrec_state *state, struct block *block) case OP_LWL: case OP_LWR: case OP_LWC2: - if (OPT_FLAG_IO && - (v[list->i.rs].known | v[list->i.rs].sign)) { + if (v[list->i.rs].known | v[list->i.rs].sign) { psx_map = lightrec_get_constprop_map(state, v, list->i.rs, (s16) list->i.imm); @@ -1664,6 +1728,16 @@ static int lightrec_flag_io(struct lightrec_state *state, struct block *block) break; } } + + if (!LIGHTREC_FLAGS_GET_IO_MODE(list->flags) + && list->i.rs >= 28 && list->i.rs <= 29 + && !state->maps[PSX_MAP_KERNEL_USER_RAM].ops) { + /* Assume that all I/O operations that target + * $sp or $gp will always only target a mapped + * memory (RAM, BIOS, scratchpad). */ + list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_DIRECT); + } + fallthrough; default: break; @@ -1862,7 +1936,7 @@ static int lightrec_flag_mults_divs(struct lightrec_state *state, struct block * for (i = 0; i < block->nb_ops - 1; i++) { list = &block->opcode_list[i]; - lightrec_consts_propagate(block->opcode_list, i, v); + lightrec_consts_propagate(block, i, v); switch (list->i.op) { case OP_SPECIAL: @@ -2079,11 +2153,13 @@ static int (*lightrec_optimizers[])(struct lightrec_state *state, struct block * IF_OPT(OPT_REMOVE_DIV_BY_ZERO_SEQ, &lightrec_remove_div_by_zero_check_sequence), IF_OPT(OPT_REPLACE_MEMSET, &lightrec_replace_memset), IF_OPT(OPT_DETECT_IMPOSSIBLE_BRANCHES, &lightrec_detect_impossible_branches), + IF_OPT(OPT_HANDLE_LOAD_DELAYS, &lightrec_handle_load_delays), + IF_OPT(OPT_HANDLE_LOAD_DELAYS, &lightrec_swap_load_delays), IF_OPT(OPT_TRANSFORM_OPS, &lightrec_transform_branches), IF_OPT(OPT_LOCAL_BRANCHES, &lightrec_local_branches), IF_OPT(OPT_TRANSFORM_OPS, &lightrec_transform_ops), IF_OPT(OPT_SWITCH_DELAY_SLOTS, &lightrec_switch_delay_slots), - IF_OPT(OPT_FLAG_IO || OPT_FLAG_STORES, &lightrec_flag_io), + IF_OPT(OPT_FLAG_IO, &lightrec_flag_io), IF_OPT(OPT_FLAG_MULT_DIV, &lightrec_flag_mults_divs), IF_OPT(OPT_EARLY_UNLOAD, &lightrec_early_unload), }; diff --git a/deps/lightrec/optimizer.h b/deps/lightrec/optimizer.h index 825042df..f2b1f30f 100644 --- a/deps/lightrec/optimizer.h +++ b/deps/lightrec/optimizer.h @@ -11,14 +11,16 @@ struct block; struct opcode; -_Bool opcode_reads_register(union code op, u8 reg); -_Bool opcode_writes_register(union code op, u8 reg); -_Bool has_delay_slot(union code op); +__cnst _Bool opcode_reads_register(union code op, u8 reg); +__cnst _Bool opcode_writes_register(union code op, u8 reg); +__cnst u64 opcode_write_mask(union code op); +__cnst _Bool has_delay_slot(union code op); _Bool is_delay_slot(const struct opcode *list, unsigned int offset); -_Bool load_in_delay_slot(union code op); -_Bool opcode_is_io(union code op); -_Bool is_unconditional_jump(union code c); -_Bool is_syscall(union code c); +__cnst _Bool opcode_is_mfc(union code op); +__cnst _Bool opcode_is_load(union code op); +__cnst _Bool opcode_is_io(union code op); +__cnst _Bool is_unconditional_jump(union code c); +__cnst _Bool is_syscall(union code c); _Bool should_emulate(const struct opcode *op); diff --git a/deps/lightrec/regcache.c b/deps/lightrec/regcache.c index c62ba3d5..2a7ffe92 100644 --- a/deps/lightrec/regcache.c +++ b/deps/lightrec/regcache.c @@ -49,6 +49,10 @@ static const char * mips_regs[] = { "lo", "hi", }; +/* Forward declaration(s) */ +static void clean_reg(jit_state_t *_jit, + struct native_register *nreg, u8 jit_reg, bool clean); + const char * lightrec_reg_name(u8 reg) { return mips_regs[reg]; @@ -219,14 +223,7 @@ static void lightrec_discard_nreg(struct native_register *nreg) static void lightrec_unload_nreg(struct regcache *cache, jit_state_t *_jit, struct native_register *nreg, u8 jit_reg) { - /* If we get a dirty register, store back the old value */ - if (nreg->prio == REG_IS_DIRTY) { - s16 offset = offsetof(struct lightrec_state, regs.gpr) - + (nreg->emulated_register << 2); - - jit_stxi_i(offset, LIGHTREC_REG_STATE, jit_reg); - } - + clean_reg(_jit, nreg, jit_reg, false); lightrec_discard_nreg(nreg); } @@ -519,6 +516,7 @@ void lightrec_free_regs(struct regcache *cache) static void clean_reg(jit_state_t *_jit, struct native_register *nreg, u8 jit_reg, bool clean) { + /* If we get a dirty register, store back the old value */ if (nreg->prio == REG_IS_DIRTY) { s16 offset = offsetof(struct lightrec_state, regs.gpr) + (nreg->emulated_register << 2); @@ -579,6 +577,11 @@ void lightrec_clean_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg) } } +bool lightrec_reg_is_loaded(struct regcache *cache, u16 reg) +{ + return !!find_mapped_reg(cache, reg, false); +} + void lightrec_clean_reg_if_loaded(struct regcache *cache, jit_state_t *_jit, u16 reg, bool unload) { diff --git a/deps/lightrec/regcache.h b/deps/lightrec/regcache.h index d242c54b..55f1cfd9 100644 --- a/deps/lightrec/regcache.h +++ b/deps/lightrec/regcache.h @@ -8,8 +8,13 @@ #include "lightning-wrapper.h" -#define NUM_REGS (JIT_V_NUM - 1) -#define LIGHTREC_REG_STATE (JIT_V(JIT_V_NUM - 1)) +#if defined(__sh__) +# define NUM_REGS JIT_V_NUM +# define LIGHTREC_REG_STATE _GBR +#else +# define NUM_REGS (JIT_V_NUM - 1) +# define LIGHTREC_REG_STATE (JIT_V(JIT_V_NUM - 1)) +#endif #if defined(__powerpc__) # define NUM_TEMPS JIT_R_NUM @@ -68,6 +73,7 @@ void lightrec_unload_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg); void lightrec_storeback_regs(struct regcache *cache, jit_state_t *_jit); _Bool lightrec_has_dirty_regs(struct regcache *cache); +_Bool lightrec_reg_is_loaded(struct regcache *cache, u16 reg); void lightrec_clean_reg_if_loaded(struct regcache *cache, jit_state_t *_jit, u16 reg, _Bool unload); void lightrec_discard_reg_if_loaded(struct regcache *cache, u16 reg); @@ -82,7 +88,7 @@ void lightrec_regcache_leave_branch(struct regcache *cache, struct regcache * lightrec_regcache_init(struct lightrec_state *state); void lightrec_free_regcache(struct regcache *cache); -const char * lightrec_reg_name(u8 reg); +__cnst const char * lightrec_reg_name(u8 reg); void lightrec_regcache_mark_live(struct regcache *cache, jit_state_t *_jit); -- 2.39.2