[subrepo]
remote = https://github.com/pcercuei/lightrec.git
branch = master
- commit = 3ff589bcb7d52b3a091fe0b922ba02a0b1a7f095
- parent = aced3eb3fcaa0fe13c44c4dd196cdab42555fd98
+ commit = fcf239e7e9d42fedb7a8de64057d6895acf3ceee
+ parent = 03ec8a8c606eb87642be336632e1792ab89650d8
method = merge
cmdver = 0.4.3
option(OPT_REMOVE_DIV_BY_ZERO_SEQ "(optimization) Remove div-by-zero check sequence" ON)
option(OPT_REPLACE_MEMSET "(optimization) Detect and replace memset with host variant" ON)
option(OPT_DETECT_IMPOSSIBLE_BRANCHES "(optimization) Detect impossible branches" ON)
+option(OPT_HANDLE_LOAD_DELAYS "(optimization) Detect load delays" ON)
option(OPT_TRANSFORM_OPS "(optimization) Transform opcodes" ON)
option(OPT_LOCAL_BRANCHES "(optimization) Detect local branches" ON)
option(OPT_SWITCH_DELAY_SLOTS "(optimization) Switch delay slots" ON)
-option(OPT_FLAG_STORES "(optimization) Flag stores that don't require invalidation" ON)
-option(OPT_FLAG_IO "(optimization) Flag I/O opcodes whose target is known" ON)
+option(OPT_FLAG_IO "(optimization) Flag I/O opcodes when the target can be detected" ON)
option(OPT_FLAG_MULT_DIV "(optimization) Flag MULT/DIV that only use one of HI/LO" ON)
option(OPT_EARLY_UNLOAD "(optimization) Unload registers early" ON)
Basically, just a singly-linked list of structures representing the
instructions. On that list, several optimization steps are performed:
instructions are modified, reordered, tagged; new meta-instructions
-can be added, for instance to tell the code generator that a certain
-register won't be used anymore.
+can also be added.
* __Lazy compilation__.
If Lightrec detects a block of code that would be very hard to
Lightrec has been ported to the following emulators:
-* [__PCSX-ReArmed__ (my own fork)](https://github.com/pcercuei/pcsx_rearmed)
+* [__PCSX-ReArmed__ (libretro)](https://github.com/libretro/pcsx_rearmed)
* [__pcsx4all__ (my own fork)](https://github.com/pcercuei/pcsx4all)
* [__Beetle__ (libretro)](https://github.com/libretro/beetle-psx-libretro/)
+* [__CubeSX/WiiSX__](https://github.com/emukidid/pcsxgc/)
+
[![Star History Chart](https://api.star-history.com/svg?repos=pcercuei/lightrec&type=Date)](https://star-history.com/#pcercuei/lightrec&Date)
}
}
-void lightrec_consts_propagate(const struct opcode *list,
+void lightrec_consts_propagate(const struct block *block,
unsigned int idx,
struct constprop_data *v)
{
+ const struct opcode *list = block->opcode_list;
union code c;
- u32 imm;
+ u32 imm, flags;
if (idx == 0)
return;
return;
}
- if (idx > 1 && !op_flag_sync(list[idx - 1].flags)) {
- c = list[idx - 2].c;
+ flags = list[idx - 1].flags;
+
+ if (idx > 1 && !op_flag_sync(flags)) {
+ if (op_flag_no_ds(flags))
+ c = list[idx - 1].c;
+ else
+ c = list[idx - 2].c;
switch (c.i.op) {
case OP_BNE:
v[c.r.rd].known = 0;
v[c.r.rd].sign = 0;
break;
+
+ case OP_SPECIAL_JALR:
+ v[c.r.rd].known = 0xffffffff;
+ v[c.r.rd].sign = 0;
+			v[c.r.rd].value = block->pc + ((idx + 2) << 2);
+ break;
+
default:
break;
}
imm = imm ? GENMASK(31, 32 - imm) : 0;
v[c.i.rt].sign = 0;
}
- v[c.i.rt].known &= ~imm;
+ v[c.i.rt].known &= imm;
break;
}
fallthrough;
v[c.i.rt].known = 0;
v[c.i.rt].sign = 0;
break;
- case OP_META_MOV:
- v[c.r.rd] = v[c.r.rs];
- break;
- case OP_META_EXTC:
- v[c.i.rt].value = (s32)(s8)v[c.i.rs].value;
- if (v[c.i.rs].known & BIT(7)) {
- v[c.i.rt].known = v[c.i.rs].known | 0xffffff00;
- v[c.i.rt].sign = 0;
- } else {
- v[c.i.rt].known = v[c.i.rs].known & 0x7f;
- v[c.i.rt].sign = 0xffffff80;
- }
- break;
+ case OP_META:
+ switch (c.m.op) {
+ case OP_META_MOV:
+ v[c.m.rd] = v[c.m.rs];
+ break;
- case OP_META_EXTS:
- v[c.i.rt].value = (s32)(s16)v[c.i.rs].value;
- if (v[c.i.rs].known & BIT(15)) {
- v[c.i.rt].known = v[c.i.rs].known | 0xffff0000;
- v[c.i.rt].sign = 0;
- } else {
- v[c.i.rt].known = v[c.i.rs].known & 0x7fff;
- v[c.i.rt].sign = 0xffff8000;
+ case OP_META_EXTC:
+ v[c.m.rd].value = (s32)(s8)v[c.m.rs].value;
+ if (v[c.m.rs].known & BIT(7)) {
+ v[c.m.rd].known = v[c.m.rs].known | 0xffffff00;
+ v[c.m.rd].sign = 0;
+ } else {
+ v[c.m.rd].known = v[c.m.rs].known & 0x7f;
+ v[c.m.rd].sign = 0xffffff80;
+ }
+ break;
+
+ case OP_META_EXTS:
+ v[c.m.rd].value = (s32)(s16)v[c.m.rs].value;
+ if (v[c.m.rs].known & BIT(15)) {
+ v[c.m.rd].known = v[c.m.rs].known | 0xffff0000;
+ v[c.m.rd].sign = 0;
+ } else {
+ v[c.m.rd].known = v[c.m.rs].known & 0x7fff;
+ v[c.m.rd].sign = 0xffff8000;
+ }
+ break;
+
+ case OP_META_COM:
+ v[c.m.rd].known = v[c.m.rs].known;
+ v[c.m.rd].value = ~v[c.m.rs].value;
+ v[c.m.rd].sign = v[c.m.rs].sign;
+ break;
+ default:
+ break;
}
break;
+ case OP_JAL:
+ v[31].known = 0xffffffff;
+ v[31].sign = 0;
+		v[31].value = block->pc + ((idx + 2) << 2);
+ break;
default:
break;
#define LIGHTREC_CONSTPROP_INITIALIZER { { 0, 0xffffffff, 0 }, }
-struct opcode;
+struct block;
struct constprop_data {
u32 value;
return bits_are_known_zero(v, reg, 0xffffffff);
}
-void lightrec_consts_propagate(const struct opcode *list,
+void lightrec_consts_propagate(const struct block *block,
unsigned int idx,
struct constprop_data *v);
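
To make the known/sign bookkeeping concrete, here is a short worked example for the OP_META_EXTC case above (interpreting `sign` as the set of bits known to mirror the source's sign bit, as the code suggests; the input values are hypothetical):

/* Hypothetical input: the low byte of rs is fully known and equals 0x85. */
struct constprop_data rs = { .value = 0x85, .known = 0xff, .sign = 0 };

/* EXTC propagation: bit 7 of rs is known to be set, so the sign-extended
 * result is fully known: value = 0xffffff85, known = 0xffffffff, sign = 0.
 * Had bit 7 been unknown (known = 0x7f), the result would instead carry
 * known = 0x7f and sign = 0xffffff80, i.e. bits 31..7 would all equal the
 * (unknown) sign bit of the source byte. */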
[OP_CP2_NCCT] = "ncct ",
};
+static const char * const meta_opcodes[] = {
+ [OP_META_MOV] = "move ",
+ [OP_META_EXTC] = "extc ",
+ [OP_META_EXTS] = "exts ",
+ [OP_META_COM] = "com ",
+};
+
static const char * const mult2_opcodes[] = {
"mult2 ", "multu2 ",
};
"self-modifying code",
"no invalidation",
"no mask",
+ "load delay",
};
static const char * const opcode_io_modes[] = {
lightrec_reg_name(c.i.rt),
(s16)c.i.imm,
lightrec_reg_name(c.i.rs));
- case OP_META_MOV:
- return snprintf(buf, len, "move %s,%s",
- lightrec_reg_name(c.r.rd),
- lightrec_reg_name(c.r.rs));
- case OP_META_EXTC:
- return snprintf(buf, len, "extc %s,%s",
- lightrec_reg_name(c.i.rt),
- lightrec_reg_name(c.i.rs));
- case OP_META_EXTS:
- return snprintf(buf, len, "exts %s,%s",
- lightrec_reg_name(c.i.rt),
- lightrec_reg_name(c.i.rs));
+ case OP_META:
+ return snprintf(buf, len, "%s%s,%s",
+ meta_opcodes[c.m.op],
+ lightrec_reg_name(c.m.rd),
+ lightrec_reg_name(c.m.rs));
case OP_META_MULT2:
case OP_META_MULTU2:
*flags_ptr = opcode_multdiv_flags;
#define LIGHTREC_SMC BIT(2)
#define LIGHTREC_NO_INVALIDATE BIT(3)
#define LIGHTREC_NO_MASK BIT(4)
+#define LIGHTREC_LOAD_DELAY BIT(5)
/* I/O mode for load/store opcodes */
-#define LIGHTREC_IO_MODE_LSB 5
+#define LIGHTREC_IO_MODE_LSB 6
#define LIGHTREC_IO_MODE(x) ((x) << LIGHTREC_IO_MODE_LSB)
#define LIGHTREC_IO_UNKNOWN 0x0
#define LIGHTREC_IO_DIRECT 0x1
OP_LWC2 = 0x32,
OP_SWC2 = 0x3a,
- OP_META_MOV = 0x16,
-
- OP_META_EXTC = 0x17,
- OP_META_EXTS = 0x18,
+ OP_META = 0x3b,
OP_META_MULT2 = 0x19,
OP_META_MULTU2 = 0x1a,
OP_CP2_BASIC_CTC2 = 0x06,
};
+enum meta_opcodes {
+ OP_META_MOV = 0x00,
+
+ OP_META_EXTC = 0x01,
+ OP_META_EXTS = 0x02,
+
+ OP_META_COM = 0x03,
+};
+
struct opcode_r {
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
u32 zero :6;
#endif
} __packed;
+struct opcode_m {
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ u32 meta :6;
+ u32 rs :5;
+ u32 rt :5;
+ u32 rd :5;
+ u32 imm :6;
+ u32 op :5;
+#else
+ u32 op :5;
+ u32 imm :6;
+ u32 rd :5;
+ u32 rt :5;
+ u32 rs :5;
+ u32 meta :6;
+#endif
+};
+
union code {
/* Keep in sync with struct opcode */
u32 opcode;
struct opcode_r r;
struct opcode_i i;
struct opcode_j j;
+ struct opcode_m m;
};
struct opcode {
struct opcode_r r;
struct opcode_i i;
struct opcode_j j;
+ struct opcode_m m;
};
u32 flags;
};
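
A minimal sketch of how the new OP_META encoding is used (register numbers are arbitrary examples; bit positions refer to the little-endian layout above): the primary opcode field, aliased by i.op and m.meta, carries OP_META, while m.op selects the meta operation.

/* Hypothetical example: rewriting "addu $v0, $zero, $a0" as a meta MOV. */
union code c = { .opcode = 0 };

c.i.op = OP_META;	/* primary opcode field (bits 31:26) */
c.m.op = OP_META_MOV;	/* meta sub-opcode (bits 4:0) */
c.m.rd = 2;		/* $v0 */
c.m.rs = 4;		/* $a0 */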
static inline _Bool op_flag_smc(u32 flags)
{
- return OPT_FLAG_STORES && (flags & LIGHTREC_SMC);
+ return OPT_FLAG_IO && (flags & LIGHTREC_SMC);
}
static inline _Bool op_flag_no_invalidate(u32 flags)
{
- return (OPT_FLAG_IO || OPT_FLAG_STORES) &&
- (flags & LIGHTREC_NO_INVALIDATE);
+ return OPT_FLAG_IO && (flags & LIGHTREC_NO_INVALIDATE);
}
static inline _Bool op_flag_no_mask(u32 flags)
return OPT_FLAG_IO && (flags & LIGHTREC_NO_MASK);
}
+static inline _Bool op_flag_load_delay(u32 flags)
+{
+ return OPT_HANDLE_LOAD_DELAYS && (flags & LIGHTREC_LOAD_DELAY);
+}
+
static inline _Bool op_flag_emulate_branch(u32 flags)
{
return OPT_DETECT_IMPOSSIBLE_BRANCHES &&
static void rec_REGIMM(struct lightrec_cstate *state, const struct block *block, u16 offset);
static void rec_CP0(struct lightrec_cstate *state, const struct block *block, u16 offset);
static void rec_CP2(struct lightrec_cstate *state, const struct block *block, u16 offset);
+static void rec_META(struct lightrec_cstate *state, const struct block *block, u16 offset);
static void rec_cp2_do_mtc2(struct lightrec_cstate *state,
const struct block *block, u16 offset, u8 reg, u8 in_reg);
static void rec_cp2_do_mfc2(struct lightrec_cstate *state,
}
static void
-lightrec_jump_to_eob(struct lightrec_cstate *state, jit_state_t *_jit)
+lightrec_jump_to_fn(jit_state_t *_jit, void (*fn)(void))
{
/* Prevent jit_jmpi() from using our cycles register as a temporary */
jit_live(LIGHTREC_REG_CYCLE);
- jit_patch_abs(jit_jmpi(), state->state->eob_wrapper_func);
+ jit_patch_abs(jit_jmpi(), fn);
+}
+
+static void
+lightrec_jump_to_eob(struct lightrec_cstate *state, jit_state_t *_jit)
+{
+ lightrec_jump_to_fn(_jit, state->state->eob_wrapper_func);
+}
+
+static void
+lightrec_jump_to_ds_check(struct lightrec_cstate *state, jit_state_t *_jit)
+{
+ lightrec_jump_to_fn(_jit, state->state->ds_check_func);
}
static void update_ra_register(struct regcache *reg_cache, jit_state_t *_jit,
struct regcache *reg_cache = state->reg_cache;
jit_state_t *_jit = block->_jit;
const struct opcode *op = &block->opcode_list[offset],
- *next = &block->opcode_list[offset + 1];
+ *ds = get_delay_slot(block->opcode_list, offset);
u32 cycles = state->cycles + lightrec_cycles_of_opcode(op->c);
jit_note(__FILE__, __LINE__);
if (has_delay_slot(op->c) &&
!op_flag_no_ds(op->flags) && !op_flag_local_branch(op->flags)) {
- cycles += lightrec_cycles_of_opcode(next->c);
+ cycles += lightrec_cycles_of_opcode(ds->c);
/* Recompile the delay slot */
- if (next->c.opcode)
+ if (ds->c.opcode)
lightrec_rec_opcode(state, block, offset + 1);
}
pr_debug("EOB: %u cycles\n", cycles);
}
- lightrec_jump_to_eob(state, _jit);
+ if (op_flag_load_delay(ds->flags)
+ && opcode_is_load(ds->c) && !state->no_load_delay) {
+ /* If the delay slot is a load opcode, its target register
+ * will be written after the first opcode of the target is
+ * executed. Handle this by jumping to a special section of
+ * the dispatcher. It expects the loaded value to be in
+		 * REG_TEMP, and the target register number to be in JIT_V1. */
+ jit_movi(JIT_V1, ds->c.i.rt);
+
+ lightrec_jump_to_ds_check(state, _jit);
+ } else {
+ lightrec_jump_to_eob(state, _jit);
+ }
}
-void lightrec_emit_eob(struct lightrec_cstate *state,
- const struct block *block, u16 offset)
+void lightrec_emit_jump_to_interpreter(struct lightrec_cstate *state,
+ const struct block *block, u16 offset)
+{
+ struct regcache *reg_cache = state->reg_cache;
+ jit_state_t *_jit = block->_jit;
+
+ lightrec_clean_regs(reg_cache, _jit);
+
+ /* Call the interpreter with the block's address in JIT_V1 and the
+ * PC (which might have an offset) in JIT_V0. */
+ lightrec_load_imm(reg_cache, _jit, JIT_V0, block->pc,
+ block->pc + (offset << 2));
+ jit_movi(JIT_V1, (uintptr_t)block);
+
+ jit_subi(LIGHTREC_REG_CYCLE, LIGHTREC_REG_CYCLE, state->cycles);
+ lightrec_jump_to_fn(_jit, state->state->interpreter_func);
+}
+
+static void lightrec_emit_eob(struct lightrec_cstate *state,
+ const struct block *block, u16 offset)
{
struct regcache *reg_cache = state->reg_cache;
jit_state_t *_jit = block->_jit;
jit_state_t *_jit = block->_jit;
struct lightrec_branch *branch;
const struct opcode *op = &block->opcode_list[offset],
- *next = &block->opcode_list[offset + 1];
+ *ds = get_delay_slot(block->opcode_list, offset);
jit_node_t *addr;
- bool is_forward = (s16)op->i.imm >= -1;
+ bool is_forward = (s16)op->i.imm >= 0;
int op_cycles = lightrec_cycles_of_opcode(op->c);
u32 target_offset, cycles = state->cycles + op_cycles;
bool no_indirection = false;
jit_note(__FILE__, __LINE__);
if (!op_flag_no_ds(op->flags))
- cycles += lightrec_cycles_of_opcode(next->c);
+ cycles += lightrec_cycles_of_opcode(ds->c);
state->cycles = -op_cycles;
lightrec_do_early_unload(state, block, offset);
if (op_flag_local_branch(op->flags) &&
- (op_flag_no_ds(op->flags) || !next->opcode) &&
+ (op_flag_no_ds(op->flags) || !ds->opcode) &&
is_forward && !lightrec_has_dirty_regs(reg_cache))
no_indirection = true;
if (op_flag_local_branch(op->flags)) {
/* Recompile the delay slot */
- if (!op_flag_no_ds(op->flags) && next->opcode)
+ if (!op_flag_no_ds(op->flags) && ds->opcode) {
+ /* Never handle load delays with local branches. */
+ state->no_load_delay = true;
lightrec_rec_opcode(state, block, offset + 1);
+ }
if (link)
update_ra_register(reg_cache, _jit, 31, block->pc, link);
if (!op_flag_local_branch(op->flags) || !is_forward) {
next_pc = get_branch_pc(block, offset, 1 + (s16)op->i.imm);
+ state->no_load_delay = op_flag_local_branch(op->flags);
lightrec_emit_end_of_block(state, block, offset, -1, next_pc,
31, link, false);
}
if (bz && link)
update_ra_register(reg_cache, _jit, 31, block->pc, link);
- if (!op_flag_no_ds(op->flags) && next->opcode)
+ if (!op_flag_no_ds(op->flags) && ds->opcode) {
+ state->no_load_delay = true;
lightrec_rec_opcode(state, block, offset + 1);
+ }
}
}
u32 flags = block->opcode_list[offset].flags;
bool is_tagged = LIGHTREC_FLAGS_GET_IO_MODE(flags);
u32 lut_entry;
+ u8 zero;
jit_note(__FILE__, __LINE__);
else if (load_rt)
lightrec_clean_reg_if_loaded(reg_cache, _jit, c.i.rt, false);
+ if (op_flag_load_delay(flags) && !state->no_load_delay) {
+ /* Clear state->in_delay_slot_n. This notifies the lightrec_rw
+ * wrapper that it should write the REG_TEMP register instead of
+ * the actual output register of the opcode. */
+ zero = lightrec_alloc_reg_in(reg_cache, _jit, 0, 0);
+ jit_stxi_c(offsetof(struct lightrec_state, in_delay_slot_n),
+ LIGHTREC_REG_STATE, zero);
+ lightrec_free_reg(reg_cache, zero);
+ }
+
if (is_tagged) {
call_to_c_wrapper(state, block, c.opcode, C_WRAPPER_RW);
} else {
((imm & 0x3) || simm + lut_offt != (s16)(simm + lut_offt))));
bool need_tmp = !no_mask || addr_offset || add_imm || invalidate;
bool swc2 = c.i.op == OP_SWC2;
- u8 in_reg = swc2 ? REG_CP2_TEMP : c.i.rt;
+ u8 in_reg = swc2 ? REG_TEMP : c.i.rt;
rt = lightrec_alloc_reg_in(reg_cache, _jit, in_reg, 0);
rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0);
if (addr_reg == rs && c.i.rs == 0) {
addr_reg = LIGHTREC_REG_STATE;
} else {
- jit_addr(tmp, addr_reg, LIGHTREC_REG_STATE);
+ jit_add_state(tmp, addr_reg);
addr_reg = tmp;
}
jit_state_t *_jit = block->_jit;
jit_node_t *to_not_ram, *to_end;
bool swc2 = c.i.op == OP_SWC2;
- u8 tmp, tmp2, rs, rt, in_reg = swc2 ? REG_CP2_TEMP : c.i.rt;
+ bool offset_ram_or_scratch = state->offset_ram || state->offset_scratch;
+ u8 tmp, tmp2, rs, rt, in_reg = swc2 ? REG_TEMP : c.i.rt;
s16 imm;
jit_note(__FILE__, __LINE__);
rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0);
tmp = lightrec_alloc_reg_temp(reg_cache, _jit);
- if (state->offset_ram || state->offset_scratch)
+ if (offset_ram_or_scratch)
tmp2 = lightrec_alloc_reg_temp(reg_cache, _jit);
/* Convert to KUNSEG and avoid RAM mirrors */
jit_movi(tmp2, state->offset_ram);
}
- if (state->offset_ram || state->offset_scratch) {
+ if (offset_ram_or_scratch) {
jit_addr(tmp, tmp, tmp2);
lightrec_free_reg(reg_cache, tmp2);
}
jit_node_t *to_not_ram, *to_end;
bool swc2 = c.i.op == OP_SWC2;
u8 tmp, tmp2, tmp3, masked_reg, rs, rt;
- u8 in_reg = swc2 ? REG_CP2_TEMP : c.i.rt;
+ u8 in_reg = swc2 ? REG_TEMP : c.i.rt;
jit_note(__FILE__, __LINE__);
if (!lut_is_32bit(state))
jit_lshi(tmp, tmp, 1);
- jit_addr(tmp, LIGHTREC_REG_STATE, tmp);
+ jit_add_state(tmp, tmp);
/* Write NULL to the code LUT to invalidate any block that's there */
if (lut_is_32bit(state))
case LIGHTREC_IO_SCRATCH:
case LIGHTREC_IO_DIRECT:
case LIGHTREC_IO_DIRECT_HW:
- rec_cp2_do_mfc2(state, block, offset, c.i.rt, REG_CP2_TEMP);
+ rec_cp2_do_mfc2(state, block, offset, c.i.rt, REG_TEMP);
break;
default:
break;
}
if (is_swc2)
- lightrec_discard_reg_if_loaded(state->reg_cache, REG_CP2_TEMP);
+ lightrec_discard_reg_if_loaded(state->reg_cache, REG_TEMP);
}
static void rec_SB(struct lightrec_cstate *state,
{
struct regcache *reg_cache = cstate->reg_cache;
struct opcode *op = &block->opcode_list[offset];
+ bool load_delay = op_flag_load_delay(op->flags) && !cstate->no_load_delay;
jit_state_t *_jit = block->_jit;
u8 rs, rt, out_reg, addr_reg, flags = REG_EXT;
bool no_mask = op_flag_no_mask(op->flags);
union code c = op->c;
s16 imm;
- if (c.i.op == OP_LWC2)
- out_reg = REG_CP2_TEMP;
+ if (load_delay || c.i.op == OP_LWC2)
+ out_reg = REG_TEMP;
else if (c.i.rt)
out_reg = c.i.rt;
else
{
struct lightrec_state *state = cstate->state;
struct regcache *reg_cache = cstate->reg_cache;
- union code c = block->opcode_list[offset].c;
+ struct opcode *op = &block->opcode_list[offset];
+ bool load_delay = op_flag_load_delay(op->flags) && !cstate->no_load_delay;
jit_state_t *_jit = block->_jit;
jit_node_t *to_not_ram, *to_not_bios, *to_end, *to_end2;
u8 tmp, rs, rt, out_reg, addr_reg, flags = REG_EXT;
+ union code c = op->c;
s16 imm;
- if (c.i.op == OP_LWC2)
- out_reg = REG_CP2_TEMP;
+ if (load_delay || c.i.op == OP_LWC2)
+ out_reg = REG_TEMP;
else if (c.i.rt)
out_reg = c.i.rt;
else
}
if (op->i.op == OP_LWC2) {
- rec_cp2_do_mtc2(state, block, offset, op->i.rt, REG_CP2_TEMP);
- lightrec_discard_reg_if_loaded(state->reg_cache, REG_CP2_TEMP);
+ rec_cp2_do_mtc2(state, block, offset, op->i.rt, REG_TEMP);
+ lightrec_discard_reg_if_loaded(state->reg_cache, REG_TEMP);
}
}
jit_stxi_i(offsetof(struct lightrec_state, exit_flags),
LIGHTREC_REG_STATE, tmp);
+ jit_ldxi_i(tmp, LIGHTREC_REG_STATE,
+ offsetof(struct lightrec_state, target_cycle));
+ jit_subr(tmp, tmp, LIGHTREC_REG_CYCLE);
+ jit_movi(LIGHTREC_REG_CYCLE, 0);
+ jit_stxi_i(offsetof(struct lightrec_state, target_cycle),
+ LIGHTREC_REG_STATE, tmp);
+ jit_stxi_i(offsetof(struct lightrec_state, current_cycle),
+ LIGHTREC_REG_STATE, tmp);
+
lightrec_free_reg(reg_cache, tmp);
/* TODO: the return address should be "pc - 4" if we're a delay slot */
jit_note(__FILE__, __LINE__);
lightrec_clean_reg_if_loaded(reg_cache, _jit, c.i.rs, false);
lightrec_clean_reg_if_loaded(reg_cache, _jit, c.i.rt, false);
+ lightrec_clean_reg_if_loaded(reg_cache, _jit, REG_TEMP, false);
call_to_c_wrapper(state, block, c.opcode, C_WRAPPER_MTC);
lightrec_free_reg(reg_cache, rt);
}
-static bool block_in_bios(const struct lightrec_cstate *state,
- const struct block *block)
+static bool block_uses_icache(const struct lightrec_cstate *state,
+ const struct block *block)
{
- const struct lightrec_mem_map *bios = &state->state->maps[PSX_MAP_BIOS];
+ const struct lightrec_mem_map *map = &state->state->maps[PSX_MAP_KERNEL_USER_RAM];
u32 pc = kunseg(block->pc);
- return pc >= bios->pc && pc < bios->pc + bios->length;
+ if (pc < map->pc || pc >= map->pc + map->length)
+ return false;
+
+ return (block->pc >> 28) < 0xa;
}
static void
break;
}
- if (/*block_in_bios(state, block) &&*/ c.r.rd == 12) {
- /* If we are running code from the BIOS, handle writes to the
- * Status register in C. BIOS code may toggle bit 16 which will
- * map/unmap the RAM, while game code cannot do that. */
- /* ^ wrong, it can execute from 0xa0000000 with isolated cache */
+ if (!block_uses_icache(state, block) && c.r.rd == 12) {
+ /* If we are not running code from the RAM through kuseg or
+		 * kseg0, handle writes to the Status register in C, as the
+		 * code may toggle bit 16, which isolates the cache. Code
+ * running from kuseg or kseg0 in RAM cannot do that. */
rec_mtc(state, block, offset);
return;
}
{
struct regcache *reg_cache = state->reg_cache;
jit_state_t *_jit = block->_jit;
- jit_node_t *loop, *to_loop;
u8 rt, tmp, tmp2, flags = 0;
_jit_name(block->_jit, __func__);
break;
case 30:
tmp = lightrec_alloc_reg_temp(reg_cache, _jit);
- tmp2 = lightrec_alloc_reg_temp(reg_cache, _jit);
/* if (rt < 0) rt = ~rt; */
jit_rshi(tmp, rt, 31);
jit_xorr(tmp, rt, tmp);
- /* We know the sign bit is 0. Left-shift by 1 to start the algorithm */
- jit_lshi(tmp, tmp, 1);
- jit_movi(tmp2, 33);
-
- /* Decrement tmp2 and right-shift the value by 1 until it equals zero */
- loop = jit_label();
- jit_subi(tmp2, tmp2, 1);
- jit_rshi_u(tmp, tmp, 1);
- to_loop = jit_bnei(tmp, 0);
-
- jit_patch_at(to_loop, loop);
+ /* Count leading zeros */
+ jit_clzr(tmp, tmp);
+ if (__WORDSIZE != 32)
+ jit_subi(tmp, tmp, __WORDSIZE - 32);
- jit_stxi_i(cp2d_i_offset(31), LIGHTREC_REG_STATE, tmp2);
- jit_stxi_i(cp2d_i_offset(30), LIGHTREC_REG_STATE, rt);
+ jit_stxi_i(cp2d_i_offset(31), LIGHTREC_REG_STATE, tmp);
lightrec_free_reg(reg_cache, tmp);
- lightrec_free_reg(reg_cache, tmp2);
- break;
+ fallthrough;
default:
jit_stxi_i(cp2d_i_offset(reg), LIGHTREC_REG_STATE, rt);
break;
unload_rd = OPT_EARLY_UNLOAD
&& LIGHTREC_FLAGS_GET_RD(op->flags) == LIGHTREC_REG_UNLOAD;
- if (c.r.rs || unload_rd)
- rs = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rs, 0);
+ if (c.m.rs && !lightrec_reg_is_loaded(reg_cache, c.m.rs)) {
+ /* The source register is not yet loaded - we can load its value
+ * from the register cache directly into the target register. */
+ rd = lightrec_alloc_reg_out(reg_cache, _jit, c.m.rd, REG_EXT);
+
+ jit_ldxi_i(rd, LIGHTREC_REG_STATE,
+ offsetof(struct lightrec_state, regs.gpr) + (c.m.rs << 2));
- if (unload_rd) {
+ lightrec_free_reg(reg_cache, rd);
+ } else if (unload_rd) {
/* If the destination register will be unloaded right after the
* MOV meta-opcode, we don't actually need to write any host
* register - we can just store the source register directly to
* the register cache, at the offset corresponding to the
* destination register. */
- lightrec_discard_reg_if_loaded(reg_cache, c.r.rd);
+ lightrec_discard_reg_if_loaded(reg_cache, c.m.rd);
+
+ rs = lightrec_alloc_reg_in(reg_cache, _jit, c.m.rs, 0);
jit_stxi_i(offsetof(struct lightrec_state, regs.gpr)
- + c.r.rd << 2, LIGHTREC_REG_STATE, rs);
+ + (c.m.rd << 2), LIGHTREC_REG_STATE, rs);
lightrec_free_reg(reg_cache, rs);
} else {
- rd = lightrec_alloc_reg_out(reg_cache, _jit, c.r.rd, REG_EXT);
+ if (c.m.rs)
+ rs = lightrec_alloc_reg_in(reg_cache, _jit, c.m.rs, 0);
+
+ rd = lightrec_alloc_reg_out(reg_cache, _jit, c.m.rd, REG_EXT);
- if (c.r.rs == 0)
+ if (c.m.rs == 0) {
jit_movi(rd, 0);
- else
+ } else {
jit_extr_i(rd, rs);
+ lightrec_free_reg(reg_cache, rs);
+ }
lightrec_free_reg(reg_cache, rd);
}
-
- if (c.r.rs || unload_rd)
- lightrec_free_reg(reg_cache, rs);
}
static void rec_meta_EXTC_EXTS(struct lightrec_cstate *state,
struct regcache *reg_cache = state->reg_cache;
union code c = block->opcode_list[offset].c;
jit_state_t *_jit = block->_jit;
- u8 rs, rt;
+ u8 rs, rd;
_jit_name(block->_jit, __func__);
jit_note(__FILE__, __LINE__);
- rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0);
- rt = lightrec_alloc_reg_out(reg_cache, _jit, c.i.rt, REG_EXT);
+ rs = lightrec_alloc_reg_in(reg_cache, _jit, c.m.rs, 0);
+ rd = lightrec_alloc_reg_out(reg_cache, _jit, c.m.rd, REG_EXT);
- if (c.i.op == OP_META_EXTC)
- jit_extr_c(rt, rs);
+ if (c.m.op == OP_META_EXTC)
+ jit_extr_c(rd, rs);
else
- jit_extr_s(rt, rs);
+ jit_extr_s(rd, rs);
lightrec_free_reg(reg_cache, rs);
- lightrec_free_reg(reg_cache, rt);
+ lightrec_free_reg(reg_cache, rd);
}
static void rec_meta_MULT2(struct lightrec_cstate *state,
jit_note(__FILE__, __LINE__);
}
+static void rec_meta_COM(struct lightrec_cstate *state,
+ const struct block *block, u16 offset)
+{
+ struct regcache *reg_cache = state->reg_cache;
+ union code c = block->opcode_list[offset].c;
+ jit_state_t *_jit = block->_jit;
+ u8 rd, rs, flags;
+
+ jit_note(__FILE__, __LINE__);
+ rs = lightrec_alloc_reg_in(reg_cache, _jit, c.m.rs, 0);
+ rd = lightrec_alloc_reg_out(reg_cache, _jit, c.m.rd, 0);
+
+ flags = lightrec_get_reg_in_flags(reg_cache, rs);
+
+ lightrec_set_reg_out_flags(reg_cache, rd,
+ flags & REG_EXT);
+
+ jit_comr(rd, rs);
+
+ lightrec_free_reg(reg_cache, rs);
+ lightrec_free_reg(reg_cache, rd);
+}
+
static const lightrec_rec_func_t rec_standard[64] = {
SET_DEFAULT_ELM(rec_standard, unknown_opcode),
[OP_SPECIAL] = rec_SPECIAL,
[OP_LWC2] = rec_LW,
[OP_SWC2] = rec_SW,
- [OP_META_MOV] = rec_meta_MOV,
- [OP_META_EXTC] = rec_meta_EXTC_EXTS,
- [OP_META_EXTS] = rec_meta_EXTC_EXTS,
+ [OP_META] = rec_META,
[OP_META_MULT2] = rec_meta_MULT2,
[OP_META_MULTU2] = rec_meta_MULT2,
};
[OP_CP2_BASIC_CTC2] = rec_cp2_basic_CTC2,
};
+static const lightrec_rec_func_t rec_meta[64] = {
+ SET_DEFAULT_ELM(rec_meta, unknown_opcode),
+ [OP_META_MOV] = rec_meta_MOV,
+ [OP_META_EXTC] = rec_meta_EXTC_EXTS,
+ [OP_META_EXTS] = rec_meta_EXTC_EXTS,
+ [OP_META_COM] = rec_meta_COM,
+};
+
static void rec_SPECIAL(struct lightrec_cstate *state,
const struct block *block, u16 offset)
{
rec_CP(state, block, offset);
}
+static void rec_META(struct lightrec_cstate *state,
+ const struct block *block, u16 offset)
+{
+ union code c = block->opcode_list[offset].c;
+ lightrec_rec_func_t f = rec_meta[c.m.op];
+
+ if (!HAS_DEFAULT_ELM && unlikely(!f))
+ unknown_opcode(state, block, offset);
+ else
+ (*f)(state, block, offset);
+}
+
void lightrec_rec_opcode(struct lightrec_cstate *state,
const struct block *block, u16 offset)
{
lightrec_do_early_unload(state, block, unload_offset);
}
+
+ state->no_load_delay = false;
}
struct opcode;
void lightrec_rec_opcode(struct lightrec_cstate *state, const struct block *block, u16 offset);
-void lightrec_emit_eob(struct lightrec_cstate *state,
- const struct block *block, u16 offset);
+void lightrec_emit_jump_to_interpreter(struct lightrec_cstate *state,
+ const struct block *block, u16 offset);
#endif /* __EMITTER_H__ */
static u32 int_CP0(struct interpreter *inter);
static u32 int_CP2(struct interpreter *inter);
static u32 int_SPECIAL(struct interpreter *inter);
+static u32 int_META(struct interpreter *inter);
static u32 int_REGIMM(struct interpreter *inter);
static u32 int_branch(struct interpreter *inter, u32 pc,
union code code, bool branch);
static inline struct opcode *next_op(const struct interpreter *inter)
{
- return &inter->block->opcode_list[inter->offset + 1];
+ return &inter->op[1];
}
static inline u32 execute(lightrec_int_func_t func, struct interpreter *inter)
* interpreter in that case.
* Same goes for when we have a branch in a delay slot of another
* branch. */
- load_in_ds = load_in_delay_slot(op->c);
+ load_in_ds = opcode_is_load(op->c) || opcode_is_mfc(op->c);
branch_in_ds = has_delay_slot(op->c);
if (branch) {
new_op.c = op_next;
new_op.flags = 0;
inter2.op = &new_op;
+ inter2.offset = 0;
/* Execute the first opcode of the next block */
lightrec_int_op(&inter2);
inter2.block = inter->block;
inter2.op = op;
inter2.cycles = inter->cycles;
+ inter2.offset = inter->offset + 1;
if (dummy_ld)
new_rt = reg_cache[op->r.rt];
u32 old_pc = int_get_branch_pc(inter);
u32 next_pc = state->regs.gpr[inter->op->r.rs];
- if (op_flag_emulate_branch(inter->op->flags) && inter->offset) {
- inter->cycles -= lightrec_cycles_of_opcode(inter->op->c);
- return old_pc;
- }
-
if (link_reg)
state->regs.gpr[link_reg] = old_pc + 8;
{
u32 next_pc = pc + 4 + ((s16)code.i.imm << 2);
- if (op_flag_emulate_branch(inter->op->flags) && inter->offset) {
- inter->cycles -= lightrec_cycles_of_opcode(inter->op->c);
- return pc;
- }
-
update_cycles_before_branch(inter);
if (op_flag_no_ds(inter->op->flags)) {
{
struct opcode_i *op = &inter->op->i;
u32 *reg_cache = inter->state->regs.gpr;
- u32 val;
+ u32 val, *flags = NULL;
+
+ if (inter->block)
+ flags = &inter->op->flags;
val = lightrec_rw(inter->state, inter->op->c,
reg_cache[op->rs], reg_cache[op->rt],
- &inter->op->flags, inter->block);
+ flags, inter->block, inter->offset);
if (is_load && op->rt)
reg_cache[op->rt] = val;
lightrec_rw(inter->state, inter->op->c,
inter->state->regs.gpr[inter->op->i.rs],
inter->state->regs.gpr[inter->op->i.rt],
- &inter->op->flags, inter->block);
+ &inter->op->flags, inter->block, inter->offset);
next_pc = int_get_ds_pc(inter, 1);
{
if (inter->op->r.op == OP_SPECIAL_BREAK)
- inter->state->exit_flags |= LIGHTREC_EXIT_BREAK;
+ lightrec_set_exit_flags(inter->state, LIGHTREC_EXIT_BREAK);
else
- inter->state->exit_flags |= LIGHTREC_EXIT_SYSCALL;
+ lightrec_set_exit_flags(inter->state, LIGHTREC_EXIT_SYSCALL);
return int_get_ds_pc(inter, 0);
}
static u32 int_META_MOV(struct interpreter *inter)
{
u32 *reg_cache = inter->state->regs.gpr;
- struct opcode_r *op = &inter->op->r;
+ struct opcode_m *op = &inter->op->m;
if (likely(op->rd))
reg_cache[op->rd] = reg_cache[op->rs];
static u32 int_META_EXTC(struct interpreter *inter)
{
u32 *reg_cache = inter->state->regs.gpr;
- struct opcode_i *op = &inter->op->i;
+ struct opcode_m *op = &inter->op->m;
- if (likely(op->rt))
- reg_cache[op->rt] = (u32)(s32)(s8)reg_cache[op->rs];
+ if (likely(op->rd))
+ reg_cache[op->rd] = (u32)(s32)(s8)reg_cache[op->rs];
return jump_next(inter);
}
static u32 int_META_EXTS(struct interpreter *inter)
{
u32 *reg_cache = inter->state->regs.gpr;
- struct opcode_i *op = &inter->op->i;
+ struct opcode_m *op = &inter->op->m;
- if (likely(op->rt))
- reg_cache[op->rt] = (u32)(s32)(s16)reg_cache[op->rs];
+ if (likely(op->rd))
+ reg_cache[op->rd] = (u32)(s32)(s16)reg_cache[op->rs];
return jump_next(inter);
}
return jump_next(inter);
}
+static u32 int_META_COM(struct interpreter *inter)
+{
+ u32 *reg_cache = inter->state->regs.gpr;
+ union code c = inter->op->c;
+
+ if (likely(c.m.rd))
+ reg_cache[c.m.rd] = ~reg_cache[c.m.rs];
+
+ return jump_next(inter);
+}
+
static const lightrec_int_func_t int_standard[64] = {
SET_DEFAULT_ELM(int_standard, int_unimplemented),
[OP_SPECIAL] = int_SPECIAL,
[OP_LWC2] = int_LWC2,
[OP_SWC2] = int_store,
- [OP_META_MOV] = int_META_MOV,
- [OP_META_EXTC] = int_META_EXTC,
- [OP_META_EXTS] = int_META_EXTS,
+ [OP_META] = int_META,
[OP_META_MULT2] = int_META_MULT2,
[OP_META_MULTU2] = int_META_MULT2,
};
[OP_CP2_BASIC_CTC2] = int_ctc,
};
+static const lightrec_int_func_t int_meta[64] = {
+ SET_DEFAULT_ELM(int_meta, int_unimplemented),
+ [OP_META_MOV] = int_META_MOV,
+ [OP_META_EXTC] = int_META_EXTC,
+ [OP_META_EXTS] = int_META_EXTS,
+ [OP_META_COM] = int_META_COM,
+};
+
static u32 int_SPECIAL(struct interpreter *inter)
{
lightrec_int_func_t f = int_special[inter->op->r.op];
return int_CP(inter);
}
+static u32 int_META(struct interpreter *inter)
+{
+ lightrec_int_func_t f = int_meta[inter->op->m.op];
+
+ if (!HAS_DEFAULT_ELM && unlikely(!f))
+ return int_unimplemented(inter);
+
+ return execute(f, inter);
+}
+
static u32 lightrec_emulate_block_list(struct lightrec_state *state,
struct block *block, u32 offset)
{
return 0;
}
+
+static u32 branch_get_next_pc(struct lightrec_state *state, union code c, u32 pc)
+{
+ switch (c.i.op) {
+ case OP_SPECIAL:
+ /* JR / JALR */
+ return state->regs.gpr[c.r.rs];
+ case OP_J:
+ case OP_JAL:
+ return (pc & 0xf0000000) | (c.j.imm << 2);
+ default:
+ /* Branch opcodes */
+ return pc + 4 + ((s16)c.i.imm << 2);
+ }
+}
+
+u32 lightrec_handle_load_delay(struct lightrec_state *state,
+ struct block *block, u32 pc, u32 reg)
+{
+ union code c = lightrec_read_opcode(state, pc);
+ struct opcode op[2] = {
+ {
+ .c = c,
+ .flags = 0,
+ },
+ {
+ .flags = 0,
+ },
+ };
+ struct interpreter inter = {
+ .block = block,
+ .state = state,
+ .offset = 0,
+ .op = op,
+ .cycles = 0,
+ };
+ bool branch_taken;
+ u32 reg_mask, next_pc;
+
+ if (has_delay_slot(c)) {
+ op[1].c = lightrec_read_opcode(state, pc + 4);
+
+ branch_taken = is_branch_taken(state->regs.gpr, c);
+ next_pc = branch_get_next_pc(state, c, pc);
+
+ /* Branch was evaluated, we can write the load opcode's target
+ * register now. */
+ state->regs.gpr[reg] = state->temp_reg;
+
+ /* Handle JALR / regimm opcodes setting $ra (or any other
+ * register in the case of JALR) */
+ reg_mask = (u32)opcode_write_mask(c);
+ if (reg_mask)
+ state->regs.gpr[ctz32(reg_mask)] = pc + 8;
+
+ /* Handle delay slot of the branch opcode */
+ pc = int_delay_slot(&inter, next_pc, branch_taken);
+ } else {
+ /* Make sure we only run one instruction */
+ inter.delay_slot = true;
+
+ lightrec_int_op(&inter);
+ pc += 4;
+
+ if (!opcode_writes_register(c, reg))
+ state->regs.gpr[reg] = state->temp_reg;
+ }
+
+ state->current_cycle += inter.cycles;
+
+ return pc;
+}
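
For illustration, a hypothetical guest sequence (registers and labels invented) for the case this helper handles, i.e. a load in a branch delay slot whose destination is read by the first opcode at the branch target:

/*
 *     bne   $v0, $zero, target
 *     lw    $t0, 0($a0)        # load in the branch delay slot
 * target:
 *     addu  $v1, $t0, $t1      # still sees the old value of $t0
 *
 * The compiled block leaves the loaded word in state->temp_reg and the
 * register number in JIT_V1, then jumps to ds_check_func. The dispatcher
 * calls lightrec_check_load_delay(); since the opcode at "target" reads
 * $t0, lightrec_handle_load_delay() above interprets that single opcode
 * with the old register value, then commits temp_reg to $t0 before
 * execution resumes in compiled code.
 */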
struct block;
u32 lightrec_emulate_block(struct lightrec_state *state, struct block *block, u32 pc);
+u32 lightrec_handle_load_delay(struct lightrec_state *state,
+ struct block *block, u32 pc, u32 reg);
#endif /* __LIGHTREC_INTERPRETER_H__ */
#define jit_b() jit_beqr(0, 0)
+#if defined(__sh__)
+#define jit_add_state(u,v) \
+ do { \
+ jit_new_node_ww(jit_code_movr,_R0,LIGHTREC_REG_STATE); \
+ jit_new_node_www(jit_code_addr,u,v,_R0); \
+ } while (0)
+#else
+#define jit_add_state(u,v) jit_addr(u,v,LIGHTREC_REG_STATE)
+#endif
+
#endif /* __LIGHTNING_WRAPPER_H__ */
#cmakedefine01 OPT_REMOVE_DIV_BY_ZERO_SEQ
#cmakedefine01 OPT_REPLACE_MEMSET
#cmakedefine01 OPT_DETECT_IMPOSSIBLE_BRANCHES
+#cmakedefine01 OPT_HANDLE_LOAD_DELAYS
#cmakedefine01 OPT_TRANSFORM_OPS
#cmakedefine01 OPT_LOCAL_BRANCHES
#cmakedefine01 OPT_SWITCH_DELAY_SLOTS
-#cmakedefine01 OPT_FLAG_STORES
#cmakedefine01 OPT_FLAG_IO
#cmakedefine01 OPT_FLAG_MULT_DIV
#cmakedefine01 OPT_EARLY_UNLOAD
#define REG_LO 32
#define REG_HI 33
-#define REG_CP2_TEMP (offsetof(struct lightrec_state, cp2_temp_reg) / sizeof(u32))
+#define REG_TEMP (offsetof(struct lightrec_state, temp_reg) / sizeof(u32))
/* Definition of jit_state_t (avoids inclusion of <lightning.h>) */
struct jit_node;
unsigned int cycles;
struct regcache *reg_cache;
+
+ _Bool no_load_delay;
};
struct lightrec_state {
struct lightrec_registers regs;
- u32 cp2_temp_reg;
+ u32 temp_reg;
u32 next_pc;
uintptr_t wrapper_regs[NUM_TEMPS];
+ u8 in_delay_slot_n;
u32 current_cycle;
u32 target_cycle;
u32 exit_flags;
struct reaper *reaper;
void *tlsf;
void (*eob_wrapper_func)(void);
+ void (*interpreter_func)(void);
+ void (*ds_check_func)(void);
void (*memset_func)(void);
void (*get_next_block)(void);
struct lightrec_ops ops;
unsigned int nb_precompile;
+ unsigned int nb_compile;
unsigned int nb_maps;
const struct lightrec_mem_map *maps;
uintptr_t offset_ram, offset_bios, offset_scratch, offset_io;
void *code_lut[];
};
-u32 lightrec_rw(struct lightrec_state *state, union code op,
- u32 addr, u32 data, u32 *flags,
- struct block *block);
+u32 lightrec_rw(struct lightrec_state *state, union code op, u32 addr,
+ u32 data, u32 *flags, struct block *block, u16 offset);
void lightrec_free_block(struct lightrec_state *state, struct block *block);
void lightrec_free_opcode_list(struct lightrec_state *state,
struct opcode *list);
-unsigned int lightrec_cycles_of_opcode(union code code);
+__cnst unsigned int lightrec_cycles_of_opcode(union code code);
static inline u8 get_mult_div_lo(union code c)
{
return (value >> order) == 0;
}
+static inline const struct opcode *
+get_delay_slot(const struct opcode *list, u16 i)
+{
+ return op_flag_no_ds(list[i].flags) ? &list[i - 1] : &list[i + 1];
+}
+
#endif /* __LIGHTREC_PRIVATE_H__ */
return map;
}
-u32 lightrec_rw(struct lightrec_state *state, union code op,
- u32 addr, u32 data, u32 *flags, struct block *block)
+u32 lightrec_rw(struct lightrec_state *state, union code op, u32 base,
+ u32 data, u32 *flags, struct block *block, u16 offset)
{
const struct lightrec_mem_map *map;
const struct lightrec_mem_map_ops *ops;
u32 opcode = op.opcode;
+ bool was_tagged = true;
+ u16 old_flags;
+ u32 addr;
void *host;
- addr += (s16) op.i.imm;
+ addr = kunseg(base + (s16) op.i.imm);
- map = lightrec_get_map(state, &host, kunseg(addr));
+ map = lightrec_get_map(state, &host, addr);
if (!map) {
__segfault_cb(state, addr, block);
return 0;
}
+ if (flags)
+ was_tagged = LIGHTREC_FLAGS_GET_IO_MODE(*flags);
if (likely(!map->ops)) {
- if (flags && !LIGHTREC_FLAGS_GET_IO_MODE(*flags))
- *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_DIRECT);
+ if (flags && !LIGHTREC_FLAGS_GET_IO_MODE(*flags)) {
+ /* Force parallel port accesses as HW accesses, because
+			 * the direct-I/O emitters can't differentiate them. */
+ if (unlikely(map == &state->maps[PSX_MAP_PARALLEL_PORT]))
+ *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_HW);
+ /* If the base register is 0x0, be extra suspicious.
+			 * Some games (e.g. Sled Storm) actually trigger segmentation
+ * faults by using uninitialized pointers, which are
+ * later initialized to point to hardware registers. */
+ else if (op.i.rs && base == 0x0)
+ *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_HW);
+ else
+ *flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_DIRECT);
+ }
ops = &lightrec_default_ops;
} else if (flags &&
ops = map->ops;
}
+ if (!was_tagged) {
+ old_flags = block_set_flags(block, BLOCK_SHOULD_RECOMPILE);
+
+ if (!(old_flags & BLOCK_SHOULD_RECOMPILE)) {
+ pr_debug("Opcode of block at PC 0x%08x has been tagged"
+ " - flag for recompilation\n", block->pc);
+
+ lut_write(state, lut_offset(block->pc), NULL);
+ }
+ }
+
switch (op.i.op) {
case OP_SB:
ops->sb(state, opcode, host, addr, (u8) data);
static void lightrec_rw_helper(struct lightrec_state *state,
union code op, u32 *flags,
- struct block *block)
+ struct block *block, u16 offset)
{
u32 ret = lightrec_rw(state, op, state->regs.gpr[op.i.rs],
- state->regs.gpr[op.i.rt], flags, block);
+ state->regs.gpr[op.i.rt], flags, block, offset);
switch (op.i.op) {
case OP_LB:
case OP_LWL:
case OP_LWR:
case OP_LW:
- if (op.i.rt)
+ if (OPT_HANDLE_LOAD_DELAYS && unlikely(!state->in_delay_slot_n)) {
+ state->temp_reg = ret;
+ state->in_delay_slot_n = 0xff;
+ } else if (op.i.rt) {
state->regs.gpr[op.i.rt] = ret;
+ }
fallthrough;
default:
break;
static void lightrec_rw_cb(struct lightrec_state *state, u32 arg)
{
- lightrec_rw_helper(state, (union code) arg, NULL, NULL);
+ lightrec_rw_helper(state, (union code) arg, NULL, NULL, 0);
}
static void lightrec_rw_generic_cb(struct lightrec_state *state, u32 arg)
{
struct block *block;
struct opcode *op;
- bool was_tagged;
u16 offset = (u16)arg;
- u16 old_flags;
block = lightrec_find_block_from_lut(state->block_cache,
arg >> 16, state->next_pc);
}
op = &block->opcode_list[offset];
- was_tagged = LIGHTREC_FLAGS_GET_IO_MODE(op->flags);
-
- lightrec_rw_helper(state, op->c, &op->flags, block);
-
- if (!was_tagged) {
- old_flags = block_set_flags(block, BLOCK_SHOULD_RECOMPILE);
-
- if (!(old_flags & BLOCK_SHOULD_RECOMPILE)) {
- pr_debug("Opcode of block at PC 0x%08x has been tagged"
- " - flag for recompilation\n", block->pc);
-
- lut_write(state, lut_offset(block->pc), NULL);
- }
- }
+ lightrec_rw_helper(state, op->c, &op->flags, block, offset);
}
static u32 clamp_s32(s32 val, s32 min, s32 max)
u32 rt = lightrec_mfc(state, op);
if (op.i.op == OP_SWC2)
- state->cp2_temp_reg = rt;
+ state->temp_reg = rt;
else if (op.r.rt)
state->regs.gpr[op.r.rt] = rt;
}
u8 reg;
if (op.i.op == OP_LWC2) {
- data = state->cp2_temp_reg;
+ data = state->temp_reg;
reg = op.i.rt;
} else {
data = state->regs.gpr[op.r.rt];
}
should_recompile = block_has_flag(block, BLOCK_SHOULD_RECOMPILE) &&
+ !block_has_flag(block, BLOCK_NEVER_COMPILE) &&
!block_has_flag(block, BLOCK_IS_DEAD);
if (unlikely(should_recompile)) {
lightrec_code_alloc_unlock(state);
}
+static char lightning_code_data[0x80000];
+
static void * lightrec_emit_code(struct lightrec_state *state,
const struct block *block,
jit_state_t *_jit, unsigned int *size)
jit_realize();
- if (!ENABLE_DISASSEMBLER)
+ if (ENABLE_DISASSEMBLER)
+ jit_set_data(lightning_code_data, sizeof(lightning_code_data), 0);
+ else
jit_set_data(NULL, 0, JIT_DISABLE_DATA | JIT_DISABLE_NOTE);
if (has_code_buffer) {
unsigned int i;
jit_node_t *addr[C_WRAPPERS_COUNT - 1];
jit_node_t *to_end[C_WRAPPERS_COUNT - 1];
+ u8 tmp = JIT_R1;
+
+#ifdef __sh__
+ /* On SH, GBR-relative loads target the r0 register.
+ * Use it as the temporary register to factorize the move to
+ * JIT_R1. */
+ if (LIGHTREC_REG_STATE == _GBR)
+ tmp = _R0;
+#endif
block = lightrec_malloc(state, MEM_FOR_IR, sizeof(*block));
if (!block)
/* Add entry points */
for (i = C_WRAPPERS_COUNT - 1; i > 0; i--) {
- jit_ldxi(JIT_R1, LIGHTREC_REG_STATE,
+ jit_ldxi(tmp, LIGHTREC_REG_STATE,
offsetof(struct lightrec_state, c_wrappers[i]));
to_end[i - 1] = jit_b();
addr[i - 1] = jit_indirect();
}
- jit_ldxi(JIT_R1, LIGHTREC_REG_STATE,
+ jit_ldxi(tmp, LIGHTREC_REG_STATE,
offsetof(struct lightrec_state, c_wrappers[0]));
for (i = 0; i < C_WRAPPERS_COUNT - 1; i++)
jit_patch(to_end[i]);
+ jit_movr(JIT_R1, tmp);
jit_epilog();
jit_prolog();
	return 8 + 5 * ((length + 3) / 4);
}
+static u32 lightrec_check_load_delay(struct lightrec_state *state, u32 pc, u8 reg)
+{
+ struct block *block;
+ union code first_op;
+
+ first_op = lightrec_read_opcode(state, pc);
+
+ if (likely(!opcode_reads_register(first_op, reg))) {
+ state->regs.gpr[reg] = state->temp_reg;
+ } else {
+ block = lightrec_get_block(state, pc);
+ if (unlikely(!block)) {
+ pr_err("Unable to get block at PC 0x%08x\n", pc);
+ lightrec_set_exit_flags(state, LIGHTREC_EXIT_SEGFAULT);
+ pc = 0;
+ } else {
+ pc = lightrec_handle_load_delay(state, block, pc, reg);
+ }
+ }
+
+ return pc;
+}
+
+static void update_cycle_counter_before_c(jit_state_t *_jit)
+{
+ /* update state->current_cycle */
+ jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE,
+ offsetof(struct lightrec_state, target_cycle));
+ jit_subr(JIT_R1, JIT_R2, LIGHTREC_REG_CYCLE);
+ jit_stxi_i(offsetof(struct lightrec_state, current_cycle),
+ LIGHTREC_REG_STATE, JIT_R1);
+}
+
+static void update_cycle_counter_after_c(jit_state_t *_jit)
+{
+ /* Recalc the delta */
+ jit_ldxi_i(JIT_R1, LIGHTREC_REG_STATE,
+ offsetof(struct lightrec_state, current_cycle));
+ jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE,
+ offsetof(struct lightrec_state, target_cycle));
+ jit_subr(LIGHTREC_REG_CYCLE, JIT_R2, JIT_R1);
+}
+
static struct block * generate_dispatcher(struct lightrec_state *state)
{
struct block *block;
jit_state_t *_jit;
- jit_node_t *to_end, *loop, *addr, *addr2, *addr3;
+ jit_node_t *to_end, *loop, *addr, *addr2, *addr3, *addr4, *addr5, *jmp, *jmp2;
unsigned int i;
u32 offset;
jit_prepare();
jit_pushargr(LIGHTREC_REG_STATE);
+
jit_finishi(lightrec_memset);
+ jit_retval(LIGHTREC_REG_CYCLE);
jit_ldxi_ui(JIT_V0, LIGHTREC_REG_STATE,
offsetof(struct lightrec_state, regs.gpr[31]));
-
- jit_retval(LIGHTREC_REG_CYCLE);
jit_subr(LIGHTREC_REG_CYCLE, JIT_V1, LIGHTREC_REG_CYCLE);
+
+ if (OPT_DETECT_IMPOSSIBLE_BRANCHES || OPT_HANDLE_LOAD_DELAYS)
+ jmp = jit_b();
+ }
+
+ if (OPT_DETECT_IMPOSSIBLE_BRANCHES) {
+ /* Blocks will jump here when they reach a branch that should
+ * be executed with the interpreter, passing the branch's PC
+ * in JIT_V0 and the address of the block in JIT_V1. */
+ addr4 = jit_indirect();
+
+ update_cycle_counter_before_c(_jit);
+
+ jit_prepare();
+ jit_pushargr(LIGHTREC_REG_STATE);
+ jit_pushargr(JIT_V1);
+ jit_pushargr(JIT_V0);
+ jit_finishi(lightrec_emulate_block);
+
+ jit_retval(JIT_V0);
+
+ update_cycle_counter_after_c(_jit);
+
+ if (OPT_HANDLE_LOAD_DELAYS)
+ jmp2 = jit_b();
+
+ }
+
+ if (OPT_HANDLE_LOAD_DELAYS) {
+ /* Blocks will jump here when they reach a branch with a load
+ * opcode in its delay slot. The delay slot has already been
+ * executed; the load value is in (state->temp_reg), and the
+ * register number is in JIT_V1.
+ * Jump to a C function which will evaluate the branch target's
+		 * first opcode, to check whether it reads the register in
+		 * question, and handle it accordingly if it does. */
+ addr5 = jit_indirect();
+
+ update_cycle_counter_before_c(_jit);
+
+ jit_prepare();
+ jit_pushargr(LIGHTREC_REG_STATE);
+ jit_pushargr(JIT_V0);
+ jit_pushargr(JIT_V1);
+ jit_finishi(lightrec_check_load_delay);
+
+ jit_retval(JIT_V0);
+
+ update_cycle_counter_after_c(_jit);
+
+ if (OPT_DETECT_IMPOSSIBLE_BRANCHES)
+ jit_patch(jmp2);
+ }
+
+ if (OPT_REPLACE_MEMSET
+ && (OPT_DETECT_IMPOSSIBLE_BRANCHES || OPT_HANDLE_LOAD_DELAYS)) {
+ jit_patch(jmp);
}
/* The block will jump here, with the number of cycles remaining in
/* If possible, use the code LUT */
if (!lut_is_32bit(state))
jit_lshi(JIT_V1, JIT_V1, 1);
- jit_addr(JIT_V1, JIT_V1, LIGHTREC_REG_STATE);
+ jit_add_state(JIT_V1, JIT_V1);
offset = offsetof(struct lightrec_state, code_lut);
if (lut_is_32bit(state))
if (ENABLE_FIRST_PASS || OPT_DETECT_IMPOSSIBLE_BRANCHES) {
/* We may call the interpreter - update state->current_cycle */
- jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE,
- offsetof(struct lightrec_state, target_cycle));
- jit_subr(JIT_V1, JIT_R2, LIGHTREC_REG_CYCLE);
- jit_stxi_i(offsetof(struct lightrec_state, current_cycle),
- LIGHTREC_REG_STATE, JIT_V1);
+ update_cycle_counter_before_c(_jit);
}
jit_prepare();
if (ENABLE_FIRST_PASS || OPT_DETECT_IMPOSSIBLE_BRANCHES) {
/* The interpreter may have updated state->current_cycle and
* state->target_cycle - recalc the delta */
- jit_ldxi_i(JIT_R1, LIGHTREC_REG_STATE,
- offsetof(struct lightrec_state, current_cycle));
- jit_ldxi_i(JIT_R2, LIGHTREC_REG_STATE,
- offsetof(struct lightrec_state, target_cycle));
- jit_subr(LIGHTREC_REG_CYCLE, JIT_R2, JIT_R1);
+ update_cycle_counter_after_c(_jit);
} else {
jit_movr(LIGHTREC_REG_CYCLE, JIT_V0);
}
goto err_free_block;
state->eob_wrapper_func = jit_address(addr2);
+ if (OPT_DETECT_IMPOSSIBLE_BRANCHES)
+ state->interpreter_func = jit_address(addr4);
+ if (OPT_HANDLE_LOAD_DELAYS)
+ state->ds_check_func = jit_address(addr5);
if (OPT_REPLACE_MEMSET)
state->memset_func = jit_address(addr3);
state->get_next_block = jit_address(addr);
return (union code) LE32TOH(*code);
}
-unsigned int lightrec_cycles_of_opcode(union code code)
+__cnst unsigned int lightrec_cycles_of_opcode(union code code)
{
return 2;
}
pr_debug("Block size: %hu opcodes\n", block->nb_ops);
- /* If the first opcode is an 'impossible' branch, never compile the
- * block */
- if (should_emulate(block->opcode_list))
- block_flags |= BLOCK_NEVER_COMPILE;
-
fully_tagged = lightrec_block_is_fully_tagged(block);
if (fully_tagged)
block_flags |= BLOCK_FULLY_TAGGED;
addr = state->get_next_block;
lut_write(state, lut_offset(pc), addr);
- pr_debug("Recompile count: %u\n", state->nb_precompile++);
+ pr_debug("Blocks created: %u\n", ++state->nb_precompile);
return block;
}
for (i = 0; i < block->nb_ops; i++) {
op = &block->opcode_list[i];
- /* Verify that all load/stores of the opcode list
- * Check all loads/stores of the opcode list and mark the
+ /* If we have one branch that must be emulated, we cannot trash
+ * the opcode list. */
+ if (should_emulate(op))
+ return false;
+
+ /* Check all loads/stores of the opcode list and mark the
* block as fully compiled if they all have been tagged. */
switch (op->c.i.op) {
case OP_LB:
cstate->cycles = 0;
cstate->nb_local_branches = 0;
cstate->nb_targets = 0;
+ cstate->no_load_delay = false;
jit_prolog();
jit_tramp(256);
pr_debug("Branch at offset 0x%x will be emulated\n",
i << 2);
- lightrec_emit_eob(cstate, block, i);
+ lightrec_emit_jump_to_interpreter(cstate, block, i);
skip_next = !op_flag_no_ds(elm->flags);
} else {
lightrec_rec_opcode(cstate, block, i);
lightrec_unregister(MEM_FOR_CODE, old_code_size);
}
+ pr_debug("Blocks compiled: %u\n", ++state->nb_compile);
+
return 0;
}
state->tlsf = tlsf;
state->with_32bit_lut = with_32bit_lut;
+ state->in_delay_slot_n = 0xff;
state->block_cache = lightrec_blockcache_init(state);
if (!state->block_cache)
# define __api
#endif
+#ifndef __cnst
+# ifdef __GNUC__
+# define __cnst __attribute__((const))
+# else
+# define __cnst
+# endif
+#endif
+#ifndef __pure
+# ifdef __GNUC__
+# define __pure __attribute__((pure))
+# else
+# define __pure
+# endif
+#endif
+
typedef uint64_t u64;
typedef uint32_t u32;
typedef uint16_t u16;
__api void lightrec_set_exit_flags(struct lightrec_state *state, u32 flags);
__api u32 lightrec_exit_flags(struct lightrec_state *state);
-__api struct lightrec_registers * lightrec_get_registers(struct lightrec_state *state);
+__api __cnst struct lightrec_registers *
+lightrec_get_registers(struct lightrec_state *state);
__api u32 lightrec_current_cycle_count(const struct lightrec_state *state);
__api void lightrec_reset_cycle_count(struct lightrec_state *state, u32 cycles);
#include <stdlib.h>
-#ifdef ENABLE_THREADED_COMPILER
+#if ENABLE_THREADED_COMPILER
#include <stdatomic.h>
static atomic_uint lightrec_bytes[MEM_TYPE_END];
case OP_SW:
case OP_SWR:
return BIT(op.i.rs) | BIT(op.i.rt);
+ case OP_META:
+ return BIT(op.m.rs);
default:
return BIT(op.i.rs);
}
return flags;
}
-static u64 opcode_write_mask(union code op)
+u64 opcode_write_mask(union code op)
{
switch (op.i.op) {
case OP_META_MULT2:
case OP_META_MULTU2:
return mult_div_write_mask(op);
+ case OP_META:
+ return BIT(op.m.rd);
case OP_SPECIAL:
switch (op.r.op) {
case OP_SPECIAL_JR:
case OP_LBU:
case OP_LHU:
case OP_LWR:
- case OP_META_EXTC:
- case OP_META_EXTS:
return BIT(op.i.rt);
case OP_JAL:
return BIT(31);
default:
return 0;
}
- case OP_META_MOV:
- return BIT(op.r.rd);
default:
return 0;
}
return reg_is_read(list, a, b, reg) || reg_is_written(list, a, b, reg);
}
-static bool opcode_is_load(union code op)
+bool opcode_is_mfc(union code op)
+{
+ switch (op.i.op) {
+ case OP_CP0:
+ switch (op.r.rs) {
+ case OP_CP0_MFC0:
+ case OP_CP0_CFC0:
+ return true;
+ default:
+ break;
+ }
+
+ break;
+ case OP_CP2:
+ if (op.r.op == OP_CP2_BASIC) {
+ switch (op.r.rs) {
+ case OP_CP2_BASIC_MFC2:
+ case OP_CP2_BASIC_CFC2:
+ return true;
+ default:
+ break;
+ }
+ }
+
+ break;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+bool opcode_is_load(union code op)
{
switch (op.i.op) {
case OP_LB:
}
}
-bool load_in_delay_slot(union code op)
-{
- switch (op.i.op) {
- case OP_CP0:
- switch (op.r.rs) {
- case OP_CP0_MFC0:
- case OP_CP0_CFC0:
- return true;
- default:
- break;
- }
-
- break;
- case OP_CP2:
- if (op.r.op == OP_CP2_BASIC) {
- switch (op.r.rs) {
- case OP_CP2_BASIC_MFC2:
- case OP_CP2_BASIC_CFC2:
- return true;
- default:
- break;
- }
- }
-
- break;
- case OP_LB:
- case OP_LH:
- case OP_LW:
- case OP_LWL:
- case OP_LWR:
- case OP_LBU:
- case OP_LHU:
- return true;
- default:
- break;
- }
-
- return false;
-}
-
static void lightrec_optimize_sll_sra(struct opcode *list, unsigned int offset,
struct constprop_data *v)
{
ldop->i.rt = next->r.rd;
to_change->opcode = 0;
} else {
- to_change->i.op = OP_META_MOV;
- to_change->r.rd = next->r.rd;
- to_change->r.rs = ldop->i.rt;
+ to_change->i.op = OP_META;
+ to_change->m.op = OP_META_MOV;
+ to_change->m.rd = next->r.rd;
+ to_change->m.rs = ldop->i.rt;
}
if (to_nop->r.imm == 24)
pr_debug("Convert SLL/SRA #%u to EXT%c\n",
curr->r.imm, curr->r.imm == 24 ? 'C' : 'S');
- if (to_change == curr) {
- to_change->i.rs = curr->r.rt;
- to_change->i.rt = next->r.rd;
- } else {
- to_change->i.rt = next->r.rd;
- to_change->i.rs = curr->r.rt;
- }
-
- if (to_nop->r.imm == 24)
- to_change->i.op = OP_META_EXTC;
- else
- to_change->i.op = OP_META_EXTS;
+ to_change->m.rs = curr->r.rt;
+ to_change->m.op = to_nop->r.imm == 24 ? OP_META_EXTC : OP_META_EXTS;
+ to_change->i.op = OP_META;
}
to_nop->opcode = 0;
break;
if (opcode_writes_register(c, lui->i.rt)) {
+ if (c.i.op == OP_LWL || c.i.op == OP_LWR) {
+ /* LWL/LWR only partially write their target register;
+ * therefore the LUI should not write a different value. */
+ break;
+ }
+
pr_debug("Convert LUI at offset 0x%x to kuseg\n",
i - 1 << 2);
lui->i.imm = kunseg(lui->i.imm << 16) >> 16;
case OP_ANDI:
case OP_ORI:
case OP_XORI:
- case OP_META_MOV:
- case OP_META_EXTC:
- case OP_META_EXTS:
case OP_META_MULT2:
case OP_META_MULTU2:
- if (is_known_zero(v, op->i.rs))
- op->i.rs = 0;
+ case OP_META:
+ if (is_known_zero(v, op->m.rs))
+ op->m.rs = 0;
break;
case OP_SB:
case OP_SH:
for (i = 0; i < block->nb_ops; i++) {
op = &list[i];
- if (op_flag_local_branch(op->flags) && has_delay_slot(op->c)) {
- offset = i + 1 + (s16)op->i.imm;
- list[offset].flags |= LIGHTREC_SYNC;
+ if (has_delay_slot(op->c)) {
+ if (op_flag_local_branch(op->flags)) {
+ offset = i + 1 - op_flag_no_ds(op->flags) + (s16)op->i.imm;
+ list[offset].flags |= LIGHTREC_SYNC;
+ }
+
+ if (op_flag_emulate_branch(op->flags) && i + 2 < block->nb_ops)
+ list[i + 2].flags |= LIGHTREC_SYNC;
}
}
}
for (i = 0; i < block->nb_ops; i++) {
op = &list[i];
- lightrec_consts_propagate(list, i, v);
+ lightrec_consts_propagate(block, i, v);
lightrec_patch_known_zero(op, v);
case OP_ADDIU:
if (op->i.imm == 0) {
pr_debug("Convert ORI/ADDI/ADDIU #0 to MOV\n");
- op->i.op = OP_META_MOV;
- op->r.rd = op->i.rt;
+ op->m.rd = op->i.rt;
+ op->m.op = OP_META_MOV;
+ op->i.op = OP_META;
}
break;
case OP_ANDI:
if (op->i.rs == op->i.rt) {
op->opcode = 0;
} else {
- op->i.op = OP_META_MOV;
- op->r.rd = op->i.rt;
+ op->m.rd = op->i.rt;
+ op->m.op = OP_META_MOV;
+ op->i.op = OP_META;
}
}
break;
case OP_SPECIAL_SRA:
if (op->r.imm == 0) {
pr_debug("Convert SRA #0 to MOV\n");
- op->i.op = OP_META_MOV;
- op->r.rs = op->r.rt;
+ op->m.rs = op->r.rt;
+ op->m.op = OP_META_MOV;
+ op->i.op = OP_META;
break;
}
break;
case OP_SPECIAL_SLL:
if (op->r.imm == 0) {
pr_debug("Convert SLL #0 to MOV\n");
- op->i.op = OP_META_MOV;
- op->r.rs = op->r.rt;
+ op->m.rs = op->r.rt;
+ op->m.op = OP_META_MOV;
+ op->i.op = OP_META;
}
lightrec_optimize_sll_sra(block->opcode_list, i, v);
case OP_SPECIAL_SRL:
if (op->r.imm == 0) {
pr_debug("Convert SRL #0 to MOV\n");
- op->i.op = OP_META_MOV;
- op->r.rs = op->r.rt;
+ op->m.rs = op->r.rt;
+ op->m.op = OP_META_MOV;
+ op->i.op = OP_META;
}
break;
op->r.op = ctz32(v[op->r.rt].value);
break;
+ case OP_SPECIAL_NOR:
+ if (op->r.rs == 0 || op->r.rt == 0) {
+ pr_debug("Convert NOR $zero to COM\n");
+ op->i.op = OP_META;
+ op->m.op = OP_META_COM;
+ if (!op->m.rs)
+ op->m.rs = op->r.rt;
+ }
+ break;
case OP_SPECIAL_OR:
case OP_SPECIAL_ADD:
case OP_SPECIAL_ADDU:
if (op->r.rs == 0) {
pr_debug("Convert OR/ADD $zero to MOV\n");
- op->i.op = OP_META_MOV;
- op->r.rs = op->r.rt;
+ op->m.rs = op->r.rt;
+ op->m.op = OP_META_MOV;
+ op->i.op = OP_META;
}
fallthrough;
case OP_SPECIAL_SUB:
case OP_SPECIAL_SUBU:
if (op->r.rt == 0) {
pr_debug("Convert OR/ADD/SUB $zero to MOV\n");
- op->i.op = OP_META_MOV;
+ op->m.op = OP_META_MOV;
+ op->i.op = OP_META;
}
fallthrough;
default:
if (op_flag_sync(next->flags))
continue;
+ if (op_flag_load_delay(next->flags) && opcode_is_load(next_op))
+ continue;
+
if (!lightrec_can_switch_delay_slot(list->c, next_op))
continue;
return 0;
}
-static int shrink_opcode_list(struct lightrec_state *state, struct block *block, u16 new_size)
-{
- struct opcode_list *list, *old_list;
-
- if (new_size >= block->nb_ops) {
- pr_err("Invalid shrink size (%u vs %u)\n",
- new_size, block->nb_ops);
- return -EINVAL;
- }
-
- list = lightrec_malloc(state, MEM_FOR_IR,
- sizeof(*list) + sizeof(struct opcode) * new_size);
- if (!list) {
- pr_err("Unable to allocate memory\n");
- return -ENOMEM;
- }
-
- old_list = container_of(block->opcode_list, struct opcode_list, ops);
- memcpy(list->ops, old_list->ops, sizeof(struct opcode) * new_size);
-
- lightrec_free_opcode_list(state, block->opcode_list);
- list->nb_ops = new_size;
- block->nb_ops = new_size;
- block->opcode_list = list->ops;
-
- pr_debug("Shrunk opcode list of block PC 0x%08x to %u opcodes\n",
- block->pc, new_size);
-
- return 0;
-}
-
static int lightrec_detect_impossible_branches(struct lightrec_state *state,
struct block *block)
{
struct opcode *op, *list = block->opcode_list, *next = &list[0];
unsigned int i;
int ret = 0;
- s16 offset;
for (i = 0; i < block->nb_ops - 1; i++) {
op = next;
next = &list[i + 1];
if (!has_delay_slot(op->c) ||
- (!load_in_delay_slot(next->c) &&
- !has_delay_slot(next->c) &&
+ (!has_delay_slot(next->c) &&
+ !opcode_is_mfc(next->c) &&
!(next->i.op == OP_CP0 && next->r.rs == OP_CP0_RFE)))
continue;
continue;
}
- offset = i + 1 + (s16)op->i.imm;
- if (load_in_delay_slot(next->c) &&
- (offset >= 0 && offset < block->nb_ops) &&
- !opcode_reads_register(list[offset].c, next->c.i.rt)) {
- /* The 'impossible' branch is a local branch - we can
- * verify here that the first opcode of the target does
- * not use the target register of the delay slot */
-
- pr_debug("Branch at offset 0x%x has load delay slot, "
- "but is local and dest opcode does not read "
- "dest register\n", i << 2);
+ op->flags |= LIGHTREC_EMULATE_BRANCH;
+
+ if (OPT_LOCAL_BRANCHES && i + 2 < block->nb_ops) {
+ /* The interpreter will only emulate the branch, then
+ * return to the compiled code. Add a SYNC after the
+ * branch + delay slot in the case where the branch
+ * was not taken. */
+ list[i + 2].flags |= LIGHTREC_SYNC;
+ }
+ }
+
+ return ret;
+}
+
+static bool is_local_branch(const struct block *block, unsigned int idx)
+{
+ const struct opcode *op = &block->opcode_list[idx];
+ s32 offset;
+
+ switch (op->c.i.op) {
+ case OP_BEQ:
+ case OP_BNE:
+ case OP_BLEZ:
+ case OP_BGTZ:
+ case OP_REGIMM:
+ offset = idx + 1 + (s16)op->c.i.imm;
+ if (offset >= 0 && offset < block->nb_ops)
+ return true;
+ fallthrough;
+ default:
+ return false;
+ }
+}
+
+static int lightrec_handle_load_delays(struct lightrec_state *state,
+ struct block *block)
+{
+ struct opcode *op, *list = block->opcode_list;
+ unsigned int i;
+ s16 imm;
+
+ for (i = 0; i < block->nb_ops; i++) {
+ op = &list[i];
+
+ if (!opcode_is_load(op->c) || !op->c.i.rt || op->c.i.op == OP_LWC2)
+ continue;
+
+ if (!is_delay_slot(list, i)) {
+ /* Only handle load delays in delay slots.
+			 * PSX games never abuse load delays outside of delay slots. */
continue;
}
- op->flags |= LIGHTREC_EMULATE_BRANCH;
+ if (is_local_branch(block, i - 1)) {
+ imm = (s16)list[i - 1].c.i.imm;
- if (op == list) {
- pr_debug("First opcode of block PC 0x%08x is an impossible branch\n",
- block->pc);
+ if (!opcode_reads_register(list[i + imm].c, op->c.i.rt)) {
+ /* The target opcode of the branch is inside
+ * the block, and it does not read the register
+ * written to by the load opcode; we can ignore
+ * the load delay. */
+ continue;
+ }
+ }
- /* If the first opcode is an 'impossible' branch, we
- * only keep the first two opcodes of the block (the
- * branch itself + its delay slot) */
- if (block->nb_ops > 2)
- ret = shrink_opcode_list(state, block, 2);
- break;
+ op->flags |= LIGHTREC_LOAD_DELAY;
+ }
+
+ return 0;
+}
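/* Sketch of the semantics behind LIGHTREC_LOAD_DELAY (illustrative only;
 * this is not how the code generator or the dispatcher actually store the
 * pending value): on MIPS-I the register written by a load is not visible
 * to the very next instruction, so the loaded value is parked and only
 * committed one instruction later. */
struct pending_load {
	u8  reg;	/* 0: nothing pending (the pass above skips loads to $zero) */
	u32 value;
};

static void commit_pending_load(u32 *gpr, struct pending_load *p)
{
	if (p->reg) {
		gpr[p->reg] = p->value;
		p->reg = 0;
	}
}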
+
+static int lightrec_swap_load_delays(struct lightrec_state *state,
+ struct block *block)
+{
+ unsigned int i;
+ union code c, next;
+ bool in_ds = false, skip_next = false;
+ struct opcode op;
+
+ if (block->nb_ops < 2)
+ return 0;
+
+ for (i = 0; i < block->nb_ops - 2; i++) {
+ c = block->opcode_list[i].c;
+
+ if (skip_next) {
+ skip_next = false;
+ } else if (!in_ds && opcode_is_load(c) && c.i.op != OP_LWC2) {
+ next = block->opcode_list[i + 1].c;
+
+ if (c.i.op == OP_LWL && next.i.op == OP_LWR)
+ continue;
+
+ if (opcode_reads_register(next, c.i.rt)
+ && !opcode_writes_register(next, c.i.rs)) {
+ pr_debug("Swapping opcodes at offset 0x%x to "
+ "respect load delay\n", i << 2);
+
+ op = block->opcode_list[i];
+ block->opcode_list[i] = block->opcode_list[i + 1];
+ block->opcode_list[i + 1] = op;
+ skip_next = true;
+ }
}
+
+ in_ds = has_delay_slot(c);
}
- return ret;
+ return 0;
}
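/* The swap above is only legal under the core condition re-stated in this
 * sketch (helpers from optimizer.h; 'load' precedes 'next' in the block;
 * the real pass applies further restrictions, e.g. around LWL/LWR pairs
 * and delay slots): 'next' must consume the load's target register -- so
 * on real MIPS-I hardware it would have read the pre-load value anyway --
 * and must not clobber the load's address register. */
static bool can_swap_load_with_next(union code load, union code next)
{
	return opcode_reads_register(next, load.i.rt) &&
	       !opcode_writes_register(next, load.i.rs);
}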
static int lightrec_local_branches(struct lightrec_state *state, struct block *block)
{
+ const struct opcode *ds;
struct opcode *list;
unsigned int i;
s32 offset;
for (i = 0; i < block->nb_ops; i++) {
list = &block->opcode_list[i];
- if (should_emulate(list))
+ if (should_emulate(list) || !is_local_branch(block, i))
continue;
- switch (list->i.op) {
- case OP_BEQ:
- case OP_BNE:
- case OP_BLEZ:
- case OP_BGTZ:
- case OP_REGIMM:
- offset = i + 1 + (s16)list->i.imm;
- if (offset >= 0 && offset < block->nb_ops)
- break;
- fallthrough;
- default:
- continue;
- }
+ offset = i + 1 + (s16)list->c.i.imm;
pr_debug("Found local branch to offset 0x%x\n", offset << 2);
+ ds = get_delay_slot(block->opcode_list, i);
+ if (op_flag_load_delay(ds->flags) && opcode_is_load(ds->c)) {
+ pr_debug("Branch delay slot has a load delay - skip\n");
+ continue;
+ }
+
if (should_emulate(&block->opcode_list[offset])) {
pr_debug("Branch target must be emulated - skip\n");
continue;
{
switch (c.i.op) {
case OP_SPECIAL:
- case OP_META_MOV:
+ case OP_META:
return true;
default:
return false;
struct opcode *op;
s16 last_r[34], last_w[34], last_sync = 0, next_sync = 0;
u64 mask_r, mask_w, dirty = 0, loaded = 0;
- u8 reg;
+ u8 reg, load_delay_reg = 0;
memset(last_r, 0xff, sizeof(last_r));
memset(last_w, 0xff, sizeof(last_w));
for (i = 0; i < block->nb_ops; i++) {
op = &block->opcode_list[i];
+ if (OPT_HANDLE_LOAD_DELAYS && load_delay_reg) {
+ /* Handle delayed register write from load opcodes in
+ * delay slots */
+ last_w[load_delay_reg] = i;
+ load_delay_reg = 0;
+ }
+
if (op_flag_sync(op->flags) || should_emulate(op)) {
/* The next opcode has the SYNC flag set, or is a branch
* that should be emulated: unload all registers. */
mask_r = opcode_read_mask(op->c);
mask_w = opcode_write_mask(op->c);
+ if (op_flag_load_delay(op->flags) && opcode_is_load(op->c)) {
+ /* If we have a load opcode in a delay slot, its target
+ * register is actually not written there but at a
+ * later point, in the dispatcher. Prevent the algorithm
+ * from discarding its previous value. */
+ load_delay_reg = op->c.i.rt;
+ mask_w &= ~BIT(op->c.i.rt);
+ }
+
for (reg = 0; reg < 34; reg++) {
if (mask_r & BIT(reg)) {
if (dirty & BIT(reg) && last_w[reg] < last_sync) {
for (i = 0; i < block->nb_ops; i++) {
list = &block->opcode_list[i];
- lightrec_consts_propagate(block->opcode_list, i, v);
+ lightrec_consts_propagate(block, i, v);
switch (list->i.op) {
case OP_SB:
case OP_SH:
case OP_SW:
- if (OPT_FLAG_STORES) {
- /* Mark all store operations that target $sp or $gp
- * as not requiring code invalidation. This is based
- * on the heuristic that stores using one of these
- * registers as address will never hit a code page. */
- if (list->i.rs >= 28 && list->i.rs <= 29 &&
- !state->maps[PSX_MAP_KERNEL_USER_RAM].ops) {
- pr_debug("Flaging opcode 0x%08x as not "
- "requiring invalidation\n",
- list->opcode);
- list->flags |= LIGHTREC_NO_INVALIDATE;
- list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_DIRECT);
- }
+ /* Mark all store operations that target $sp or $gp
+ * as not requiring code invalidation. This is based
+ * on the heuristic that stores using one of these
+ * registers as address will never hit a code page. */
+ if (list->i.rs >= 28 && list->i.rs <= 29 &&
+ !state->maps[PSX_MAP_KERNEL_USER_RAM].ops) {
+				pr_debug("Flagging opcode 0x%08x as not requiring invalidation\n",
+ list->opcode);
+ list->flags |= LIGHTREC_NO_INVALIDATE;
+ }
- /* Detect writes whose destination address is inside the
- * current block, using constant propagation. When these
- * occur, we mark the blocks as not compilable. */
- if (is_known(v, list->i.rs) &&
- kunseg(v[list->i.rs].value) >= kunseg(block->pc) &&
- kunseg(v[list->i.rs].value) < (kunseg(block->pc) +
- block->nb_ops * 4)) {
- pr_debug("Self-modifying block detected\n");
- block_set_flags(block, BLOCK_NEVER_COMPILE);
- list->flags |= LIGHTREC_SMC;
- }
+ /* Detect writes whose destination address is inside the
+ * current block, using constant propagation. When these
+ * occur, we mark the blocks as not compilable. */
+ if (is_known(v, list->i.rs) &&
+ kunseg(v[list->i.rs].value) >= kunseg(block->pc) &&
+ kunseg(v[list->i.rs].value) < (kunseg(block->pc) + block->nb_ops * 4)) {
+ pr_debug("Self-modifying block detected\n");
+ block_set_flags(block, BLOCK_NEVER_COMPILE);
+ list->flags |= LIGHTREC_SMC;
}
fallthrough;
case OP_SWL:
case OP_LWL:
case OP_LWR:
case OP_LWC2:
- if (OPT_FLAG_IO &&
- (v[list->i.rs].known | v[list->i.rs].sign)) {
+ if (v[list->i.rs].known | v[list->i.rs].sign) {
psx_map = lightrec_get_constprop_map(state, v,
list->i.rs,
(s16) list->i.imm);
break;
}
}
+
+ if (!LIGHTREC_FLAGS_GET_IO_MODE(list->flags)
+ && list->i.rs >= 28 && list->i.rs <= 29
+ && !state->maps[PSX_MAP_KERNEL_USER_RAM].ops) {
+				/* Assume that I/O operations whose address
+				 * register is $sp or $gp always target mapped
+				 * memory (RAM, BIOS, scratchpad). */
+ list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_DIRECT);
+ }
+
fallthrough;
default:
break;
for (i = 0; i < block->nb_ops - 1; i++) {
list = &block->opcode_list[i];
- lightrec_consts_propagate(block->opcode_list, i, v);
+ lightrec_consts_propagate(block, i, v);
switch (list->i.op) {
case OP_SPECIAL:
IF_OPT(OPT_REMOVE_DIV_BY_ZERO_SEQ, &lightrec_remove_div_by_zero_check_sequence),
IF_OPT(OPT_REPLACE_MEMSET, &lightrec_replace_memset),
IF_OPT(OPT_DETECT_IMPOSSIBLE_BRANCHES, &lightrec_detect_impossible_branches),
+ IF_OPT(OPT_HANDLE_LOAD_DELAYS, &lightrec_handle_load_delays),
+ IF_OPT(OPT_HANDLE_LOAD_DELAYS, &lightrec_swap_load_delays),
IF_OPT(OPT_TRANSFORM_OPS, &lightrec_transform_branches),
IF_OPT(OPT_LOCAL_BRANCHES, &lightrec_local_branches),
IF_OPT(OPT_TRANSFORM_OPS, &lightrec_transform_ops),
IF_OPT(OPT_SWITCH_DELAY_SLOTS, &lightrec_switch_delay_slots),
- IF_OPT(OPT_FLAG_IO || OPT_FLAG_STORES, &lightrec_flag_io),
+ IF_OPT(OPT_FLAG_IO, &lightrec_flag_io),
IF_OPT(OPT_FLAG_MULT_DIV, &lightrec_flag_mults_divs),
IF_OPT(OPT_EARLY_UNLOAD, &lightrec_early_unload),
};
struct block;
struct opcode;
-_Bool opcode_reads_register(union code op, u8 reg);
-_Bool opcode_writes_register(union code op, u8 reg);
-_Bool has_delay_slot(union code op);
+__cnst _Bool opcode_reads_register(union code op, u8 reg);
+__cnst _Bool opcode_writes_register(union code op, u8 reg);
+__cnst u64 opcode_write_mask(union code op);
+__cnst _Bool has_delay_slot(union code op);
_Bool is_delay_slot(const struct opcode *list, unsigned int offset);
-_Bool load_in_delay_slot(union code op);
-_Bool opcode_is_io(union code op);
-_Bool is_unconditional_jump(union code c);
-_Bool is_syscall(union code c);
+__cnst _Bool opcode_is_mfc(union code op);
+__cnst _Bool opcode_is_load(union code op);
+__cnst _Bool opcode_is_io(union code op);
+__cnst _Bool is_unconditional_jump(union code c);
+__cnst _Bool is_syscall(union code c);
_Bool should_emulate(const struct opcode *op);
"lo", "hi",
};
+/* Forward declaration(s) */
+static void clean_reg(jit_state_t *_jit,
+ struct native_register *nreg, u8 jit_reg, bool clean);
+
const char * lightrec_reg_name(u8 reg)
{
return mips_regs[reg];
static void lightrec_unload_nreg(struct regcache *cache, jit_state_t *_jit,
struct native_register *nreg, u8 jit_reg)
{
- /* If we get a dirty register, store back the old value */
- if (nreg->prio == REG_IS_DIRTY) {
- s16 offset = offsetof(struct lightrec_state, regs.gpr)
- + (nreg->emulated_register << 2);
-
- jit_stxi_i(offset, LIGHTREC_REG_STATE, jit_reg);
- }
-
+ clean_reg(_jit, nreg, jit_reg, false);
lightrec_discard_nreg(nreg);
}
static void clean_reg(jit_state_t *_jit,
struct native_register *nreg, u8 jit_reg, bool clean)
{
+ /* If we get a dirty register, store back the old value */
if (nreg->prio == REG_IS_DIRTY) {
s16 offset = offsetof(struct lightrec_state, regs.gpr)
+ (nreg->emulated_register << 2);
}
}
+bool lightrec_reg_is_loaded(struct regcache *cache, u16 reg)
+{
+ return !!find_mapped_reg(cache, reg, false);
+}
+
void lightrec_clean_reg_if_loaded(struct regcache *cache, jit_state_t *_jit,
u16 reg, bool unload)
{
#include "lightning-wrapper.h"
-#define NUM_REGS (JIT_V_NUM - 1)
-#define LIGHTREC_REG_STATE (JIT_V(JIT_V_NUM - 1))
+#if defined(__sh__)
+# define NUM_REGS JIT_V_NUM
+# define LIGHTREC_REG_STATE _GBR
+#else
+# define NUM_REGS (JIT_V_NUM - 1)
+# define LIGHTREC_REG_STATE (JIT_V(JIT_V_NUM - 1))
+#endif
#if defined(__powerpc__)
# define NUM_TEMPS JIT_R_NUM
void lightrec_storeback_regs(struct regcache *cache, jit_state_t *_jit);
_Bool lightrec_has_dirty_regs(struct regcache *cache);
+_Bool lightrec_reg_is_loaded(struct regcache *cache, u16 reg);
void lightrec_clean_reg_if_loaded(struct regcache *cache, jit_state_t *_jit,
u16 reg, _Bool unload);
void lightrec_discard_reg_if_loaded(struct regcache *cache, u16 reg);
struct regcache * lightrec_regcache_init(struct lightrec_state *state);
void lightrec_free_regcache(struct regcache *cache);
-const char * lightrec_reg_name(u8 reg);
+__cnst const char * lightrec_reg_name(u8 reg);
void lightrec_regcache_mark_live(struct regcache *cache, jit_state_t *_jit);