#define FC 24 // emulated processor flags: C (bit 0), others 0
#define FV 25 // emulated processor flags: Nt^Ns (bit 31), others x
+// All operations except ptr ops use only the lower 32 bits of the registers.
+// The upper 32 bits always contain the sign extension from the lower 32 bits.
// unified conditions; virtual, not corresponding to anything real on MIPS
#define DCOND_EQ 0x0
emith_lohi_nops(); \
EMIT(MIPS_MULT(s1, s2)); \
EMIT(MIPS_MFLO(AT)); \
- emith_add_r_r(dlo, AT); \
- EMIT(MIPS_SLTU_REG(t_, dlo, AT)); \
- EMIT(MIPS_MFHI(AT)); \
+ EMIT(MIPS_MFHI(t_)); \
last_lohi = (u8 *)tcache_ptr; \
+ emith_add_r_r(dlo, AT); \
+ EMIT(MIPS_SLTU_REG(AT, dlo, AT)); \
emith_add_r_r(dhi, AT); \
emith_add_r_r(dhi, t_); \
rcache_free_tmp(t_); \
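/* Sketch, not part of the emitter: plain C for what the sequence above
 * computes, i.e. (dhi:dlo) += (s64)s1 * (s64)s2. SLTU recovers the carry
 * of the low word add: after dlo += lo, (dlo < lo) is 1 iff the add
 * wrapped. mips_mula_sketch is an invented name, for illustration only. */
static inline void mips_mula_sketch(u32 *dlo, u32 *dhi, s32 s1, s32 s2)
{
	u64 prod = (u64)((s64)s1 * s2);	// MULT: HI:LO = s1 * s2
	u32 lo = (u32)prod;		// MFLO
	u32 hi = (u32)(prod >> 32);	// MFHI
	*dlo += lo;
	*dhi += (*dlo < lo);		// SLTU: carry out of the low add
	*dhi += hi;
}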
// NB: ABI SP alignment is 8 for compatibility with MIPS IV
#define emith_push_ret(r) do { \
- emith_add_r_r_ptr_imm(SP, SP, -8-16); /* ABI: 16 byte arg save area */ \
+ emith_add_r_r_ptr_imm(SP, SP, -8-16); /* O32: 16 byte arg save area */ \
emith_write_r_r_offs(LR, SP, 4+16); \
if ((r) > 0) emith_write_r_r_offs(r, SP, 0+16); \
} while (0)
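/* This push is presumably paired with a pop/return counterpart; a sketch
 * of what it could look like (emith_pop_and_ret is an assumed name here,
 * offsets mirror the push above): */
#define emith_pop_and_ret(r) do { \
	if ((r) > 0) emith_read_r_r_offs(r, SP, 0+16); \
	emith_read_r_r_offs(LR, SP, 4+16); \
	emith_add_r_r_ptr_imm(SP, SP, 8+16); \
	emith_ret(); \
} while (0)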
#define FC 29 // emulated processor flags: C (bit 0), others 0
#define FV 28 // emulated processor flags: Nt^Ns (bit 31), others x
+// All operations except ptr ops use only the lower 32 bits of the registers.
+// The upper 32 bits always contain the sign extension from the lower 32 bits.
// unified conditions; virtual, not corresponding to anything real on RISC-V
#define DCOND_EQ 0x0
// NB: must split 64 bit result into two 32 bit registers
// NB: expects 32 bit values in s1+s2, correctly sign extended to 64 bits
#define EMIT_R5_MULLU_REG(dlo, dhi, s1, s2) do { \
- /*EMIT(R5_ADDW_IMM(s1, s1, 0));*/ \
- /*EMIT(R5_ADDW_IMM(s2, s2, 0));*/ \
EMIT(R5_MUL(dlo, s1, s2)); \
EMIT(R5_ASR_IMM(dhi, dlo, 32)); \
- EMIT(R5_LSL_IMM(dlo, dlo, 32)); \
- EMIT(R5_ASR_IMM(dlo, dlo, 32)); \
+ EMIT(R5_ADDW_IMM(dlo, dlo, 0)); \
} while (0)
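/* Sketch, not part of the emitter: plain C for what EMIT_R5_MULLU_REG
 * emits on RV64. ADDW with immediate 0 sign extends the low 32 bits in a
 * single insn, replacing the former LSL #32 / ASR #32 pair while keeping
 * the upper-32-bits-hold-the-sign-extension convention noted above.
 * r5_mullu_sketch is an invented name, for illustration only. */
static inline void r5_mullu_sketch(s64 *dlo, s64 *dhi, s64 s1, s64 s2)
{
	s64 prod = s1 * s2;	// R5_MUL: low 64 bits of the product
	*dhi = prod >> 32;	// R5_ASR_IMM: high word, sign extended
	*dlo = (s32)prod;	// R5_ADDW_IMM #0: sign extended low word
}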
#define EMIT_R5_MULLS_REG(dlo, dhi, s1, s2) \
static inline int emith_pool_literal(uintptr_t imm)
{
int idx = literal_pindex - 8; // max look behind in pool
- // see if one of the last literals was the same (or close enough)
+ // see if one of the last literals was the same
for (idx = (idx < 0 ? 0 : idx); idx < literal_pindex; idx++)
if (imm == literal_pool[idx])
break;
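	/* presumed continuation (a sketch, not verified against the source):
	 * append the literal if nothing recent matched, then return the index
	 *
	 *   if (idx == literal_pindex)	// no match, store new literal
	 *     literal_pool[literal_pindex++] = imm;
	 *   return idx;
	 */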
* See COPYING file in the top-level directory.
*
* notes:
- * - tcache, block descriptor, link buffer overflows result in sh2_translate()
- * failure, followed by full tcache invalidation for that region
+ * - tcache, block descriptor, block entry buffer overflows result in oldest
+ * blocks being deleted until enough space is available
+ * - link and list element buffer overflows result in failure and exit
* - jumps between blocks are tracked for SMC handling (in block_entry->links),
- * except jumps between different tcaches
+ * except jumps from global to CPU-local tcaches
*
* implemented:
* - static register allocation
* - remaining register caching and tracking in temporaries
* - block-local branch linking
- * - block linking (except between tcaches)
+ * - block linking
* - some constant propagation
+ * - call stack caching for host block entry address
+ * - delay, poll, and idle loop detection and handling
+ * - some T/M flag optimizations where the value is known or isn't used
*
* TODO:
* - better constant propagation
- * - stack caching?
* - bug fixing
*/
#include <stddef.h>
if (be != NULL)
dbg(1, "block override for %08x", addr);
  if (block_ring[tcache_id].used + 1 > block_ring[tcache_id].size ||
entry_ring[tcache_id].used + entries > entry_ring[tcache_id].size) {
dbg(1, "bd overflow for tcache %d", tcache_id);
return NULL;
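  /* per the notes above, the caller is then expected to evict the oldest
   * blocks and retry until the rings have room again; hypothetical shape
   * (names invented for illustration, not the actual code):
   *   while ((be = dr_add_block(...)) == NULL)
   *     dr_free_oldest_block(tcache_id);
   */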
static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id)
{
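  // NB: the arrays below are static to keep this large state off the stack;
  // this assumes sh2_translate isn't reentered (the drc runs single threaded)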
// branch targets in current block
- struct linkage branch_targets[MAX_LOCAL_TARGETS];
+ static struct linkage branch_targets[MAX_LOCAL_TARGETS];
int branch_target_count = 0;
// unresolved local or external targets with block link/exit area if needed
- struct linkage blx_targets[MAX_LOCAL_BRANCHES];
+ static struct linkage blx_targets[MAX_LOCAL_BRANCHES];
int blx_target_count = 0;
- u8 op_flags[BLOCK_INSN_LIMIT];
+ static u8 op_flags[BLOCK_INSN_LIMIT];
enum flg_states { FLG_UNKNOWN, FLG_UNUSED, FLG_0, FLG_1 };
struct drcf {
#if LOOP_OPTIMIZER
// loops with pinned registers for optimization
// pinned regs are like statics and don't need saving/restoring inside a loop
- struct linkage pinned_loops[MAX_LOCAL_TARGETS/16];
+ static struct linkage pinned_loops[MAX_LOCAL_TARGETS/16];
int pinned_loop_count = 0;
#endif
// no sense in looking any further than the next rcache flush
tmp = ((op_flags[i+v] & OF_BTARGET) || (op_flags[i+v-1] & OF_DELAY_OP) ||
(OP_ISBRACND(opd[v-1].op) && !(op_flags[i+v] & OF_DELAY_OP)));
+ // XXX looking behind cond branch to avoid evicting regs used later?
if (pc + 2*v <= end_pc && !tmp) { // (pc already incremented above)
late |= opd[v].source & ~write;
// ignore source regs after they have been written to
rcache_invalidate();
}
} else
+ // no space for resolving forward branch, handle it as external
dbg(1, "warning: too many unresolved branches");
}
EMITH_JMP_START(emith_invert_cond(cond));
if (bl) {
bl->jump = tcache_ptr;
+ emith_flush(); // flush to inhibit insn swapping
bl->type = BL_LDJMP;
}
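  /* the recorded jump site is presumably patched to the real target once
   * that block is compiled, via the emitter's jump patching (the exact
   * call isn't shown in this excerpt) */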
tmp = rcache_get_tmp_arg(0);
i = tcache_ptr - tcache;
RING_INIT(&tcache_ring[0], tcache_ptr, tcache_sizes[0] - i);
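  // chain each ring directly behind the previous one, using the previous
  // ring's actual size (ring 0 above was shrunk by the space already used)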
for (i = 1; i < ARRAY_SIZE(tcache_ring); i++) {
- RING_INIT(&tcache_ring[i], tcache_ring[i-1].base + tcache_sizes[i-1],
+ RING_INIT(&tcache_ring[i], tcache_ring[i-1].base + tcache_ring[i-1].size,
tcache_sizes[i]);
}