From 83bafe8e0b62ab02850011c443c1086d61e96d71 Mon Sep 17 00:00:00 2001
From: kub
Date: Thu, 25 Apr 2019 18:57:18 +0200
Subject: [PATCH] add literal pool to sh2 drc (for armv[456] without MOVT/W)

---
Review notes (this section is dropped by git-am and is not part of the
commit message):
 - emith_pool_literal(): the "close enough" test no longer goes through
   abs((int)...). A pooled literal exactly 0x80000000 away from imm ended
   up in abs(INT_MIN), which is undefined and typically yields a negative
   value that passed the "<= 0xff" test. The range check is now done in
   unsigned arithmetic.
 - emith_pool_check(): literal_insn[0] is only inspected while literal
   loads are pending; otherwise it is NULL or left over from a pool that
   was already committed.
 - Both changes are line-for-line, so the hunk headers and the diffstat
   below are unchanged. The post-image blob id on the emit_arm.c "index"
   line still names the unamended file.

 cpu/drc/emit_arm.c         | 125 ++++++++++++++++++++++++++++++-------
 cpu/drc/emit_x86.c         |   3 +
 cpu/sh2/compiler.c         |  10 ++-
 pico/carthw/svp/compiler.c |   1 +
 4 files changed, 114 insertions(+), 25 deletions(-)

diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c
index 4744b127..d8674a03 100644
--- a/cpu/drc/emit_arm.c
+++ b/cpu/drc/emit_arm.c
@@ -261,13 +261,30 @@
 #define EOP_MOVT(rd,imm) \
 	EMIT(0xe3400000 | ((rd)<<12) | (((imm)>>16)&0xfff) | (((imm)>>12)&0xf0000))
 
-static int count_bits(unsigned val)
+static inline int count_bits(unsigned val)
 {
-	val = (val & 0x55555555) + ((val >> 1) & 0x55555555);
+	val = val - ((val >> 1) & 0x55555555);
 	val = (val & 0x33333333) + ((val >> 2) & 0x33333333);
-	val = (val & 0x0f0f0f0f) + ((val >> 4) & 0x0f0f0f0f);
-	val = (val & 0x00ff00ff) + ((val >> 8) & 0x00ff00ff);
-	return (val & 0xffff) + (val >> 16);
+	return (((val + (val >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
+}
+
+// host literal pool; must be significantly smaller than 1024 (max LDR offset = 4096)
+#define MAX_HOST_LITERALS	128
+static u32 literal_pool[MAX_HOST_LITERALS];
+static u32 *literal_insn[MAX_HOST_LITERALS];
+static int literal_pindex, literal_iindex;
+
+static int emith_pool_literal(u32 imm, int *offs)
+{
+	int idx = literal_pindex - 8; // max look behind in pool
+	// see if one of the last literals was the same (or close enough)
+	for (idx = (idx < 0 ? 0 : idx); idx < literal_pindex; idx++)
+		if ((u32)(imm - literal_pool[idx] + 0xff) <= 2*0xff) // |delta| <= 0xff, wrap-safe (no abs(INT_MIN))
+			break;
+	if (idx == literal_pindex) // store new literal
+		literal_pool[literal_pindex++] = imm;
+	*offs = imm - literal_pool[idx]; // -0xff..0xff, caller fixes it up with ADD/SUB after the load
+	return idx; // pool slot the caller loads from
 }
 
 // XXX: RSB, *S will break if 1 insn is not enough
@@ -275,6 +292,7 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int
 {
 	int ror2;
 	u32 v;
+	int i;
 
 	switch (op) {
 	case A_OP_MOV:
@@ -284,19 +302,48 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int
 			imm = ~imm;
 			op = A_OP_MVN;
 		}
-#ifdef HAVE_ARMV7
-		for (v = imm, ror2 = 0; v && !(v & 3); v >>= 2)
-			ror2--;
-		if (v >> 8) {
-			/* 2+ insns needed - prefer movw/movt */
+		// count insns needed for mov/orr #imm
+		for (v = imm, ror2 = 0; (v >> 24) && ror2 < 32/2; ror2++)
+			v = (v << 2) | (v >> 30);
+		for (i = 2; i > 0; i--, v >>= 8)
+			while (v > 0xff && !(v & 3))
+				v >>= 2;
+		if (v) { // 3+ insns needed...
 			if (op == A_OP_MVN)
 				imm = ~imm;
+#ifdef HAVE_ARMV7
+			// ...prefer movw/movt
 			EOP_MOVW(rd, imm);
 			if (imm & 0xffff0000)
 				EOP_MOVT(rd, imm);
+#else
+			// ...emit literal load
+			int idx, o;
+			if (literal_iindex >= MAX_HOST_LITERALS) {
+				elprintf(EL_STATUS|EL_SVP|EL_ANOMALY,
+					"pool overflow");
+				exit(1);
+			}
+			idx = emith_pool_literal(imm, &o);
+			literal_insn[literal_iindex++] = (u32 *)tcache_ptr;
+			EOP_LDR_IMM2(cond, rd, 15, idx * sizeof(u32));
+			if (o > 0)
+				EOP_C_DOP_IMM(cond, A_OP_ADD, 0, rd, rd, 0, o);
+			else if (o < 0)
+				EOP_C_DOP_IMM(cond, A_OP_SUB, 0, rd, rd, 0, -o);
+#endif
 			return;
 		}
-#endif
+		break;
+
+	case A_OP_AND:
+		// AND must fit into 1 insn. if not, use BIC
+		for (v = imm, ror2 = 0; (v >> 8) && ror2 < 32/2; ror2++)
+			v = (v << 2) | (v >> 30);
+		if (v >> 8) {
+			imm = ~imm;
+			op = A_OP_BIC;
+		}
 		break;
 
 	case A_OP_SUB:
@@ -314,20 +361,13 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int
 		break;
 	}
 
-	again:
-	v = imm, ror2 = 32/2; // arm imm shift is ROR, so rotate for best fit
-	while ((v >> 24) && !(v & 0xc0))
-		v = (v << 2) | (v >> 30), ror2++;
+	// try to get the topmost byte empty to possibly save an insn
+	for (v = imm, ror2 = 0; (v >> 24) && ror2 < 32/2; ror2++)
+		v = (v << 2) | (v >> 30);
 	do {
 		// shift down to get 'best' rot2
 		while (v > 0xff && !(v & 3))
 			v >>= 2, ror2--;
-		// AND must fit into 1 insn. if not, use BIC
-		if (op == A_OP_AND && v != (v & 0xff)) {
-			imm = ~imm;
-			op = A_OP_BIC;
-			goto again;
-		}
 		EOP_C_DOP_IMM(cond, op, s, rn, rd, ror2 & 0xf, v & 0xff);
 
 		switch (op) {
@@ -385,6 +425,47 @@ static int emith_xbranch(int cond, void *target, int is_call)
 	return (u32 *)tcache_ptr - start_ptr;
 }
 
+static void emith_pool_commit(int jumpover)
+{
+	int i, sz = literal_pindex * sizeof(u32);
+	u8 *pool = (u8 *)tcache_ptr;
+
+	// nothing to commit if pool is empty
+	if (sz == 0)
+		return;
+	// need branch over pool if not at block end
+	if (jumpover) {
+		pool += sizeof(u32);
+		emith_xbranch(A_COND_AL, (u8 *)pool + sz, 0);
+	}
+	// safety check - pool must be after insns and reachable
+	if ((u32)(pool - (u8 *)literal_insn[0] + 8) > 0xfff) {
+		elprintf(EL_STATUS|EL_SVP|EL_ANOMALY,
+			"pool offset out of range");
+		exit(1);
+	}
+	// copy pool and adjust addresses in insns accessing the pool
+	memcpy(pool, literal_pool, sz);
+	for (i = 0; i < literal_iindex; i++) {
+		*literal_insn[i] += (u8 *)pool - ((u8 *)literal_insn[i] + 8);
+	}
+	// count pool constants as insns for statistics
+	for (i = 0; i < literal_pindex; i++)
+		COUNT_OP;
+
+	tcache_ptr = (void *)((u8 *)pool + sz);
+	literal_pindex = literal_iindex = 0;
+}
+
+static inline void emith_pool_check(void)
+{
+	// check if pool must be committed; literal_insn[0] is only valid while loads are pending
+	if (literal_iindex > MAX_HOST_LITERALS-4 || (literal_iindex > 0 &&
+	    (u8 *)tcache_ptr - (u8 *)literal_insn[0] > 0xe00))
+		// pool full, or displacement is approaching the limit
+		emith_pool_commit(1);
+}
+
 #define JMP_POS(ptr) \
 	ptr = tcache_ptr; \
 	tcache_ptr += sizeof(u32)
@@ -769,7 +850,7 @@ static int emith_xbranch(int cond, void *target, int is_call)
 		b_ = tmpr; \
 	} \
 	op(b_,v_); \
-} while(0)
+} while (0)
 
 #define emith_ctx_read_multiple(r, offs, count, tmpr) \
 	emith_ctx_do_multiple(EOP_LDMIA, r, offs, count, tmpr)
diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c
index ce13c618..1ac4ee01 100644
--- a/cpu/drc/emit_x86.c
+++ b/cpu/drc/emit_x86.c
@@ -1104,3 +1104,6 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI };
 	EMITH_SJMP_END(DCOND_EQ); \
 	EMITH_SJMP_END(DCOND_EQ); \
 } while (0)
+
+#define emith_pool_check()	/**/
+#define emith_pool_commit(j)	/**/
diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c
index fa0a6b71..bc63e18b 100644
--- a/cpu/sh2/compiler.c
+++ b/cpu/sh2/compiler.c
@@ -369,7 +369,7 @@ enum {
   HR_STATIC, // vreg has a static mapping
   HR_CACHED, // vreg has sh2_reg_e
   HR_TEMP,   // reg used for temp storage
-} cach_reg_type;
+} cache_reg_type;
 
 enum {
   HRF_DIRTY  = 1 << 0, // has "dirty" value to be written to ctx
@@ -2569,8 +2569,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id)
     return NULL;
 
   block_entry_ptr = tcache_ptr;
-  dbg(2, "== %csh2 block #%d,%d crc %04x %08x-%08x,%08x-%08x -> %p", sh2->is_slave ? 's' : 'm',
-    tcache_id, blkid_main, crc, base_pc, end_pc, base_literals, end_literals, block_entry_ptr);
+  dbg(2, "== %csh2 block #%d,%d %08x-%08x,%08x-%08x -> %p", sh2->is_slave ? 's' : 'm',
+    tcache_id, blkid_main, base_pc, end_pc, base_literals, end_literals, block_entry_ptr);
 
   // clear stale state after compile errors
 
@@ -2715,6 +2715,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id)
     }
 #endif
 
+    emith_pool_check();
     pc += 2;
 
     if (skip_op > 0) {
@@ -3892,6 +3893,8 @@ end_op:
     emith_jump_patch(branch_patch_ptr[i], target);
   }
 
+  emith_pool_commit(0);
+
   dr_mark_memory(1, block, tcache_id, 0);
 
   tcache_ptrs[tcache_id] = tcache_ptr;
@@ -4124,6 +4127,7 @@ static void sh2_generate_utils(void)
   MAKE_WRITE_WRAPPER(sh2_drc_write32);
 #endif
 
+  emith_pool_commit(0);
   rcache_invalidate();
 #if (DRC_DEBUG & 4)
   host_dasm_new_symbol(sh2_drc_entry);
diff --git a/pico/carthw/svp/compiler.c b/pico/carthw/svp/compiler.c
index b31197c2..1ec71e75 100644
--- a/pico/carthw/svp/compiler.c
+++ b/pico/carthw/svp/compiler.c
@@ -1795,6 +1795,7 @@ void *ssp_translate_block(int pc)
 	tr_flush_dirty_ST();
 	tr_flush_dirty_pmcrs();
 	block_end = emit_block_epilogue(ccount, end_cond, jump_pc, pc);
+	emith_pool_commit(0);
 
 	if (tcache_ptr - (u32 *)tcache > DRC_TCACHE_SIZE/4) {
 		elprintf(EL_ANOMALY|EL_STATUS|EL_SVP, "tcache overflow!\n");
-- 
2.39.2