From 0e12269073557d8e7bc6e917db0d362d8552237a Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 21 Dec 2019 16:33:52 +0100 Subject: [PATCH] sh2 drc: optimize T bit handling for A64 --- Makefile | 5 ++-- cpu/drc/emit_arm64.c | 64 +++++++++++++++++++++++++++++--------------- cpu/drc/emit_riscv.c | 5 +--- cpu/sh2/compiler.c | 12 ++++----- 4 files changed, 53 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index 0a0ab127..49116ce0 100644 --- a/Makefile +++ b/Makefile @@ -36,10 +36,11 @@ endif ifeq ("$(PLATFORM)",$(filter "$(PLATFORM)","gp2x" "opendingux" "rpi1")) # very small caches, avoid optimization options making the binary much bigger -CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp +CFLAGS += -finline-limit=43 -fno-unroll-loops -fno-ipa-cp -ffast-math # this gets you about 20% better execution speed on 32bit arm/mips -CFLAGS += -fno-common -fno-stack-protector -fno-guess-branch-probability -fno-caller-saves -fno-tree-loop-if-convert -ffast-math +CFLAGS += -fno-common -fno-stack-protector -fno-guess-branch-probability -fno-caller-saves -fno-tree-loop-if-convert -fno-regmove endif +#OBJS += align.o # default settings ifeq "$(ARCH)" "arm" diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 8d1a7dd1..2e873161 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -44,10 +44,11 @@ #define A64_COND_LE 0xd #define A64_COND_CS A64_COND_HS #define A64_COND_CC A64_COND_LO +// "fake" conditions for T bit handling #define A64_COND_AL 0xe #define A64_COND_NV 0xf -/* unified conditions */ +// DRC conditions #define DCOND_EQ A64_COND_EQ #define DCOND_NE A64_COND_NE #define DCOND_MI A64_COND_MI @@ -261,6 +262,13 @@ enum { XT_UXTW=0x4, XT_UXTX=0x6, XT_LSL=0x7, XT_SXTW=0xc, XT_SXTX=0xe }; #define A64_BCOND(cond, offs19) \ A64_INSN(0xa,0x2,_,_,_,_,_,(offs19) >> 2,(cond)) +// conditional select + +#define A64_CINC(cond, rn, rm) \ + A64_INSN(0xd,0x0,0x2,0,rm,(cond)^1,0x1,rm,rn) /* CSINC */ +#define A64_CSET(cond, rn) \ + A64_CINC(cond, rn, Z0) + // load pc-relative #define A64_LDRLIT_IMM(rd, offs19) \ @@ -1356,38 +1364,52 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #ifdef T // T bit handling +static int tcond = -1; + #define emith_invert_cond(cond) \ ((cond) ^ 1) -static void emith_clr_t_cond(int sr) -{ - emith_bic_r_imm(sr, T); -} +#define emith_clr_t_cond(sr) \ + (void)sr -static void emith_set_t_cond(int sr, int cond) -{ - EMITH_SJMP_START(emith_invert_cond(cond)); - emith_or_r_imm_c(cond, sr, T); - EMITH_SJMP_END(emith_invert_cond(cond)); -} +#define emith_set_t_cond(sr, cond) \ + tcond = cond -#define emith_get_t_cond() -1 +#define emith_get_t_cond() \ + tcond -#define emith_sync_t(sr) ((void)sr) +#define emith_invalidate_t() \ + tcond = -1 -#define emith_invalidate_t() +#define emith_set_t(sr, val) \ + tcond = ((val) ? A64_COND_AL: A64_COND_NV) -static void emith_set_t(int sr, int val) +static void emith_sync_t(int sr) { - if (val) - emith_or_r_imm(sr, T); - else - emith_bic_r_imm(sr, T); + if (tcond == A64_COND_AL) + emith_or_r_imm(sr, T); + else if (tcond == A64_COND_NV) + emith_bic_r_imm(sr, T); + else if (tcond >= 0) { + int tmp = rcache_get_tmp(); + EMIT(A64_CSET(tcond, tmp)); + EMIT(A64_BFI_IMM(sr, tmp, 0, 1)); // assumes SR.T = bit 0 + rcache_free_tmp(tmp); + } + tcond = -1; } static int emith_tst_t(int sr, int tf) { - emith_tst_r_imm(sr, T); - return tf ? DCOND_NE: DCOND_EQ; + if (tcond < 0) { + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; + } else if (tcond >= A64_COND_AL) { + // MUST sync because A64_COND_AL/NV isn't a real condition + emith_sync_t(sr); + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; + } else + return tf ? tcond : emith_invert_cond(tcond); } #endif diff --git a/cpu/drc/emit_riscv.c b/cpu/drc/emit_riscv.c index 90234b22..69ed530e 100644 --- a/cpu/drc/emit_riscv.c +++ b/cpu/drc/emit_riscv.c @@ -87,8 +87,6 @@ enum { F1_B, F1_H, F1_W, F1_D, F1_BU, F1_HU, F1_WU }; // LD/ST // func7 enum { F2_ALT=0x20, F2_MULDIV=0x01 }; -#define __(n) o##n // enum marker for "undefined" - #define R5_NOP R5_I_INSN(OP_IMM, F1_ADD, Z0, Z0, 0) // nop: ADDI r0, r0, #0 // arithmetic/logical @@ -687,9 +685,8 @@ static void emith_pool_check(void) static void emith_move_imm(int r, uintptr_t imm) { - u32 lui = imm + _CB(imm,1,11,12); + u32 lui = imm + _CB(imm,1,11,12); // compensate for ADDI sign extension if (lui >> 12) { - // take out the effect of the sign extension of ADDI EMIT(R5_MOVT_IMM(r, lui)); if (imm & 0xfff) EMIT(R5_ADD_IMM(r, r, imm)); diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index ca9a0550..bd3e5b43 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -446,7 +446,6 @@ static void rcache_free_tmp(int hr); // there must be at least 3 PARAM, and PARAM+TEMPORARY must be at least 4. // SR must and R0 should by all means be statically mapped. // XXX the static definition of SR MUST match that in compiler.h -// PC and PR must not be statically mapped (accessed in context by utils). #ifdef __arm__ #include "../drc/emit_arm.c" @@ -3365,7 +3364,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_get_reg_arg(2, SHR_SR, NULL); tmp2 = rcache_get_tmp_arg(0); tmp3 = rcache_get_tmp_arg(1); - tmp4 = rcache_get_tmp_arg(3); + tmp4 = rcache_get_tmp(); emith_move_r_ptr_imm(tmp2, tcache_ptr); emith_move_r_r_ptr(tmp3, CONTEXT_REG); emith_move_r_imm(tmp4, pc); @@ -5049,11 +5048,12 @@ static void sh2_generate_utils(void) emith_add_r_imm(arg2, (u32)(2*sizeof(void *))); emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *)); emith_ctx_write(arg2, offsetof(SH2, rts_cache_idx)); - emith_add_r_r_r_lsl_ptr(arg2, CONTEXT_REG, arg2, 0); - emith_ctx_read(arg3, SHR_PR * 4); + emith_add_r_r_r_lsl_ptr(arg3, CONTEXT_REG, arg2, 0); + rcache_get_reg_arg(2, SHR_PR, NULL); emith_add_r_ret(arg1); - emith_write_r_r_offs_ptr(arg1, arg2, offsetof(SH2, rts_cache)+sizeof(void *)); - emith_write_r_r_offs(arg3, arg2, offsetof(SH2, rts_cache)); + emith_write_r_r_offs_ptr(arg1, arg3, offsetof(SH2, rts_cache)+sizeof(void *)); + emith_write_r_r_offs(arg2, arg3, offsetof(SH2, rts_cache)); + rcache_flush(); emith_ret(); emith_flush(); -- 2.39.5