ifeq ("$(PLATFORM)",$(filter "$(PLATFORM)","gp2x" "opendingux" "rpi1"))
# very small caches: avoid optimization options that make the binary much bigger
-CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp -fno-common -fno-stack-protector -ffast-math
+CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp
+# these options give roughly 20% better execution speed on 32 bit ARM/MIPS
+CFLAGS += -fno-common -fno-stack-protector -fno-guess-branch-probability -fno-caller-saves -fno-tree-loop-if-convert -ffast-math
endif
# default settings
#define M5(x,y,z,a,b) (M4(x,y,z,a)|M1(b))
#define M10(a,b,c,d,e,f,g,h,i,j) (M5(a,b,c,d,e)|M5(f,g,h,i,j))
+// sys_cacheflush always flushes whole pages and is rather expensive on ARM.
+// Keep a short list of pending ranges and merge adjacent/overlapping requests,
+// so that far fewer cacheflush calls need to be issued.
+static struct { void *base, *end; } pageflush[4];
+static unsigned pagesize = 4096;
+
+static void emith_update_cache(void)
+{
+ int i;
+
+ for (i = 0; i < 4 && pageflush[i].base; i++) {
+ cache_flush_d_inval_i(pageflush[i].base, pageflush[i].end + pagesize-1);
+ pageflush[i].base = NULL;
+ }
+}
+
+static inline void emith_update_add(void *base, void *end)
+{
+ void *p_base = (void *)((uintptr_t)(base) & ~(pagesize-1));
+ void *p_end = (void *)((uintptr_t)(end ) & ~(pagesize-1));
+ int i;
+
+ for (i = 0; i < 4 && pageflush[i].base; i++) {
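+ /* new range overlaps or directly adjoins the top of entry i: extend it upward */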
+ if (p_base <= pageflush[i].end+pagesize && p_end >= pageflush[i].end) {
+ if (p_base < pageflush[i].base) pageflush[i].base = p_base;
+ pageflush[i].end = p_end;
+ return;
+ }
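+ /* new range overlaps or directly adjoins the bottom of entry i: extend it downward */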
+ if (p_base <= pageflush[i].base && p_end >= pageflush[i].base-pagesize) {
+ if (p_end > pageflush[i].end) pageflush[i].end = p_end;
+ pageflush[i].base = p_base;
+ return;
+ }
+ }
+ if (i == 4) {
+ /* list full and not mergeable -> flush list */
+ emith_update_cache();
+ i = 0;
+ }
+ pageflush[i].base = p_base, pageflush[i].end = p_end;
+}
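A minimal usage sketch (editor's illustration, not part of the patch) of how a translator driver would typically drive the deferred flush list. The drc_prepare() and translate_block() names are hypothetical, and pagesize would normally be queried from the OS rather than left at the assumed 4096:

    #include <unistd.h>

    static void translate_block(unsigned pc);    /* hypothetical emitter */

    void *drc_prepare(unsigned pc)
    {
        void *start, *end;

        pagesize = sysconf(_SC_PAGESIZE);        /* assumption: POSIX host */

        start = tcache_ptr;
        translate_block(pc);                     /* emits code, advances tcache_ptr */
        end = tcache_ptr;

        emith_update_add(start, end);            /* record dirty range, merging with neighbours */
        emith_update_cache();                    /* one flush pass before executing translated code */
        return start;
    }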
+
// peephole optimizer. ATM only tries to reduce interlock
#define EMIT_CACHE_SIZE 3
struct emit_op {
static int emit_index;
#define emith_insn_ptr() (u8 *)((u32 *)tcache_ptr-emit_index)
-static int emith_pool_index(int tcache_offs);
-static void emith_pool_adjust(int pool_index, int move_offs);
+static inline int emith_pool_index(int tcache_offs);
+static inline void emith_pool_adjust(int pool_index, int move_offs);
static NOINLINE void EMIT(u32 op, u32 dst, u32 src)
{
(u8 *)ptr; \
})
+#define emith_jump_cond_inrange(target) !0
#define emith_jump_patch_size() 4
#define emith_jump_at(ptr, target) do { \
} while (0)
#define host_instructions_updated(base, end) \
- cache_flush_d_inval_i(base, end)
+ emith_update_add(base, end)
#define host_arg2reg(rd, arg) \
rd = arg
#define emith_jump_cond_patchable(cond, target) \
emith_bcond(tcache_ptr, 1, cond, target)
+#define emith_jump_cond_inrange(target) \
+ !(((u8 *)target - (u8 *)tcache_ptr + 0x100000) >> 22)
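For readers unfamiliar with the bias-and-shift idiom used above, a tiny standalone sketch (editor's illustration; cond_branch_inrange() is just an illustration name, the constants are copied from the macro). The test is true exactly when the displacement plus 0x100000 fits in 22 bits, i.e. when it lies roughly in [-0x100000, +0x300000):

    #include <stdio.h>
    #include <stdint.h>

    /* true iff -0x100000 <= disp < 0x300000;
       assumes an arithmetic right shift for negative values, as the macro itself does */
    static int cond_branch_inrange(intptr_t disp)
    {
        return !((disp + 0x100000) >> 22);
    }

    int main(void)
    {
        printf("%d %d %d\n", cond_branch_inrange(0x1000),
               cond_branch_inrange(-0x100000),
               cond_branch_inrange(0x400000));   /* prints: 1 1 0 */
        return 0;
    }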
+
#define emith_jump_patch(ptr, target) ({ \
u32 *ptr_ = (u32 *)ptr; \
u32 disp_ = (u8 *)(target) - (u8 *)(ptr_); \
#define emith_insn_ptr() ((u8 *)tcache_ptr)
#define emith_flush() /**/
#define host_instructions_updated(base, end) __builtin___clear_cache(base, end)
+#define emith_update_cache() /**/
#define emith_jump_patch_size() 8
#define emith_rw_offs_max() 0xff
// FIFO for 2 instructions, for delay slot handling
u32 emith_last_insns[2] = { -1,-1 };
-int emith_last_idx;
+int emith_last_idx, emith_last_cnt;
#define EMIT_PUSHOP() \
do { \
emith_last_idx ^= 1; \
- if (emith_last_insns[emith_last_idx] != -1) \
- EMIT_PTR(tcache_ptr, emith_last_insns[emith_last_idx]);\
+ if (emith_last_insns[emith_last_idx] != -1) { \
+ u32 *p = (u32 *)tcache_ptr - emith_last_cnt; \
+ EMIT_PTR(p, emith_last_insns[emith_last_idx]);\
+ emith_last_cnt --; \
+ } \
emith_last_insns[emith_last_idx] = -1; \
} while (0)
#define EMIT(op) \
do { \
EMIT_PUSHOP(); \
+ tcache_ptr = (void *)((u32 *)tcache_ptr + 1); \
emith_last_insns[emith_last_idx] = op; \
+ emith_last_cnt ++; \
COUNT_OP; \
} while (0)
int i; for (i = 0; i < 2; i++) EMIT_PUSHOP(); \
} while (0)
-#define emith_insn_ptr() (u8 *)((u32 *)tcache_ptr + \
- (emith_last_insns[0] != -1) + (emith_last_insns[1] != -1))
+#define emith_insn_ptr() (u8 *)((u32 *)tcache_ptr - emith_last_cnt)
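For clarity, a short trace of the assumed invariant behind the new bookkeeping: tcache_ptr is advanced at EMIT() time, so it already accounts for the slots reserved for ops still held in the FIFO, and emith_last_cnt counts those pending slots.

    EMIT(op1);        /* slot reserved at tcache_ptr-1, op1 held:           cnt = 1 */
    EMIT(op2);        /* second slot reserved, op2 held:                    cnt = 2 */
    emith_flush();    /* both held ops written into their reserved slots:   cnt = 0 */

    /* (u32 *)tcache_ptr - emith_last_cnt is therefore always the address where
       the next instruction will actually land in memory, which is exactly what
       emith_insn_ptr() is meant to return. */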
// delay slot stuff
static int emith_is_j(u32 op) // J, JAL
}
if (bop) { // can swap
+ tcache_ptr = (void *)((u32 *)tcache_ptr - emith_last_cnt);
if (emith_last_insns[idx^1] != -1)
EMIT_PTR(tcache_ptr, emith_last_insns[idx^1]);
bp = tcache_ptr;
EMIT_PTR(tcache_ptr, bop); COUNT_OP;
EMIT_PTR(tcache_ptr, emith_last_insns[idx]);
emith_last_insns[0] = emith_last_insns[1] = -1;
+ emith_last_cnt = 0;
} else { // can't swap
emith_flush();
bp = tcache_ptr;
ptr = emith_branch(MIPS_BCONDZ(cond_m, cond_r, 0));
#define JMP_EMIT(cond, ptr) { \
- u32 val_ = emith_insn_ptr() - (u8 *)(ptr) - 4; \
+ u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr) - 4; \
EMIT_PTR(ptr, MIPS_BCONDZ(cond_m, cond_r, val_ & 0x0003ffff)); \
emith_flush(); /* NO delay slot handling across jump targets */ \
}
#define JMP_EMIT_NC(ptr) { \
- u32 val_ = emith_insn_ptr() - (u8 *)(ptr) - 4; \
+ u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr) - 4; \
EMIT_PTR(ptr, MIPS_B(val_ & 0x0003ffff)); \
emith_flush(); \
}
static void emith_lohi_nops(void)
{
u32 d;
- while ((d = emith_insn_ptr() - last_lohi) < 8 && d >= 0) EMIT(MIPS_NOP);
+ while ((d = (u8 *)tcache_ptr - last_lohi) < 8 && d >= 0) EMIT(MIPS_NOP);
}
#define emith_mul(d, s1, s2) do { \
emith_lohi_nops(); \
EMIT(MIPS_MULTU(s1, s2)); \
EMIT(MIPS_MFLO(d)); \
- last_lohi = emith_insn_ptr(); \
+ last_lohi = (u8 *)tcache_ptr; \
} while (0)
#define emith_mul_u64(dlo, dhi, s1, s2) do { \
EMIT(MIPS_MULTU(s1, s2)); \
EMIT(MIPS_MFLO(dlo)); \
EMIT(MIPS_MFHI(dhi)); \
- last_lohi = emith_insn_ptr(); \
+ last_lohi = (u8 *)tcache_ptr; \
} while (0)
#define emith_mul_s64(dlo, dhi, s1, s2) do { \
EMIT(MIPS_MULT(s1, s2)); \
EMIT(MIPS_MFLO(dlo)); \
EMIT(MIPS_MFHI(dhi)); \
- last_lohi = emith_insn_ptr(); \
+ last_lohi = (u8 *)tcache_ptr; \
} while (0)
#define emith_mula_s64(dlo, dhi, s1, s2) do { \
emith_add_r_r(dlo, AT); \
EMIT(MIPS_SLTU_REG(t_, dlo, AT)); \
EMIT(MIPS_MFHI(AT)); \
- last_lohi = emith_insn_ptr(); \
+ last_lohi = (u8 *)tcache_ptr; \
emith_add_r_r(dhi, AT); \
emith_add_r_r(dhi, t_); \
rcache_free_tmp(t_); \
// NB: MIPS conditional branches have only +/- 128KB range
#define emith_jump_cond(cond, target) do { \
int r_, mcond_ = emith_cond_check(cond, &r_); \
- u32 disp_ = (u8 *)target - emith_insn_ptr() - 4; \
+ u32 disp_ = (u8 *)target - (u8 *)tcache_ptr - 4; \
if (disp_ >= 0xfffe0000 || disp_ <= 0x0001ffff) { /* can use near B */ \
emith_branch(MIPS_BCONDZ(mcond_,r_,disp_ & 0x0003ffff)); \
} else { /* far branch if near branch isn't possible */ \
mcond_ = emith_invert_branch(mcond_); \
u8 *bp = emith_branch(MIPS_BCONDZ(mcond_, r_, 0)); \
emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)); \
- EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, emith_insn_ptr()-bp-4)); \
+ EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, (u8 *)tcache_ptr-bp-4)); \
} \
} while (0)
mcond_ = emith_invert_branch(mcond_); \
u8 *bp = emith_branch(MIPS_BCONDZ(mcond_, r_, 0));\
emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)); \
- EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, emith_insn_ptr()-bp-4)); \
+ EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, (u8 *)tcache_ptr-bp-4)); \
} while (0)
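+// bias-and-shift range test, same idea as the ARM variant above:
+// true iff (target - tcache_ptr + 0x10000) fits in 18 bits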
+#define emith_jump_cond_inrange(target) \
+ !(((u8 *)target - (u8 *)tcache_ptr + 0x10000) >> 18)
+
// NB: returns position of patch for cache maintenance
#define emith_jump_patch(ptr, target) ({ \
u32 *ptr_ = (u32 *)ptr-1; /* must skip condition check code */ \
#define emith_pool_commit(j) /**/
// NB: mips32r2 has SYNCI
#define host_instructions_updated(base, end) __builtin___clear_cache(base, end)
+#define emith_update_cache() /**/
#define emith_jump_patch_size() 4
#define emith_rw_offs_max() 0x7fff
ptr; \
})
+#define emith_jump_cond_inrange(ptr) !0
#define emith_jump_patch_size() 6
#define emith_jump_at(ptr, target) do { \
} while (0)
#define host_instructions_updated(base, end)
+#define emith_update_cache() /**/
#define emith_rw_offs_max() 0xffffffff
#define HOST_REGS 16
#define PTR_SCALE 3
-#define NA_TMP_REG xAX // non-arg tmp from reg_temp[]
#define EMIT_XREX_IF(w, r, rm, rs) do { \
int xr_ = (r) > 7 ? 1 : 0; \
#define HOST_REGS 8
#define PTR_SCALE 2
-#define NA_TMP_REG xBX // non-arg tmp from reg_temp[]
#define EMIT_REX_IF(w, r, rm) do { \
assert((u32)(r) < 8u); \
// mark memory for overwrite detection
dr_mark_memory(1, block, tcache_id, 0);
block->active = 1;
+ emith_update_cache();
return block->entryp[0].tcache_ptr;
}
tmp = rcache_get_tmp_arg(0);
sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL);
emith_cmp_r_imm(sr, 0);
- emith_move_r_imm_c(DCOND_LE, tmp, pc);
- emith_jump_cond(DCOND_LE, sh2_drc_exit);
+ if (emith_jump_cond_inrange(sh2_drc_exit)) {
+ emith_move_r_imm_c(DCOND_LE, tmp, pc);
+ emith_jump_cond(DCOND_LE, sh2_drc_exit);
+ } else {
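+ /* exit stub out of conditional-branch range: invert the condition and hop over an unconditional far jump */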
+ EMITH_JMP_START(DCOND_GT);
+ emith_move_r_imm(tmp, pc);
+ emith_jump(sh2_drc_exit);
+ EMITH_JMP_END(DCOND_GT);
+ }
rcache_free_tmp(tmp);
#if (DRC_DEBUG & 32)
}
}
rcache_set_usage_now(opd[0].source); // current insn
- rcache_set_usage_soon(late); // insns 1-3
+ rcache_set_usage_soon(soon); // insns 1-3
rcache_set_usage_late(late & ~soon); // insns 4-9
rcache_set_usage_discard(write & ~(late|soon) & ~opd[0].source);
fflush(stdout);
#endif
+ emith_update_cache();
return block_entry_ptr;
}
static void sh2_generate_utils(void)
{
int arg0, arg1, arg2, arg3, sr, tmp, tmp2;
+#if DRC_DEBUG
+ int hic = host_insn_count; // don't count utils for insn statistics
+#endif
host_arg2reg(arg0, 0);
host_arg2reg(arg1, 1);
host_dasm_new_symbol(sh2_drc_read16_poll);
host_dasm_new_symbol(sh2_drc_read32_poll);
#endif
+
+#if DRC_DEBUG
+ host_insn_count = hic;
+#endif
}
static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nolit, int free)
bd->addr = bd->size = bd->addr_lit = bd->size_lit = 0;
bd->entry_count = 0;
}
+ emith_update_cache();
}
static void sh2_smc_rm_blocks(u32 a, int tcache_id, u32 shift)
tcache_ptr = tcache;
sh2_generate_utils();
host_instructions_updated(tcache, tcache_ptr);
+ emith_update_cache();
tcache_bases[0] = tcache_ptrs[0] = tcache_ptr;
tcache_limit[0] = tcache_bases[0] + tcache_sizes[0] - (tcache_ptr-tcache);
#if defined(DRC_SH2)
// direct access to some host CPU registers used by the DRC
-// XXX MUST match definitions in cpu/sh2/compiler.c
+// XXX MUST match definitions for SHR_SR in cpu/sh2/compiler.c
#if defined(__arm__)
-#define DRC_SR_REG r10
+#define DRC_SR_REG "r10"
#elif defined(__aarch64__)
-#define DRC_SR_REG r22
+#define DRC_SR_REG "r22"
#elif defined(__mips__)
-#define DRC_SR_REG s6
+#define DRC_SR_REG "s6"
#elif defined(__i386__)
-#define DRC_SR_REG edi
+#define DRC_SR_REG "edi"
#elif defined(__x86_64__)
-#define DRC_SR_REG ebx
+#define DRC_SR_REG "ebx"
#else
#warning "direct DRC register access not available for this host"
#endif
#endif
#ifdef DRC_SR_REG
-#define __DRC_DECLARE_SR(SR) register int sh2_sr asm(#SR)
-#define _DRC_DECLARE_SR(SR) __DRC_DECLARE_SR(SR)
-#define DRC_DECLARE_SR _DRC_DECLARE_SR(DRC_SR_REG)
+#define DRC_DECLARE_SR register int sh2_sr asm(DRC_SR_REG)
#define DRC_SAVE_SR(sh2) \
if ((sh2->state & (SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN) \
sh2->sr = sh2_sr;
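A minimal sketch (editor's illustration) of how the register-bound SR copy is meant to be used from C code invoked while translated code is running. The helper name is hypothetical, SH2 is the emulator's context struct these macros dereference, and a matching restore macro (the inverse of DRC_SAVE_SR) is assumed to exist but is not part of this excerpt:

    static void handle_sh2_event(SH2 *sh2)   /* hypothetical helper called from DRC code */
    {
        DRC_DECLARE_SR;     /* binds sh2_sr to the host register chosen above */

        DRC_SAVE_SR(sh2);   /* host register -> sh2->sr, so C code sees the live SR */
        /* ... work that may read or modify sh2->sr ... */
        /* a matching restore (sh2->sr -> host register) would go here before
           returning to translated code */
    }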
if (!(ssh2.state & SH2_IDLE_STATES)) {
cycles = target - ssh2.m68krcycles_done;
if (cycles > 0) {
- run_sh2(&ssh2, cycles > 20 ? cycles : 20);
+ run_sh2(&ssh2, cycles > 20U ? cycles : 20U);
if (event_time_next && CYCLES_GT(target, event_time_next))
target = event_time_next;
if (!(msh2.state & SH2_IDLE_STATES)) {
cycles = target - msh2.m68krcycles_done;
if (cycles > 0) {
- run_sh2(&msh2, cycles > 20 ? cycles : 20);
+ run_sh2(&msh2, cycles > 20U ? cycles : 20U);
if (event_time_next && CYCLES_GT(target, event_time_next))
target = event_time_next;