// divide operation replacement functions, called by compiled code. Only the
// 32:16 cases and the 64:32 cases described in the SH2 prog man are replaced.
-static uint32_t REGPARM(2) sh2_drc_divu32(uint32_t dv, uint32_t ds)
-{
- if (ds && ds >= dv) {
- // good case: no divide by 0, and no result overflow
+// This is surprisingly difficult since the SH2 division operation generates
+// the result in the dividend during the operation, leaving some remainder-like
+// stuff in the bits unused for the result, and leaving the T and Q status bits
+// in a state depending on the operands and the result. Q always reflects the
+// last result bit generated (i.e. bit 0 of the result). For T:
+//   32:16  T = top bit of the 16 bit remainder-like
+//   64:32  T = resulting T of the DIV0U/S operation
+// The remainder-like value depends on the outcome of the last generated result bit.
+
+static uint32_t REGPARM(3) sh2_drc_divu32(uint32_t dv, uint32_t *dt, uint32_t ds)
+{
+ if (ds > dv && (uint16_t)ds == 0) {
+ // good case: no overflow, divisor not 0, lower 16 bits 0
uint32_t quot = dv / (ds>>16), rem = dv - (quot * (ds>>16));
if (~quot&1) rem -= ds>>16;
+ *dt = (rem>>15) & 1;
return (uint16_t)quot | ((2*rem + (quot>>31)) << 16);
} else {
// bad case: use the sh2 algo to get the right result
dv = (dv<<1) | t;
t = v;
v = dv;
- if (q) dv += ds, q = dv < v;
- else dv -= ds, q = !(dv < v);
+ if (q) dv += ds, q = dv < v;
+ else dv -= ds, q = dv > v;
q ^= t, t = !q;
}
+ *dt = dv>>31;
return (dv<<1) | t;
}
}
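
As a host-side sanity check, the new fast path can be compared against the step-by-step fallback above. A minimal sketch; the fallback's loop header is elided from this hunk, so the q = t = 0 start (DIV0U clears M/Q/T), the 16-step count for the 32:16 case, and all harness names are my assumptions:

#include <stdint.h>
#include <stdio.h>

// step-by-step reference assembled from the patch's "bad case" loop body
static uint32_t divu32_ref(uint32_t dv, uint32_t *dt, uint32_t ds)
{
    int q = 0, t = 0, s = 16;           // assumed init and step count
    while (s--) {
        uint32_t v = dv >> 31;
        dv = (dv << 1) | t;
        t = v;
        v = dv;
        if (q) dv += ds, q = dv < v;
        else   dv -= ds, q = dv > v;
        q ^= t, t = !q;
    }
    *dt = dv >> 31;
    return (dv << 1) | t;
}

// good-case math copied verbatim from sh2_drc_divu32
static uint32_t divu32_fast(uint32_t dv, uint32_t *dt, uint32_t ds)
{
    uint32_t quot = dv / (ds >> 16), rem = dv - (quot * (ds >> 16));
    if (~quot & 1) rem -= ds >> 16;
    *dt = (rem >> 15) & 1;
    return (uint16_t)quot | ((2 * rem + (quot >> 31)) << 16);
}

int main(void)
{
    uint32_t dv = 100000, ds = 7u << 16; // satisfies ds > dv, low 16 bits 0
    uint32_t t1, t2;
    printf("fast %08x T=%u\n", divu32_fast(dv, &t1, ds), t1);
    printf("ref  %08x T=%u\n", divu32_ref(dv, &t2, ds), t2);
    return 0;                            // both lines should agree
}
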
static uint32_t REGPARM(3) sh2_drc_divu64(uint32_t dh, uint32_t *dl, uint32_t ds)
{
- if (ds > 1 && ds >= dh) {
- // good case: no divide by 0, and no result overflow
+ if (ds > dh) {
+ // good case: no overflow, divisor not 0
uint64_t dv = *dl | ((uint64_t)dh << 32);
- uint32_t quot = dv / ds, rem = dv - (quot * ds);
+ uint32_t quot = dv / ds, rem = dv - ((uint64_t)quot * ds);
if (~quot&1) rem -= ds;
*dl = quot;
return rem;
dv = (dv<<1) | t;
t = v;
v = dv;
- if (q) dv += ((uint64_t)ds << 32), q = dv < v;
- else dv -= ((uint64_t)ds << 32), q = !(dv < v);
+ if (q) dv += ((uint64_t)ds << 32), q = dv < v;
+ else dv -= ((uint64_t)ds << 32), q = dv > v;
q ^= t, t = !q;
}
*dl = (dv<<1) | t;
}
}
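
The relaxed guard is sufficient on its own: ds > dh forces ds >= 1 (no division by zero) and bounds the quotient, since dv = dh*2^32 + dl <= (ds-1)*2^32 + (2^32 - 1) < ds*2^32, so dv/ds < 2^32. A standalone spot-check of that bound (not part of the patch):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t v[] = { 0, 1, 2, 0x7fffffff, 0xfffffffe, 0xffffffff };
    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            for (int k = 0; k < 6; k++) {
                uint32_t dh = v[i], dl = v[j], ds = v[k];
                if (ds <= dh)
                    continue;  // bad case, left to the step loop
                uint64_t dv = dl | ((uint64_t)dh << 32);
                assert(dv / ds <= 0xffffffffull); // quotient fits in 32 bits
            }
    return 0;
}
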
-static uint32_t REGPARM(2) sh2_drc_divs32(int32_t dv, int32_t ds)
+static uint32_t REGPARM(3) sh2_drc_divs32(int32_t dv, uint32_t *dt, int32_t ds)
{
uint32_t adv = abs(dv), ads = abs(ds)>>16;
- if (ads > 1 && ads > adv>>16 && (int32_t)ads > 0 && !(uint16_t)ds) {
- // good case: no divide by 0, and no result overflow
+ if (ads > adv>>16 && ds != 0x80000000 && (int16_t)ds == 0) {
+ // good case: no overflow, divisor not 0 and not MIN_INT, lower 16 bits 0
uint32_t quot = adv / ads, rem = adv - (quot * ads);
int m1 = (rem ? dv^ds : ds) < 0;
if (rem && dv < 0) rem = (quot&1 ? -rem : +ads-rem);
else rem = (quot&1 ? +rem : -ads+rem);
quot = ((dv^ds)<0 ? -quot : +quot) - m1;
+ *dt = (rem>>15) & 1;
return (uint16_t)quot | ((2*rem + (quot>>31)) << 16);
} else {
// bad case: use the sh2 algo to get the right result
dv = (dv<<1) | t;
t = v;
v = dv;
- if (m^q) dv += ds, q = (uint32_t)dv < v;
- else dv -= ds, q = !((uint32_t)dv < v);
+ if (m^q) dv += ds, q = (uint32_t)dv < v;
+ else dv -= ds, q = (uint32_t)dv > v;
q ^= m^t, t = !(m^q);
}
+ *dt = (uint32_t)dv>>31;
return (dv<<1) | t;
}
}
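
The m1 term is what reconciles C's truncating division with the hardware loop's convention: with a non-zero remainder and differing operand signs, the step-by-step result is the floor-style quotient, one below the truncated one (the rem == 0, ds < 0 case picks up the extra -1 as coded above). A standalone illustration of that trunc-vs-floor identity, with values of my choosing:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int32_t dv = -100, ds = 7;
    int32_t tq = dv / ds;     // C truncates toward zero: -14
    // signs differ and the remainder is non-zero, so the floor-style
    // quotient is one less -- exactly what subtracting m1 does
    int m1 = ((dv ^ ds) < 0) && (dv % ds != 0);
    printf("trunc=%d floor-style=%d\n", tq, tq - m1); // -14 vs -15
    return 0;
}
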
static uint32_t REGPARM(3) sh2_drc_divs64(int32_t dh, uint32_t *dl, int32_t ds)
{
 int64_t _dv = *dl | ((int64_t)dh << 32);
uint64_t adv = (_dv < 0 ? -_dv : _dv); // llabs isn't in older toolchains
uint32_t ads = abs(ds);
- if (ads > 1 && ads > adv>>32 && (int64_t)adv > 0) {
- // good case: no divide by 0, and no result overflow
+ if (ads > adv>>32 && ds != 0x80000000) {
+ // good case: no overflow, divisor not 0 and not MIN_INT
uint32_t quot = adv / ads, rem = adv - ((uint64_t)quot * ads);
int m1 = (rem ? dh^ds : ds) < 0;
if (rem && dh < 0) rem = (quot&1 ? -rem : +ads-rem);
uint64_t dv = *dl | ((uint64_t)dh << 32);
int m = (uint32_t)ds>>31, q = (uint64_t)dv>>63, t = m^q, s = 32;
while (s--) {
- int64_t v = (uint64_t)dv>>63;
+ uint64_t v = (uint64_t)dv>>63;
dv = (dv<<1) | t;
t = v;
v = dv;
- if (m^q) dv += ((uint64_t)ds << 32), q = dv < v;
- else dv -= ((uint64_t)ds << 32), q = !(dv < v);
+ if (m^q) dv += ((uint64_t)ds << 32), q = dv < v;
+ else dv -= ((uint64_t)ds << 32), q = dv > v;
q ^= m^t, t = !(m^q);
}
*dl = (dv<<1) | t;
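
The loop's starting state above mirrors DIV0S: M is the divisor's sign bit, Q the dividend's, and T = M^Q, which is also what the signed emitter block below rebuilds in SR before calling the helper. A small host-C sketch of that flag setup; the SR bit positions (M = bit 9, Q = bit 8, T = bit 0) are the architectural SH-2 ones, not taken from this patch:

#include <stdint.h>

// DIV0S-style flag init, matching "m = ds>>31, q = dv>>63, t = m^q" above
static uint32_t div0s_flags(int32_t rn, int32_t rm)
{
    uint32_t q = (uint32_t)rn >> 31;  // dividend sign
    uint32_t m = (uint32_t)rm >> 31;  // divisor sign
    uint32_t t = m ^ q;
    return (m << 9) | (q << 8) | t;   // SR: M=bit 9, Q=bit 8, T=bit 0
}
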
#if DIV_OPTIMIZER
if (div(opd).div1 == 16 && div(opd).ro == div(opd).rn) {
// divide 32/16
+ tmp = rcache_get_tmp_arg(1);
+ emith_add_r_r_ptr_imm(tmp, CONTEXT_REG, offsetof(SH2, drc_tmp));
rcache_get_reg_arg(0, div(opd).rn, NULL);
- rcache_get_reg_arg(1, div(opd).rm, NULL);
+ rcache_get_reg_arg(2, div(opd).rm, NULL);
rcache_invalidate_tmp();
emith_abicall(sh2_drc_divu32);
tmp = rcache_get_tmp_ret();
emith_and_r_r_imm(tmp3, tmp2, 1); // Q = !Rn[0]
emith_eor_r_r_imm(tmp3, tmp3, 1);
emith_or_r_r_lsl(sr, tmp3, Q_SHIFT);
+ emith_ctx_read(tmp3, offsetof(SH2, drc_tmp));
+ emith_or_r_r_r(sr, sr, tmp3); // T
rcache_free_tmp(tmp3);
- emith_or_r_r_r_lsr(sr, sr, tmp2, 31); // T = Rn[31]
skip_op = div(opd).div1 + div(opd).rotcl;
}
else if (div(opd).div1 == 32 && div(opd).ro != div(opd).rn) {
emith_eor_r_r_imm(tmp3, tmp3, 1);
emith_or_r_r_lsl(sr, tmp3, Q_SHIFT);
rcache_free_tmp(tmp3);
- emith_or_r_r_r_lsr(sr, sr, tmp4, 31); // T = Ro[31]
skip_op = div(opd).div1 + div(opd).rotcl;
}
#endif
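
In both unsigned paths the Q reconstruction leans on an invariant of the helpers: each step ends with t = !q, so bit 0 of the packed return value is !Q, and the and/eor pair above simply inverts it back before shifting into SR. A host-C paraphrase of that emitted sequence (not emitter code; the bit-8 Q position is the architectural one, assumed to match Q_SHIFT):

#include <stdint.h>

// what the emith and/eor/or triple computes for the 32/16 unsigned case
static uint32_t set_q_from_result(uint32_t sr, uint32_t rn)
{
    uint32_t q = (rn & 1) ^ 1;  // Q = !Rn[0]
    return sr | (q << 8);       // OR into SR at the Q position
}
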
#if DIV_OPTIMIZER
if (div(opd).div1 == 16 && div(opd).ro == div(opd).rn) {
// divide 32/16
+ tmp = rcache_get_tmp_arg(1);
+ emith_add_r_r_ptr_imm(tmp, CONTEXT_REG, offsetof(SH2, drc_tmp));
rcache_get_reg_arg(0, div(opd).rn, NULL);
- tmp2 = rcache_get_reg_arg(1, div(opd).rm, NULL);
+ tmp2 = rcache_get_reg_arg(2, div(opd).rm, NULL);
tmp3 = rcache_get_tmp();
emith_lsr(tmp3, tmp2, 31);
emith_or_r_r_lsl(sr, tmp3, M_SHIFT); // M = Rm[31]
emith_and_r_r_imm(tmp3, tmp3, 1);
emith_eor_r_r_imm(tmp3, tmp3, 1);
emith_or_r_r_lsl(sr, tmp3, Q_SHIFT); // Q = !Rn[0]^M
+ emith_ctx_read(tmp3, offsetof(SH2, drc_tmp));
+ emith_or_r_r_r(sr, sr, tmp3); // T
rcache_free_tmp(tmp3);
- emith_or_r_r_r_lsr(sr, sr, tmp2, 31); // T = Rn[31]
skip_op = div(opd).div1 + div(opd).rotcl;
}
else if (div(opd).div1 == 32 && div(opd).ro != div(opd).rn) {
// divide 64/32
tmp4 = rcache_get_reg(div(opd).ro, RC_GR_READ, NULL);
emith_ctx_write(tmp4, offsetof(SH2, drc_tmp));
- rcache_get_reg_arg(0, div(opd).rn, NULL);
+ tmp = rcache_get_reg_arg(0, div(opd).rn, NULL);
tmp2 = rcache_get_reg_arg(2, div(opd).rm, NULL);
tmp3 = rcache_get_tmp_arg(1);
emith_lsr(tmp3, tmp2, 31);
- emith_or_r_r_lsl(sr, tmp3, M_SHIFT); // M = Rm[31]
+ emith_or_r_r_lsl(sr, tmp3, M_SHIFT); // M = Rm[31]
+ emith_eor_r_r_lsr(tmp3, tmp, 31);
+ emith_or_r_r(sr, tmp3); // T = Rn[31]^M
emith_add_r_r_ptr_imm(tmp3, CONTEXT_REG, offsetof(SH2, drc_tmp));
rcache_invalidate_tmp();
emith_abicall(sh2_drc_divs64);
emith_eor_r_r_imm(tmp3, tmp3, 1);
emith_or_r_r_lsl(sr, tmp3, Q_SHIFT); // Q = !Ro[0]^M
rcache_free_tmp(tmp3);
- emith_or_r_r_r_lsr(sr, sr, tmp4, 31); // T = Ro[31]
skip_op = div(opd).div1 + div(opd).rotcl;
} else
#endif
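
The signed paths fold M into the same trick: the helper ends each step with t = !(M^Q), so bit 0 of the result holds !(Q^M) and Q comes back as !bit0 ^ M, matching the "Q = !Ro[0]^M" comment above. A host-C paraphrase under the same assumed SR layout:

#include <stdint.h>

// recover Q from a signed helper result; bit 0 of ro holds !(M^Q)
static uint32_t set_q_signed(uint32_t sr, uint32_t ro, uint32_t m)
{
    uint32_t q = ((ro & 1) ^ 1) ^ (m & 1); // Q = !Ro[0] ^ M
    return sr | (q << 8);                  // OR into SR at the Q position
}
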