From: kub Date: Wed, 7 Apr 2021 20:24:03 +0000 (+0200) Subject: sh2 drc, fix optimized standard division X-Git-Tag: v2.00~561 X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5f9795186544582b31482e59702cf72e05518cf4;p=picodrive.git sh2 drc, fix optimized standard division still turned off, needs testing and performance checks --- diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 586175de..8085bfcc 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -3014,12 +3014,21 @@ static void emit_do_static_regs(int is_write, int tmpr) // divide operation replacement functions, called by compiled code. Only the // 32:16 cases and the 64:32 cases described in the SH2 prog man are replaced. -static uint32_t REGPARM(2) sh2_drc_divu32(uint32_t dv, uint32_t ds) -{ - if (ds && ds >= dv) { - // good case: no divide by 0, and no result overflow +// This is surprisingly difficult since the SH2 division operation is generating// the result in the dividend during the operation, leaving some remainder-like +// stuff in the bits unused for the result, and leaving the T and Q status bits +// in a state depending on the operands and the result. Q always reflects the +// last result bit generated (i.e. bit 0 of the result). For T: +// 32:16 T = top bit of the 16 bit remainder-like +// 64:32 T = resulting T of the DIV0U/S operation +// The remainder-like depends on outcome of the last generated result bit. + +static uint32_t REGPARM(3) sh2_drc_divu32(uint32_t dv, uint32_t *dt, uint32_t ds) +{ + if (ds > dv && (uint16_t)ds == 0) { + // good case: no overflow, divisor not 0, lower 16 bits 0 uint32_t quot = dv / (ds>>16), rem = dv - (quot * (ds>>16)); if (~quot&1) rem -= ds>>16; + *dt = (rem>>15) & 1; return (uint16_t)quot | ((2*rem + (quot>>31)) << 16); } else { // bad case: use the sh2 algo to get the right result @@ -3029,20 +3038,21 @@ static uint32_t REGPARM(2) sh2_drc_divu32(uint32_t dv, uint32_t ds) dv = (dv<<1) | t; t = v; v = dv; - if (q) dv += ds, q = dv < v; - else dv -= ds, q = !(dv < v); + if (q) dv += ds, q = dv < v; + else dv -= ds, q = dv > v; q ^= t, t = !q; } + *dt = dv>>31; return (dv<<1) | t; } } static uint32_t REGPARM(3) sh2_drc_divu64(uint32_t dh, uint32_t *dl, uint32_t ds) { - if (ds > 1 && ds >= dh) { - // good case: no divide by 0, and no result overflow + if (ds > dh) { + // good case: no overflow, divisor not 0 uint64_t dv = *dl | ((uint64_t)dh << 32); - uint32_t quot = dv / ds, rem = dv - (quot * ds); + uint32_t quot = dv / ds, rem = dv - ((uint64_t)quot * ds); if (~quot&1) rem -= ds; *dl = quot; return rem; @@ -3055,8 +3065,8 @@ static uint32_t REGPARM(3) sh2_drc_divu64(uint32_t dh, uint32_t *dl, uint32_t ds dv = (dv<<1) | t; t = v; v = dv; - if (q) dv += ((uint64_t)ds << 32), q = dv < v; - else dv -= ((uint64_t)ds << 32), q = !(dv < v); + if (q) dv += ((uint64_t)ds << 32), q = dv < v; + else dv -= ((uint64_t)ds << 32), q = dv > v; q ^= t, t = !q; } *dl = (dv<<1) | t; @@ -3064,16 +3074,17 @@ static uint32_t REGPARM(3) sh2_drc_divu64(uint32_t dh, uint32_t *dl, uint32_t ds } } -static uint32_t REGPARM(2) sh2_drc_divs32(int32_t dv, int32_t ds) +static uint32_t REGPARM(3) sh2_drc_divs32(int32_t dv, uint32_t *dt, int32_t ds) { uint32_t adv = abs(dv), ads = abs(ds)>>16; - if (ads > 1 && ads > adv>>16 && (int32_t)ads > 0 && !(uint16_t)ds) { - // good case: no divide by 0, and no result overflow + if (ads > adv>>16 && ds != 0x80000000 && (int16_t)ds == 0) { + // good case: no overflow, divisor not 0 and not MIN_INT, lower 16 bits 0 uint32_t quot = adv / ads, rem = adv - (quot * ads); int m1 = (rem ? dv^ds : ds) < 0; if (rem && dv < 0) rem = (quot&1 ? -rem : +ads-rem); else rem = (quot&1 ? +rem : -ads+rem); quot = ((dv^ds)<0 ? -quot : +quot) - m1; + *dt = (rem>>15) & 1; return (uint16_t)quot | ((2*rem + (quot>>31)) << 16); } else { // bad case: use the sh2 algo to get the right result @@ -3083,10 +3094,11 @@ static uint32_t REGPARM(2) sh2_drc_divs32(int32_t dv, int32_t ds) dv = (dv<<1) | t; t = v; v = dv; - if (m^q) dv += ds, q = (uint32_t)dv < v; - else dv -= ds, q = !((uint32_t)dv < v); + if (m^q) dv += ds, q = (uint32_t)dv < v; + else dv -= ds, q = (uint32_t)dv > v; q ^= m^t, t = !(m^q); } + *dt = (uint32_t)dv>>31; return (dv<<1) | t; } } @@ -3096,8 +3108,8 @@ static uint32_t REGPARM(3) sh2_drc_divs64(int32_t dh, uint32_t *dl, int32_t ds) int64_t _dv = *dl | ((int64_t)dh << 32); uint64_t adv = (_dv < 0 ? -_dv : _dv); // llabs isn't in older toolchains uint32_t ads = abs(ds); - if (ads > 1 && ads > adv>>32 && (int64_t)adv > 0) { - // good case: no divide by 0, and no result overflow + if (ads > adv>>32 && ds != 0x80000000) { + // good case: no overflow, divisor not 0 and not MIN_INT uint32_t quot = adv / ads, rem = adv - ((uint64_t)quot * ads); int m1 = (rem ? dh^ds : ds) < 0; if (rem && dh < 0) rem = (quot&1 ? -rem : +ads-rem); @@ -3110,12 +3122,12 @@ static uint32_t REGPARM(3) sh2_drc_divs64(int32_t dh, uint32_t *dl, int32_t ds) uint64_t dv = *dl | ((uint64_t)dh << 32); int m = (uint32_t)ds>>31, q = (uint64_t)dv>>63, t = m^q, s = 32; while (s--) { - int64_t v = (uint64_t)dv>>63; + uint64_t v = (uint64_t)dv>>63; dv = (dv<<1) | t; t = v; v = dv; - if (m^q) dv += ((uint64_t)ds << 32), q = dv < v; - else dv -= ((uint64_t)ds << 32), q = !(dv < v); + if (m^q) dv += ((uint64_t)ds << 32), q = dv < v; + else dv -= ((uint64_t)ds << 32), q = dv > v; q ^= m^t, t = !(m^q); } *dl = (dv<<1) | t; @@ -3921,8 +3933,10 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #if DIV_OPTIMIZER if (div(opd).div1 == 16 && div(opd).ro == div(opd).rn) { // divide 32/16 + tmp = rcache_get_tmp_arg(1); + emith_add_r_r_ptr_imm(tmp, CONTEXT_REG, offsetof(SH2, drc_tmp)); rcache_get_reg_arg(0, div(opd).rn, NULL); - rcache_get_reg_arg(1, div(opd).rm, NULL); + rcache_get_reg_arg(2, div(opd).rm, NULL); rcache_invalidate_tmp(); emith_abicall(sh2_drc_divu32); tmp = rcache_get_tmp_ret(); @@ -3934,8 +3948,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_and_r_r_imm(tmp3, tmp2, 1); // Q = !Rn[0] emith_eor_r_r_imm(tmp3, tmp3, 1); emith_or_r_r_lsl(sr, tmp3, Q_SHIFT); + emith_ctx_read(tmp3, offsetof(SH2, drc_tmp)); + emith_or_r_r_r(sr, sr, tmp3); // T rcache_free_tmp(tmp3); - emith_or_r_r_r_lsr(sr, sr, tmp2, 31); // T = Rn[31] skip_op = div(opd).div1 + div(opd).rotcl; } else if (div(opd).div1 == 32 && div(opd).ro != div(opd).rn) { @@ -3960,7 +3975,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_eor_r_r_imm(tmp3, tmp3, 1); emith_or_r_r_lsl(sr, tmp3, Q_SHIFT); rcache_free_tmp(tmp3); - emith_or_r_r_r_lsr(sr, sr, tmp4, 31); // T = Ro[31] skip_op = div(opd).div1 + div(opd).rotcl; } #endif @@ -4035,8 +4049,10 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #if DIV_OPTIMIZER if (div(opd).div1 == 16 && div(opd).ro == div(opd).rn) { // divide 32/16 + tmp = rcache_get_tmp_arg(1); + emith_add_r_r_ptr_imm(tmp, CONTEXT_REG, offsetof(SH2, drc_tmp)); rcache_get_reg_arg(0, div(opd).rn, NULL); - tmp2 = rcache_get_reg_arg(1, div(opd).rm, NULL); + tmp2 = rcache_get_reg_arg(2, div(opd).rm, NULL); tmp3 = rcache_get_tmp(); emith_lsr(tmp3, tmp2, 31); emith_or_r_r_lsl(sr, tmp3, M_SHIFT); // M = Rm[31] @@ -4052,19 +4068,22 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_and_r_r_imm(tmp3, tmp3, 1); emith_eor_r_r_imm(tmp3, tmp3, 1); emith_or_r_r_lsl(sr, tmp3, Q_SHIFT); // Q = !Rn[0]^M + emith_ctx_read(tmp3, offsetof(SH2, drc_tmp)); + emith_or_r_r_r(sr, sr, tmp3); // T rcache_free_tmp(tmp3); - emith_or_r_r_r_lsr(sr, sr, tmp2, 31); // T = Rn[31] skip_op = div(opd).div1 + div(opd).rotcl; } else if (div(opd).div1 == 32 && div(opd).ro != div(opd).rn) { // divide 64/32 tmp4 = rcache_get_reg(div(opd).ro, RC_GR_READ, NULL); emith_ctx_write(tmp4, offsetof(SH2, drc_tmp)); - rcache_get_reg_arg(0, div(opd).rn, NULL); + tmp = rcache_get_reg_arg(0, div(opd).rn, NULL); tmp2 = rcache_get_reg_arg(2, div(opd).rm, NULL); tmp3 = rcache_get_tmp_arg(1); emith_lsr(tmp3, tmp2, 31); - emith_or_r_r_lsl(sr, tmp3, M_SHIFT); // M = Rm[31] + emith_or_r_r_lsl(sr, tmp3, M_SHIFT); // M = Rm[31] + emith_eor_r_r_lsr(tmp3, tmp, 31); + emith_or_r_r(sr, tmp3); // T = Rn[31]^M emith_add_r_r_ptr_imm(tmp3, CONTEXT_REG, offsetof(SH2, drc_tmp)); rcache_invalidate_tmp(); emith_abicall(sh2_drc_divs64); @@ -4081,7 +4100,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_eor_r_r_imm(tmp3, tmp3, 1); emith_or_r_r_lsl(sr, tmp3, Q_SHIFT); // Q = !Ro[0]^M rcache_free_tmp(tmp3); - emith_or_r_r_r_lsr(sr, sr, tmp4, 31); // T = Ro[31] skip_op = div(opd).div1 + div(opd).rotcl; } else #endif