release r24
[pcsx_rearmed.git] / deps / lightning / lib / jit_mips-cpu.c
index 0b1b3b4..e8b1aa5 100644 (file)
  */
 
 #if PROTO
+/* FIXME Should need qemu 7.2 -- apparently broken with qemu 7.0 */
+#define PCREL_BROKEN           1
+#define BALC_BROKEN            1
+
 typedef union {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
     struct {   jit_uint32_t _:26;      jit_uint32_t b :  6; } hc;
     struct {   jit_uint32_t _:21;      jit_uint32_t b :  5; } rs;
     struct {   jit_uint32_t _:21;      jit_uint32_t b :  5; } fm;
+    struct {   jit_uint32_t _:21;      jit_uint32_t b :  5; } fr;
+    struct {   jit_uint32_t _:18;      jit_uint32_t b :  3; } pD;
+    struct {   jit_uint32_t _:19;      jit_uint32_t b :  2; } pW;
     struct {   jit_uint32_t _:16;      jit_uint32_t b :  5; } rt;
     struct {   jit_uint32_t _:16;      jit_uint32_t b :  5; } ft;
     struct {   jit_uint32_t _:11;      jit_uint32_t b :  5; } rd;
@@ -34,11 +41,16 @@ typedef union {
     struct {                           jit_uint32_t b :  5; } cn;
     struct {                           jit_uint32_t b : 11; } cc;
     struct {                           jit_uint32_t b : 16; } is;
+    struct {                           jit_uint32_t b : 18; } iD;
+    struct {                           jit_uint32_t b : 19; } iW;
     struct {                           jit_uint32_t b : 26; } ii;
 #else
     struct {                           jit_uint32_t b :  6; } hc;
     struct {   jit_uint32_t _: 6;      jit_uint32_t b :  5; } rs;
     struct {   jit_uint32_t _: 6;      jit_uint32_t b :  5; } fm;
+    struct {   jit_uint32_t _: 6;      jit_uint32_t b :  5; } fr;
+    struct {   jit_uint32_t _:11;      jit_uint32_t b :  3; } pD;
+    struct {   jit_uint32_t _:11;      jit_uint32_t b :  2; } pW;
     struct {   jit_uint32_t _:11;      jit_uint32_t b :  5; } rt;
     struct {   jit_uint32_t _:11;      jit_uint32_t b :  5; } ft;
     struct {   jit_uint32_t _:16;      jit_uint32_t b :  5; } rd;
@@ -50,12 +62,15 @@ typedef union {
     struct {   jit_uint32_t _:27;      jit_uint32_t b :  5; } cn;
     struct {   jit_uint32_t _:21;      jit_uint32_t b : 11; } cc;
     struct {   jit_uint32_t _:16;      jit_uint32_t b : 16; } is;
+    struct {   jit_uint32_t _:14;      jit_uint32_t b : 18; } iD;
+    struct {   jit_uint32_t _:13;      jit_uint32_t b : 19; } iW;
     struct {   jit_uint32_t _: 6;      jit_uint32_t b : 26; } ii;
 #endif
     int                                        op;
 } jit_instr_t;
 #define jit_mips2_p()                  (jit_cpu.release >= 2)
 #define jit_mips6_p()                  (jit_cpu.release >= 6)
+#define jit_unaligned_p()              (jit_cpu.unaligned)
 #  define _ZERO_REGNO                  0
 #  define _T0_REGNO                    0x08
 #  define _T1_REGNO                    0x09
@@ -89,20 +104,31 @@ typedef union {
 #  if __WORDSIZE == 32
 #    define ldr(u,v)                   ldr_i(u,v)
 #    define ldi(u,v)                   ldi_i(u,v)
+#    define ldxr(u,v,w)                        ldxr_i(u,v,w)
 #    define ldxi(u,v,w)                        ldxi_i(u,v,w)
+#    define str(u,v)                   str_i(u,v)
 #    define sti(u,v)                   sti_i(u,v)
+#    define stxr(u,v,w)                        stxr_i(u,v,w)
 #    define stxi(u,v,w)                        stxi_i(u,v,w)
 #  else
 #    define ldr(u,v)                   ldr_l(u,v)
 #    define ldi(u,v)                   ldi_l(u,v)
+#    define ldxr(u,v,w)                        ldxr_l(u,v,w)
 #    define ldxi(u,v,w)                        ldxi_l(u,v,w)
+#    define str(u,v)                   str_l(u,v)
 #    define sti(u,v)                   sti_l(u,v)
+#    define stxr(u,v,w)                        stxr_l(u,v,w)
 #    define stxi(u,v,w)                        stxi_l(u,v,w)
 #  endif
 /* can_relative_jump_p(im) => can_sign_extend_short_p(im << 2) */
 #  define can_relative_jump_p(im)      ((im) >= -130712 && (im) <= 131068)
-#  define can_sign_extend_short_p(im)  ((im) >= -32678 && (im) <= 32767)
+/* can_compact_jump_p(im) => can_sign_extend_i26_p(im << 2) */
+#  define can_compact_jump_p(im)       ((im) >= -268435456 && (im) <= 268435452)
+#  define can_sign_extend_short_p(im)  ((im) >= -32768 && (im) <= 32767)
 #  define can_zero_extend_short_p(im)  ((im) >= 0 && (im) <= 65535)
+#  define can_sign_extend_i18_p(im)    ((im) >= -262144 && (im) <= 262143)
+#  define can_sign_extend_i19_p(im)    ((im) >= -524288 && (im) <= 524287)
+#  define can_sign_extend_i26_p(im)    ((im) >= -67108864 && (im) <= 67108863)
 #  define is_low_mask(im)              (((im) & 1) ? (__builtin_popcountl((im) + 1) <= 1) : 0)
 #  define is_middle_mask(im)           ((im) ? (__builtin_popcountl((im) + (1 << __builtin_ctzl(im))) <= 1) : 0)
 #  define is_high_mask(im)             ((im) ? (__builtin_popcountl((im) + (1 << __builtin_ctzl(im))) == 0) : 0)
@@ -149,6 +175,7 @@ typedef union {
 #  define MIPS_LDR                     0x1b
 #  define MIPS_SPECIAL2                        0x1c
 #  define MIPS_JALX                    0x1d
+#  define MIPS_DAUI                    0x1d
 #  define MIPS_SPECIAL3                        0x1f
 #  define MIPS_LB                      0x20
 #  define MIPS_LH                      0x21
@@ -162,6 +189,8 @@ typedef union {
 #  define MIPS_SH                      0x29
 #  define MIPS_SWL                     0x2a
 #  define MIPS_SW                      0x2b
+#  define MIPS_SDL                     0x2c
+#  define MIPS_SDR                     0x2d
 #  define MIPS_SWR                     0x2e
 #  define MIPS_CACHE                   0x2f
 #  define MIPS_LL                      0x30
@@ -173,6 +202,9 @@ typedef union {
 #  define MIPS_LDC2                    0x36
 #  define MIPS_LD                      0x37
 #  define MIPS_SC                      0x38
+#  define MIPS_BC_R6                   0x32
+#  define MIPS_BALC                    0x3a
+#  define MIPS_PCREL                   0x3b
 #  define MIPS_SCD                     0x3c
 #  define MIPS_SDC1                    0x3d
 #  define MIPS_SDC2                    0x3e
@@ -213,7 +245,7 @@ typedef union {
 #  define MIPS_BGEZALL                 0x13
 #  define MIPS_SYNCI                   0x1f
 #  define MIPS_WSBH                    0x02
-#  define MIPS_DBSH                    0x02
+#  define MIPS_DSBH                    0x02
 #  define MIPS_DSHD                    0x05
 #  define MIPS_SEB                     0x10
 #  define MIPS_SEH                     0x18
@@ -321,6 +353,10 @@ static void _hrri(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
 #  define hrri9(hc,rs,rt,i9,tc)                _hrri9(_jit,hc,rs,rt,i9,tc)
 static void _hrri9(jit_state_t*,jit_int32_t,jit_int32_t,
                   jit_int32_t,jit_int32_t,jit_int32_t);
+#  define hriD(hc,rs,pD,iD)            _hriD(_jit,hc,rs,pD,iD)
+static void _hriD(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
+#  define hriW(hc,rs,pW,iW)            _hriW(_jit,hc,rs,pW,iW)
+static void _hriW(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
 #  define hi(hc,im)                    _hi(_jit,hc,im)
 static void _hi(jit_state_t*,jit_int32_t,jit_int32_t);
 #  define NOP(i0)                      instr(0)
@@ -328,7 +364,11 @@ static void _hi(jit_state_t*,jit_int32_t,jit_int32_t);
 static void _nop(jit_state_t*,jit_int32_t);
 #  define h_ri(hc,rt,im)               _hrri(_jit,hc,0,rt,im)
 #  define rrit(rt,rd,im,tc)            _hrrrit(_jit,0,0,rt,rd,im,tc)
+#  define AUIPC(rs,im)                 hrri(MIPS_PCREL,rs,30,im)
+#  define ALUIPC(rs,im)                        hrri(MIPS_PCREL,rs,31,im)
+#  define ADDIUPC(rs,im)               hriW(MIPS_PCREL,rs,0,im)
 #  define LUI(rt,im)                   h_ri(MIPS_LUI,rt,im)
+#  define AUI(rs,rt)                   hrri(MIPS_LUI,rs,rt,im) /* mips r6 */
 #  define ADDU(rd,rs,rt)               rrr_t(rs,rt,rd,MIPS_ADDU)
 #  define DADDU(rd,rs,rt)              rrr_t(rs,rt,rd,MIPS_DADDU)
 #  define ADDIU(rt,rs,im)              hrri(MIPS_ADDIU,rs,rt,im)
@@ -384,7 +424,10 @@ static void _nop(jit_state_t*,jit_int32_t);
 #  define DEXTU(rt,rs,pos,size)                hrrrit(MIPS_SPECIAL3,rs,rt,size-1,pos-32,MIPS_DEXTU)
 #  define DEXTM(rt,rs,pos,size)                hrrrit(MIPS_SPECIAL3,rs,rt,size-32-1,pos,MIPS_DEXTM)
 #  define ROTR(rd,rt,sa)               hrrrit(MIPS_SPECIAL,1,rt,rd,sa,MIPS_SRL)
+#  define ROTRV(rd,rt,rs)              hrrrit(MIPS_SPECIAL,rs,rt,rd,1,MIPS_SRLV)
 #  define DROTR(rd,rt,sa)              hrrrit(MIPS_SPECIAL,1,rt,rd,sa,MIPS_DSRL)
+#  define DROTR32(rd,rt,sa)            hrrrit(MIPS_SPECIAL,1,rt,rd,sa,MIPS_DSRL32)
+#  define DROTRV(rd,rt,rs)             hrrrit(MIPS_SPECIAL,rs,rt,rd,1,MIPS_DSRLV)
 #  define SYNC()                       rrr_t(_ZERO_REGNO,_ZERO_REGNO,_ZERO_REGNO,MIPS_SYNC)
 #  define MFHI(rd)                     rrr_t(_ZERO_REGNO,_ZERO_REGNO,rd,MIPS_MFHI)
 #  define MFLO(rd)                     rrr_t(_ZERO_REGNO,_ZERO_REGNO,rd,MIPS_MFLO)
@@ -402,27 +445,43 @@ static void _nop(jit_state_t*,jit_int32_t);
 #  define LH(rt,of,rb)                 hrri(MIPS_LH,rb,rt,of)
 #  define LHU(rt,of,rb)                        hrri(MIPS_LHU,rb,rt,of)
 #  define LW(rt,of,rb)                 hrri(MIPS_LW,rb,rt,of)
+#  define LWPC(rs,im)                  hriW(MIPS_PCREL,rs,1,im)
 #  define LWU(rt,of,rb)                        hrri(MIPS_LWU,rb,rt,of)
+#  define LWUPC(rs,im)                 hriW(MIPS_PCREL,rs,2,im)
+#  define LWL(rt,of,rb)                        hrri(MIPS_LWL,rb,rt,of)
+#  define LWR(rt,of,rb)                        hrri(MIPS_LWR,rb,rt,of)
 #  define LD(rt,of,rb)                 hrri(MIPS_LD,rb,rt,of)
+#  define LDPC(rs,im)                  hriD(MIPS_PCREL,rs,6,im)
 #  define LL(rt,of,rb)                 hrri(MIPS_LL,rb,rt,of)
 #  define LL_R6(rt,of,rb)              hrri9(MIPS_SPECIAL3,rb,rt,of,54)
 #  define LLD(rt,of,rb)                        hrri(MIPS_LLD,rb,rt,of)
 #  define LLD_R6(rt,of,rb)             hrri9(MIPS_SPECIAL3,rb,rt,of,55)
+#  define LDL(rt,of,rb)                        hrri(MIPS_LDL,rb,rt,of)
+#  define LDR(rt,of,rb)                        hrri(MIPS_LDR,rb,rt,of)
 #  define SB(rt,of,rb)                 hrri(MIPS_SB,rb,rt,of)
 #  define SH(rt,of,rb)                 hrri(MIPS_SH,rb,rt,of)
 #  define SW(rt,of,rb)                 hrri(MIPS_SW,rb,rt,of)
+#  define SWL(rt,of,rb)                        hrri(MIPS_SWL,rb,rt,of)
+#  define SWR(rt,of,rb)                        hrri(MIPS_SWR,rb,rt,of)
 #  define SD(rt,of,rb)                 hrri(MIPS_SD,rb,rt,of)
 #  define SC(rt,of,rb)                 hrri(MIPS_SC,rb,rt,of)
 #  define SC_R6(rt,of,rb)              hrri9(MIPS_SPECIAL3,rb,rt,of,38)
 #  define SCD(rt,of,rb)                        hrri(MIPS_SCD,rb,rt,of)
 #  define SCD_R6(rt,of,rb)             hrri9(MIPS_SPECIAL3,rb,rt,of,39)
+#  define SDL(rt,of,rb)                        hrri(MIPS_SDL,rb,rt,of)
+#  define SDR(rt,of,rb)                        hrri(MIPS_SDR,rb,rt,of)
 #  define WSBH(rd,rt)                  hrrrit(MIPS_SPECIAL3,0,rt,rd,MIPS_WSBH,MIPS_BSHFL)
 #  define SEB(rd,rt)                   hrrrit(MIPS_SPECIAL3,0,rt,rd,MIPS_SEB,MIPS_BSHFL)
 #  define SEH(rd,rt)                   hrrrit(MIPS_SPECIAL3,0,rt,rd,MIPS_SEH,MIPS_BSHFL)
+#  define DSBH(rd,rt)                  hrrrit(MIPS_SPECIAL3,0,rt,rd,MIPS_DSBH,MIPS_DBSHFL)
+#  define DSHD(rd,rt)                  hrrrit(MIPS_SPECIAL3,0,rt,rd,MIPS_DSHD,MIPS_DBSHFL)
 #  define SLT(rd,rs,rt)                        rrr_t(rs,rt,rd,MIPS_SLT)
 #  define SLTU(rd,rs,rt)               rrr_t(rs,rt,rd,MIPS_SLTU)
 #  define SLTI(rt,rs,im)               hrri(MIPS_SLTI,rs,rt,im)
 #  define SLTIU(rt,rs,im)              hrri(MIPS_SLTIU,rs,rt,im)
+#  define DAUI(rt,rs,im)               hrri(MIPS_DAUI,rs,rt,im)
+#  define DAHI(rs,im)                  hrri(MIPS_REGIMM,rs,6,im)
+#  define DATI(rs,im)                  hrri(MIPS_REGIMM,rs,30,im)
 #  define BLTZ(rs,im)                  hrri(MIPS_REGIMM,rs,MIPS_BLTZ,im)
 #  define BLEZ(rs,im)                  hrri(MIPS_BLEZ,rs,_ZERO_REGNO,im)
 #  define BEQ(rs,rt,im)                        hrri(MIPS_BEQ,rs,rt,im)
@@ -448,14 +507,14 @@ static void _nop(jit_state_t*,jit_int32_t);
 #  define DCLZ(rd,rs)                  hrrrit(MIPS_SPECIAL2,rs,rd,rd,0,MIPS_DCLZ)
 #  define J(i0)                                hi(MIPS_J,i0)
 #  define JAL(i0)                      hi(MIPS_JAL,i0)
+#  define BC_R6(i0)                    hi(MIPS_BC_R6,i0)
+#  define BALC(i0)                     hi(MIPS_BALC,i0)
 #  define MOVN(rd,rs,rt)               hrrrit(0,rs,rt,rd,0,MIPS_MOVN)
 #  define MOVZ(rd,rs,rt)               hrrrit(0,rs,rt,rd,0,MIPS_MOVZ)
 #  define SELEQZ(rd,rs,rt)             hrrrit(0,rs,rt,rd,0,53)
 #  define SELNEZ(rd,rs,rt)             hrrrit(0,rs,rt,rd,0,55)
 #  define comr(r0,r1)                  xori(r0,r1,-1)
 #  define negr(r0,r1)                  subr(r0,_ZERO_REGNO,r1)
-#  define bitswap(r0,r1)               _bitswap(_jit, r0, r1);
-static void _bitswap(jit_state_t*,jit_int32_t,jit_int32_t);
 #  define clor(r0, r1)                 _clor(_jit, r0, r1)
 static void _clor(jit_state_t*, jit_int32_t, jit_int32_t);
 #  define clzr(r0, r1)                 _clzr(_jit, r0, r1)
@@ -464,6 +523,8 @@ static void _clzr(jit_state_t*, jit_int32_t, jit_int32_t);
 static void _ctor(jit_state_t*, jit_int32_t, jit_int32_t);
 #  define ctzr(r0, r1)                 _ctzr(_jit, r0, r1)
 static void _ctzr(jit_state_t*, jit_int32_t, jit_int32_t);
+#  define rbitr(r0, r1)                        _rbitr(_jit, r0, r1)
+static void _rbitr(jit_state_t*, jit_int32_t, jit_int32_t);
 #  if __WORDSIZE == 32
 #    define addr(rd,rs,rt)             ADDU(rd,rs,rt)
 #    define addiu(r0,r1,i0)            ADDIU(r0,r1,i0)
@@ -497,8 +558,8 @@ static void _ctzr(jit_state_t*, jit_int32_t, jit_int32_t);
 #    define mod_r6(rd,rs,rt)           DMOD_R6(rd,rs,rt)
 #    define modu_r6(rd,rs,rt)          DMODU_R6(rd,rs,rt)
 #  endif
-#  define extr(rd,rt,lsb,nb)   _extr(_jit,rd,rt,lsb,nb)
-static void _extr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
+#  define mips_extr(rd,rt,lsb,nb)      _mips_extr(_jit,rd,rt,lsb,nb)
+static void _mips_extr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
 #  define insr(rd,rt,lsb,nb)   _insr(_jit,rd,rt,lsb,nb)
 static void _insr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
 #  define addi(r0,r1,i0)               _addi(_jit,r0,r1,i0)
@@ -527,6 +588,14 @@ static void _rsbi(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
 static void _mulr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
 #  define muli(r0,r1,i0)               _muli(_jit,r0,r1,i0)
 static void _muli(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
+#  define hmulr(r0,r1,r2)              _hmulr(_jit,r0,r1,r2)
+static void _hmulr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
+#  define hmuli(r0,r1,i0)              _hmuli(_jit,r0,r1,i0)
+static void _hmuli(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
+#  define hmulr_u(r0,r1,r2)            _hmulr_u(_jit,r0,r1,r2)
+static void _hmulr_u(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
+#  define hmuli_u(r0,r1,i0)            _hmuli_u(_jit,r0,r1,i0)
+static void _hmuli_u(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
 #  define qmulr(r0,r1,r2,r3)           iqmulr(r0,r1,r2,r3,1)
 #  define qmulr_u(r0,r1,r2,r3)         iqmulr(r0,r1,r2,r3,0)
 #  define iqmulr(r0,r1,r2,r3,cc)       _iqmulr(_jit,r0,r1,r2,r3,cc)
@@ -581,6 +650,33 @@ static void _rshi(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
 #    define rshi_u(r0,r1,i0)           _rshi_u(_jit,r0,r1,i0)
 static void _rshi_u(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
 #  endif
+#  define qlshr(r0,r1,r2,r3)           xlshr(1,r0,r1,r2,r3)
+#  define qlshr_u(r0, r1, r2, r3)      xlshr(0, r0, r1, r2, r3)
+#  define xlshr(s,r0,r1,r2,r3)         _xlshr(_jit,s,r0,r1,r2,r3)
+static void
+_xlshr(jit_state_t*,jit_bool_t,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
+#  define qlshi(r0, r1, r2, i0)                xlshi(1, r0, r1, r2, i0)
+#  define qlshi_u(r0, r1, r2, i0)      xlshi(0, r0, r1, r2, i0)
+#  define xlshi(s, r0, r1, r2, i0)     _xlshi(_jit, s, r0, r1, r2, i0)
+static void
+_xlshi(jit_state_t*,jit_bool_t,jit_int32_t,jit_int32_t,jit_int32_t,jit_word_t);
+#  define qrshr(r0, r1, r2, r3)                xrshr(1, r0, r1, r2, r3)
+#  define qrshr_u(r0, r1, r2, r3)      xrshr(0, r0, r1, r2, r3)
+#  define xrshr(s, r0, r1, r2, r3)     _xrshr(_jit, s, r0, r1, r2, r3)
+static void
+_xrshr(jit_state_t*,jit_bool_t,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t);
+#  define qrshi(r0, r1, r2, i0)                xrshi(1, r0, r1, r2, i0)
+#  define qrshi_u(r0, r1, r2, i0)      xrshi(0, r0, r1, r2, i0)
+#  define xrshi(s, r0, r1, r2, i0)     _xrshi(_jit, s, r0, r1, r2, i0)
+static void
+_xrshi(jit_state_t*,jit_bool_t,jit_int32_t,jit_int32_t,jit_int32_t,jit_word_t);
+#    define lrotr(r0,r1,r2)            _lrotr(_jit,r0,r1,r2)
+static void _lrotr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
+#    define lroti(r0,r1,i0)            rroti(r0,r1,__WORDSIZE-i0)
+#    define rrotr(r0,r1,r2)            _rrotr(_jit,r0,r1,r2)
+static void _rrotr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
+#    define rroti(r0,r1,i0)            _rroti(_jit,r0,r1,i0)
+static void _rroti(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
 #  define andr(r0,r1,r2)               AND(r0,r1,r2)
 #  define andi(r0,r1,i0)               _andi(_jit,r0,r1,i0)
 static void _andi(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
@@ -658,6 +754,14 @@ static void _ldxr_l(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
 #    define ldxi_l(r0,r1,i0)           _ldxi_l(_jit,r0,r1,i0)
 static void _ldxi_l(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
 #  endif
+#  define unldr(r0, r1, i0)            _unldr(_jit, r0, r1, i0)
+static void _unldr(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t);
+#  define unldi(r0, i0, i1)            _unldi(_jit, r0, i0, i1)
+static void _unldi(jit_state_t*, jit_int32_t, jit_word_t, jit_word_t);
+#  define unldr_u(r0, r1, i0)          _unldr_u(_jit, r0, r1, i0)
+static void _unldr_u(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t);
+#  define unldi_u(r0, i0, i1)          _unldi_u(_jit, r0, i0, i1)
+static void _unldi_u(jit_state_t*, jit_int32_t, jit_word_t, jit_word_t);
 #  define str_c(r0,r1)                 SB(r1,0,r0)
 #  define sti_c(i0,r0)                 _sti_c(_jit,i0,r0)
 static void _sti_c(jit_state_t*,jit_word_t,jit_int32_t);
@@ -690,13 +794,23 @@ static void _stxr_l(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
 #    define stxi_l(i0,r0,r1)           _stxi_l(_jit,i0,r0,r1)
 static void _stxi_l(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
 #  endif
+#  define unstr(r0, r1, i0)            _unstr(_jit, r0, r1, i0)
+static void _unstr(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t);
+#  define unsti(i0, r0, i1)            _unsti(_jit, i0, r0, i1)
+static void _unsti(jit_state_t*, jit_word_t, jit_int32_t, jit_word_t);
 #  define bswapr_us(r0,r1)             _bswapr_us(_jit,r0,r1)
 static void _bswapr_us(jit_state_t*,jit_int32_t,jit_int32_t);
 #  define bswapr_ui(r0,r1)             _bswapr_ui(_jit,r0,r1)
 static void _bswapr_ui(jit_state_t*,jit_int32_t,jit_int32_t);
 #  if __WORDSIZE == 64
-#    define bswapr_ul(r0,r1)           generic_bswapr_ul(_jit,r0,r1)
+#  define bswapr_ul(r0,r1)             _bswapr_ul(_jit,r0,r1)
+static void _bswapr_ul(jit_state_t*,jit_int32_t,jit_int32_t);
 #  endif
+#define extr(r0,r1,i0,i1)              fallback_ext(r0,r1,i0,i1)
+#define extr_u(r0,r1,i0,i1)            _extr_u(_jit,r0,r1,i0,i1)
+static void _extr_u(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t,jit_word_t);
+#define depr(r0,r1,i0,i1)              _depr(_jit,r0,r1,i0,i1)
+static void _depr(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t,jit_word_t);
 #  define extr_c(r0,r1)                        _extr_c(_jit,r0,r1)
 static void _extr_c(jit_state_t*,jit_int32_t,jit_int32_t);
 #  define extr_uc(r0,r1)               ANDI(r0,r1,0xff)
@@ -891,7 +1005,7 @@ _jit_get_reg_for_delay_slot(jit_state_t *_jit, jit_int32_t mask,
                            jit_int32_t reg0, jit_int32_t reg1)
 {
     jit_instr_t                i;
-    jit_int32_t                reg, r0, r1, r2, regs[3];
+    jit_int32_t                reg, r0, r1, r2/*, xreg*/, regs[3];
     /* If will emit a pending instruction */
     if (_jitc->inst.pend)
        i.op = _jitc->inst.op;
@@ -901,7 +1015,7 @@ _jit_get_reg_for_delay_slot(jit_state_t *_jit, jit_int32_t mask,
     /* Else, a nop */
     else
        i.op = 0;
-    regs[0] = regs[1] = regs[2] = -1;
+    regs[0] = regs[1] = regs[2]/* = xreg*/ = -1;
     switch (i.hc.b) {
        case MIPS_SPECIAL:              /* 00 */
            switch (i.tc.b) {
@@ -993,6 +1107,9 @@ _jit_get_reg_for_delay_slot(jit_state_t *_jit, jit_int32_t mask,
                    }
                    break;
                case MIPS_SLL:          /* 00 */
+                   /* If cannot have a shift in delay slot */
+                   if (!jit_cpu.sll_delay)
+                       flush();
                case MIPS_SRL:          /* 02 */
                case MIPS_SRA:          /* 03 */
                case MIPS_DSLL:         /* 38 */
@@ -1012,7 +1129,7 @@ _jit_get_reg_for_delay_slot(jit_state_t *_jit, jit_int32_t mask,
                case MIPS_SYNC:         /* 0f */
                    assert(i.rs.b == 0 && i.rt.b == 0 && i.rd.b == 0);
                    if (mask & jit_class_gpr)
-                       regs[0] = regs[1] = regs[1] = 0;
+                       regs[0] = regs[1] = regs[2] = 0;
                    break;
                case MIPS_MOVZ:         /* 0a */
                case MIPS_MOVN:         /* 0b */
@@ -1040,6 +1157,11 @@ _jit_get_reg_for_delay_slot(jit_state_t *_jit, jit_int32_t mask,
            break;
        case MIPS_REGIMM:               /* 01 */
            switch (i.rt.b) {
+               /* DAHI */
+               case 6:                 /* 06 */
+               /* DATI */
+               case 15:                /* 1e */
+                   assert(jit_mips6_p());
                case MIPS_BLTZ:         /* 00 */
                case MIPS_BGEZ:         /* 01 */
                case MIPS_BGEZAL:       /* 11 */
@@ -1052,16 +1174,20 @@ _jit_get_reg_for_delay_slot(jit_state_t *_jit, jit_int32_t mask,
                regs[1] = regs[2] = 0;
            }
            break;
+       case MIPS_BC_R6:                /* 32 */
+       case MIPS_BALC:                 /* 3a */
+           assert(jit_mips6_p());
        case MIPS_J:                    /* 02 */
        case MIPS_JAL:                  /* 03 */
            if (mask & jit_class_gpr)
                regs[0] = regs[1] = regs[2] = 0;
            break;
        case MIPS_LUI:                  /* 0f */
-           assert(i.rs.b == 0);
+           assert(jit_mips6_p() || i.rs.b == 0);
            if (mask & jit_class_gpr) {
                regs[0] = i.rt.b;
-               regs[1] = regs[1] = 0;
+               regs[1] = i.rs.b;       /* AUI if non zero */
+               regs[1] = 0;
            }
            break;
        case MIPS_SPECIAL2:             /* 1c */
@@ -1110,7 +1236,9 @@ _jit_get_reg_for_delay_slot(jit_state_t *_jit, jit_int32_t mask,
                /* DBITSWAP */
                case MIPS_DBSHFL:       /* 24 */
                    switch (i.ic.b) {
+                       /* DSBH */
                        case MIPS_WSBH: /* 02 */
+                       case MIPS_DSHD: /* 05 */
                        case MIPS_SEB:  /* 10 */
                        case MIPS_SEH:  /* 18 */
                            if (mask & jit_class_gpr) {
@@ -1162,6 +1290,10 @@ _jit_get_reg_for_delay_slot(jit_state_t *_jit, jit_int32_t mask,
                        case MIPS_DMT:  /* 05 */
                        case MIPS_MTH:  /* 07 */
                            assert(i.ic.b == 0);
+                           /* If these cop1 instructions in delay slot
+                            * wont work  */
+                           if (!jit_cpu.cop1_delay)
+                               flush();
                            if (mask & jit_class_gpr) {
                                regs[0] = i.rt.b;
                                regs[1] = regs[2] = 0;
@@ -1173,6 +1305,9 @@ _jit_get_reg_for_delay_slot(jit_state_t *_jit, jit_int32_t mask,
                            goto three_fprs;
                    }
                    break;
+               case MIPS_MADDF:        /* 18 */
+               case MIPS_MSUBF:        /* 19 */
+                   assert(jit_mips6_p());
                case MIPS_SUB_fmt:      /* 01 */
                case MIPS_MUL_fmt:      /* 02 */
                case MIPS_DIV_fmt:      /* 03 */
@@ -1313,6 +1448,36 @@ _jit_get_reg_for_delay_slot(jit_state_t *_jit, jit_int32_t mask,
                    break;
            }
            break;
+       case MIPS_COP1X:                /* 13 */
+           switch (i.tc.b) {
+               case MIPS_MADD_fmt_S:
+               case MIPS_MADD_fmt_D:
+               case MIPS_MSUB_fmt_S:
+               case MIPS_MSUB_fmt_D:
+               case MIPS_NMADD_fmt_S:
+               case MIPS_NMADD_fmt_D:
+               case MIPS_NMSUB_fmt_S:
+               case MIPS_NMSUB_fmt_D:
+                   assert(!jit_mips6_p());
+                   if (mask & jit_class_gpr)
+                       regs[0] = regs[1] = regs[2] = 0;
+                   else {
+                       regs[0] = i.ft.b;
+                       regs[1] = i.fs.b;
+                       regs[2] = i.fd.b;
+                       /* FIXME No need to compute and check it.
+                        * If asking for a tmeporary fpr, code will
+                        * be flushed. */
+                       /* xreg = i.fr.b; */
+                   }
+                   break;
+               default:
+                   abort();
+           }
+           break;
+       case MIPS_DAUI: /* JALX */      /* 1d */
+           /* Do not generate JALX. No microMIPS64 or MIPS16e support */
+           assert(jit_mips6_p() && i.rs.b != 0);
        case MIPS_ADDIU:                /* 09 */
        case MIPS_SLTI:                 /* 0a */
        case MIPS_SLTIU:                /* 0b */
@@ -1320,6 +1485,8 @@ _jit_get_reg_for_delay_slot(jit_state_t *_jit, jit_int32_t mask,
        case MIPS_ORI:                  /* 0d */
        case MIPS_XORI:                 /* 0e */
        case MIPS_DADDIU:               /* 18 */
+       case MIPS_LDL:                  /* 1a */
+       case MIPS_LDR:                  /* 1b */
        case MIPS_LB:                   /* 20 */
        case MIPS_LH:                   /* 21 */
        case MIPS_LW:                   /* 23 */
@@ -1337,6 +1504,21 @@ _jit_get_reg_for_delay_slot(jit_state_t *_jit, jit_int32_t mask,
                regs[2] = 0;
            }
            break;
+       case MIPS_LWL:                  /* 22 */
+       case MIPS_LWR:                  /* 26 */
+           if (!jit_cpu.lwl_lwr_delay)
+               flush();
+       case MIPS_SWL:                  /* 2a */
+       case MIPS_SWR:                  /* 2e */
+       case MIPS_SDL:                  /* 2c */
+       case MIPS_SDR:                  /* 2d */
+           assert(!(jit_mips6_p()));
+           if (mask & jit_class_gpr) {
+               regs[0] = i.rs.b;
+               regs[1] = i.rt.b;
+               regs[2] = 0;
+           }
+           break;
        case MIPS_LL:                   /* 30 */
        case MIPS_LLD:                  /* 34 */
        case MIPS_SC:                   /* 38 */
@@ -1356,13 +1538,24 @@ _jit_get_reg_for_delay_slot(jit_state_t *_jit, jit_int32_t mask,
                regs[1] = regs[2] = 0;
            }
            break;
-       case MIPS_BEQ:                  /* 04 */
-       case MIPS_BNE:                  /* 05 */
-           assert(i.rt.b == 0);
        case MIPS_LWC1:                 /* 31 */
        case MIPS_LDC1:                 /* 35 */
        case MIPS_SWC1:                 /* 39 */
        case MIPS_SDC1:                 /* 3d */
+           /* If these cop1 instructions in delay wont not work  */
+           if (!jit_cpu.cop1_delay)
+               flush();
+           if (mask & jit_class_gpr) {
+               regs[0] = i.rs.b;
+               regs[1] = i.rt.b;
+               regs[2] = 0;
+           }
+           else
+               regs[0] = i.rt.b;
+           break;
+       case MIPS_BEQ:                  /* 04 */
+       case MIPS_BNE:                  /* 05 */
+           assert(i.rt.b == 0);
            if (mask & jit_class_gpr) {
                regs[0] = i.rs.b;
                regs[1] = i.rt.b;
@@ -1371,6 +1564,24 @@ _jit_get_reg_for_delay_slot(jit_state_t *_jit, jit_int32_t mask,
            else
                regs[0] = i.rt.b;
            break;
+       case MIPS_PCREL:                /* 0x3b */
+           assert(jit_mips6_p());
+           switch (i.rt.b) {
+               case 0x1e:              /* AUIPC */
+               case 0x1f:              /* ALUIPC */
+                   break;
+               default:
+                   assert(i.pD.b == 1 ||/* LDPC */
+                          i.pW.b == 0 ||/* ADDIUPC */
+                          i.pW.b == 1 ||/* LWPC */
+                          i.pW.b == 2); /* LWUPC */
+                   break;
+           }
+           if (mask & jit_class_gpr) {
+               regs[0] = i.rs.b;
+               regs[1] = regs[2] = 0;
+           }
+           break;
        default:
            abort();
     }
@@ -1460,6 +1671,30 @@ _hrri9(jit_state_t *_jit, jit_int32_t hc,
     instr(i.op);
 }
 
+static void
+_hriD(jit_state_t *_jit, jit_int32_t hc,
+       jit_int32_t rs, jit_int32_t pD, jit_int32_t iD)
+{
+    jit_instr_t                i;
+    i.iD.b = iD;
+    i.pD.b = pD;
+    i.rs.b = rs;
+    i.hc.b = hc;
+    instr(i.op);
+}
+
+static void
+_hriW(jit_state_t *_jit, jit_int32_t hc,
+       jit_int32_t rs, jit_int32_t pW, jit_int32_t iW)
+{
+    jit_instr_t                i;
+    i.iW.b = iW;
+    i.pD.b = pW;
+    i.rs.b = rs;
+    i.hc.b = hc;
+    instr(i.op);
+}
+
 static void
 _hi(jit_state_t *_jit, jit_int32_t hc, jit_int32_t im)
 {
@@ -1478,7 +1713,7 @@ _nop(jit_state_t *_jit, jit_int32_t i0)
 }
 
 static void
-_extr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+_mips_extr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
       jit_int32_t pos, jit_int32_t size)
 {
     assert(size > 0);
@@ -1509,58 +1744,18 @@ _insr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
         DINS(r0, r1, pos, size);
 }
 
-/* http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel */
-/*
-unsigned int s = sizeof(v) * CHAR_BIT; // bit size; must be power of 2 
-unsigned int mask = ~0;         
-while ((s >>= 1) > 0) 
-{
-  mask ^= (mask << s);
-  v = ((v >> s) & mask) | ((v << s) & ~mask);
-}
-*/
-static void
-_bitswap(jit_state_t *_jit, jit_int32_t v, jit_int32_t r1)
-{
-    jit_int32_t                s, mask;
-    jit_word_t         loop, done, t0, t1;
-    movr(v, r1);
-    s = jit_get_reg(jit_class_gpr);
-    movi(rn(s), __WORDSIZE);                   /* s = sizeof(v) * CHAR_BIT; */
-    mask = jit_get_reg(jit_class_gpr);
-    movi(rn(mask), ~0L);                       /* mask = ~0; */
-    flush();
-    loop = _jit->pc.w;                         /* while ((s >>= 1) > 0) */
-    rshi(rn(s), rn(s), 1);                     /*        (s >>= 1) */
-    done = blei(_jit->pc.w, rn(s), 0);         /* no loop if s <= 0 */
-    t0 = jit_get_reg(jit_class_gpr);
-    lshr(rn(t0), rn(mask), rn(s));             /* t0 = (mask << s) */
-    xorr(rn(mask), rn(mask), rn(t0));          /* mask ^= t0 */
-    rshr(rn(t0), v, rn(s));                    /* t0 = v >> s */
-    andr(rn(t0), rn(t0), rn(mask));            /* t0 = t0 & mask */
-    t1 = jit_get_reg(jit_class_gpr);
-    lshr(rn(t1), v, rn(s));                    /* t1 = v << s */
-    comr(v, rn(mask));                         /* v = ~mask */
-    andr(rn(t1), v, rn(t1));                   /* t1 = t1 & v */
-    orr(v, rn(t0), rn(t1));                    /* v = t0 | t1 */
-    jmpi(loop, 0);
-    flush();
-    patch_at(done, _jit->pc.w);
-    jit_unget_reg(t1);
-    jit_unget_reg(t0);
-    jit_unget_reg(mask);
-    jit_unget_reg(s);
-}
-
 static void
 _clor(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
 {
 #if __WORDSIZE == 32
     if (jit_mips6_p())
        CLO_R6(r0, r1);
-    else
+    else if (jit_mips2_p())
        CLO(r0, r1);
+    else
+       fallback_clo(r0, r1);
 #else
+    assert(jit_mips2_p());
     if (jit_mips6_p())
        DCLO_R6(r0, r1);
     else
@@ -1574,9 +1769,12 @@ _clzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
 #if __WORDSIZE == 32
     if (jit_mips6_p())
        CLZ_R6(r0, r1);
-    else
+    else if (jit_mips2_p())
        CLZ(r0, r1);
+    else
+       fallback_clz(r0, r1);
 #else
+    assert(jit_mips2_p());
     if (jit_mips6_p())
        DCLZ_R6(r0, r1);
     else
@@ -1588,40 +1786,53 @@ static void
 _ctor(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
 {
     if (jit_mips6_p()) {
-#if __WORDSIZE == 32
-       BITSWAP(r0, r1);
-       bswapr_ui(r0, r0);
-       CLO_R6(r0, r0);
-#else
-       DBITSWAP(r0, r1);
-       bswapr_ul(r0, r0);
-       DCLO_R6(r0, r0);
-#endif
+        rbitr(r0, r1);
+        clor(r0, r0);
     }
     else {
-       bitswap(r0, r1);
-       clor(r0, r0);
+        comr(r0, r1);
+        ctzr(r0, r0);
     }
 }
 
 static void
 _ctzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
+{
+    if (jit_mips6_p()) {
+        rbitr(r0, r1);
+        clzr(r0, r0);
+    }
+    else {
+        jit_int32_t     t0, t1;
+
+        t0 = jit_get_reg(jit_class_gpr);
+        t1 = jit_get_reg(jit_class_gpr);
+
+        negr(rn(t0), r1);
+        andr(rn(t0), rn(t0), r1);
+        clzr(r0, rn(t0));
+        xori(rn(t1), r0, __WORDSIZE - 1);
+        movnr(r0, rn(t1), rn(t0));
+
+        jit_unget_reg(t0);
+        jit_unget_reg(t1);
+    }
+}
+
+static void
+_rbitr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
 {
     if (jit_mips6_p()) {
 #if __WORDSIZE == 32
        BITSWAP(r0, r1);
        bswapr_ui(r0, r0);
-       CLZ_R6(r0, r0);
 #else
        DBITSWAP(r0, r1);
        bswapr_ul(r0, r0);
-       DCLZ_R6(r0, r0);
 #endif
     }
-    else {
-       bitswap(r0, r1);
-       clzr(r0, r0);
-    }
+    else
+       fallback_rbit(r0, r1);
 }
 
 static void
@@ -1633,11 +1844,50 @@ _addi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
     else if (can_sign_extend_short_p(i0))
        addiu(r0, r1, i0);
     else {
+       if (jit_mips6_p()) {
+           union {
+               struct {
+#  if __BYTE_ORDER == __LITTLE_ENDIAN
+                   jit_word_t  _ :     16;
+                   jit_word_t  aui :   16;
+#   if __WORDSIZE == 64
+                   jit_word_t  ahi :   16;
+                   jit_word_t  ati :   16;
+#   endif
+#  else
+#   if __WORDSIZE == 64
+                   jit_word_t  ati :   16;
+                   jit_word_t  ahi :   16;
+#   endif
+                   jit_word_t  aui :   16;
+                   jit_word_t  _ :     16;
+#  endif
+               } b;
+               jit_word_t              w;
+           } bits;
+           bits.w = i0;
+           if (r0 == r1 && ((jit_word_t)bits.b.aui << 16) == i0)
+               /* FIXME It should not be required r0 == r1 per
+                * documentation, but this is now it works in qemu
+                * for DAUI. Assume AUI has the same restriction. */
+               DAUI(r1, r0, bits.b.aui & 0xffff);
+#if __WORDSIZE == 64
+           else if (r0 == r1 && ((jit_word_t)bits.b.ahi << 32) == i0)
+               DAHI(r0, bits.b.ahi & 0xffff);
+           else if (r0 == r1 && ((jit_word_t)bits.b.ati << 48) == i0)
+               DATI(r0, bits.b.ati & 0xffff);
+#endif
+           else
+               goto fallback;
+           goto done;
+       }
+    fallback:
        reg = jit_get_reg(jit_class_gpr);
        movi(rn(reg), i0);
        addr(r0, r1, rn(reg));
        jit_unget_reg(reg);
     }
+done:;
 }
 
 static void
@@ -1841,6 +2091,48 @@ _muli(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
     jit_unget_reg(reg);
 }
 
+static void
+_hmulr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
+{
+    if (jit_mips6_p())
+       muh_r6(r0, r1, r2);
+    else {
+       mult(r1, r2);
+       MFHI(r0);
+    }
+}
+
+static void
+_hmuli(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
+{
+    jit_int32_t                reg;
+    reg = jit_get_reg(jit_class_gpr);
+    movi(rn(reg), i0);
+    hmulr(r0, r1, rn(reg));
+    jit_unget_reg(reg);
+}
+
+static void
+_hmulr_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
+{
+    if (jit_mips6_p())
+       muhu_r6(r0, r1, r2);
+    else {
+       multu(r1, r2);
+       MFHI(r0);
+    }
+}
+
+static void
+_hmuli_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
+{
+    jit_int32_t                reg;
+    reg = jit_get_reg(jit_class_gpr);
+    movi(rn(reg), i0);
+    hmulr_u(r0, r1, rn(reg));
+    jit_unget_reg(reg);
+}
+
 static void
 _iqmulr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
        jit_int32_t r2, jit_int32_t r3, jit_bool_t sign)
@@ -2056,6 +2348,202 @@ _rshi_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
 }
 #endif
 
+static void
+_xlshr(jit_state_t *_jit, jit_bool_t sign,
+       jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3)
+{
+    jit_int32_t                t0, s0, t2, s2, t3, s3;
+    s0 = jit_get_reg(jit_class_gpr);
+    t0 = rn(s0);
+    if (r0 == r2 || r1 == r2) {
+        s2 = jit_get_reg(jit_class_gpr);
+        t2 = rn(s2);
+        movr(t2, r2);
+    }
+    else
+        t2 = r2;
+    if (r0 == r3 || r1 == r3) {
+        s3 = jit_get_reg(jit_class_gpr);
+        t3 = rn(s3);
+        movr(t3, r3);
+    }
+    else
+        t3 = r3;
+    rsbi(t0, t3, __WORDSIZE);
+    lshr(r0, t2, t3);
+    if (sign)
+        rshr(r1, t2, t0);
+    else
+        rshr_u(r1, t2, t0);
+    if (sign) {
+        rshi(t0, t2, __WORDSIZE - 1);
+        /* zero? */
+        movzr(r1, t0, t3);
+    }
+    else {
+        /* zero? */
+        movzr(r1, t3, t3);
+    }
+    /* overflow? */
+    nei(t0, t3, __WORDSIZE);
+    movzr(r0, t0, t0);
+
+    jit_unget_reg(s0);
+    if (t2 != r2)
+       jit_unget_reg(s2);
+    if (t3 != r3)
+       jit_unget_reg(s3);
+}
+
+static void
+_xlshi(jit_state_t *_jit, jit_bool_t sign,
+       jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_word_t i0)
+{
+    if (i0 == 0) {
+       movr(r0, r2);
+       if (sign)
+           rshi(r1, r2, __WORDSIZE - 1);
+       else
+           movi(r1, 0);
+    }
+    else if (i0 == __WORDSIZE) {
+       movr(r1, r2);
+       movi(r0, 0);
+    }
+    else {
+       assert((jit_uword_t)i0 <= __WORDSIZE);
+       if (sign)
+           rshi(r1, r2, __WORDSIZE - i0);
+       else
+           rshi_u(r1, r2, __WORDSIZE - i0);
+       lshi(r0, r2, i0);
+    }
+}
+
+static void
+_xrshr(jit_state_t *_jit, jit_bool_t sign,
+       jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3)
+{
+    jit_int32_t                t0, s0, t2, s2, t3, s3;
+    s0 = jit_get_reg(jit_class_gpr);
+    t0 = rn(s0);
+    if (r0 == r2 || r1 == r2) {
+        s2 = jit_get_reg(jit_class_gpr);
+        t2 = rn(s2);
+        movr(t2, r2);
+    }
+    else
+        t2 = r2;
+    if (r0 == r3 || r1 == r3) {
+        s3 = jit_get_reg(jit_class_gpr);
+        t3 = rn(s3);
+        movr(t3, r3);
+    }
+    else
+        t3 = r3;
+
+    if (sign) {
+        /* underflow? */
+        eqi(t0, t3, __WORDSIZE);
+        subr(t0, t3, t0);
+        rshr(r0, t2, t0);
+    } else {
+        /* underflow? */
+        nei(t0, t3, __WORDSIZE);
+        rshr_u(r0, t2, t3);
+        movzr(r0, t0, t0);
+    }
+
+    rsbi(t0, t3, __WORDSIZE);
+    lshr(r1, t2, t0);
+
+    /* zero? */
+    movzr(r1, t3, t3);
+
+    jit_unget_reg(s0);
+    if (t2 != r2)
+        jit_unget_reg(s2);
+    if (t3 != r3)
+        jit_unget_reg(s3);
+}
+
+static void
+_xrshi(jit_state_t *_jit, jit_bool_t sign,
+       jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_word_t i0)
+{
+    if (i0 == 0) {
+       movr(r0, r2);
+       movi(r1, 0);
+    }
+    else if (i0 == __WORDSIZE) {
+       movr(r1, r2);
+       if (sign)
+           rshi(r0, r2, __WORDSIZE - 1);
+       else
+           movi(r0, 0);
+    }
+    else {
+       assert((jit_uword_t)i0 <= __WORDSIZE);
+       lshi(r1, r2, __WORDSIZE - i0);
+       if (sign)
+           rshi(r0, r2, i0);
+       else
+           rshi_u(r0, r2, i0);
+    }
+}
+
+static void
+_lrotr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
+{
+    jit_int32_t                reg;
+    if (jit_mips2_p()) {
+       if (r0 != r1 && r0 != r2) {
+           rsbi(r0, r2, __WORDSIZE);
+           rrotr(r0, r1, r0);
+       }
+       else {
+           reg = jit_get_reg(jit_class_gpr);
+           rsbi(rn(reg), r2, __WORDSIZE);
+           rrotr(r0, r1, rn(reg));
+           jit_unget_reg(reg);
+       }
+    }
+    else
+       fallback_lrotr(r0, r1, r2);
+}
+
+static void
+_rrotr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
+{
+    if (jit_mips2_p()) {
+#if __WORDSIZE == 32
+       ROTRV(r0, r1, r2);
+#else
+       DROTRV(r0, r1, r2);
+#endif
+    }
+    else
+       fallback_rrotr(r0, r1, r2);
+}
+
+static void
+_rroti(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
+{
+    assert(i0 >= 0 && i0 <= __WORDSIZE - 1);
+    if (jit_mips2_p()) {
+#if __WORDSIZE == 32
+       ROTR(r0, r1, i0);
+#else
+       if (i0 < 32)
+           DROTR(r0, r1, i0);
+       else
+           DROTR32(r0, r1, i0 - 32);
+#endif
+    }
+    else
+       fallback_rroti(r0, r1, i0);
+}
+
 static void
 _andi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
 {
@@ -2064,7 +2552,7 @@ _andi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
         ANDI(r0, r1, i0);
     else if (is_low_mask(i0)) {
         if (jit_mips2_p())
-            extr(r0, r1, 0, masked_bits_count(i0));
+            mips_extr(r0, r1, 0, masked_bits_count(i0));
         else {
             lshi(r0, r1, unmasked_bits_count(i0));
             rshi_u(r0, r0, unmasked_bits_count(i0));
@@ -2077,7 +2565,7 @@ _andi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
             lshi(r0, r0, unmasked_bits_count(i0));
         }
     } else if (jit_mips2_p() && is_middle_mask(i0)) {
-        extr(r0, r1, __builtin_ctzl(i0), masked_bits_count(i0));
+        mips_extr(r0, r1, __builtin_ctzl(i0), masked_bits_count(i0));
         lshi(r0, r0, __builtin_ctzl(i0));
     } else if (jit_mips2_p() && is_middle_mask(~i0)) {
         if (r0 != r1)
@@ -2136,6 +2624,28 @@ _movi(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0)
     else if (can_zero_extend_short_p(i0))
        ORI(r0, _ZERO_REGNO, i0);
     else {
+       /* Check if loading some constant reachable address */
+       if (jit_mips6_p()) {
+           jit_word_t          w, d;
+           w = i0 - (_jit->pc.w + (_jitc->inst.pend ? 4 : 0));
+#if !PCREL_BROKEN
+           if (!(i0 & 3)) {
+               d = w >> 2;
+               if (can_sign_extend_i19_p(d)) {
+                   ADDIUPC(r0, d);
+                   goto done;
+               }
+           }
+#endif
+           if (can_sign_extend_int_p(w)) {
+               jit_int32_t     lo = (jit_int32_t)w << 16 >> 16;
+               jit_int32_t     hi = w - lo;
+               AUIPC(r0, hi >> 16);
+               if (lo)
+                   addiu(r0, r0, lo);
+               goto done;
+           }
+       }
        if (can_sign_extend_int_p(i0))
            LUI(r0, i0 >> 16);
        else if (can_zero_extend_int_p(i0)) {
@@ -2159,6 +2669,7 @@ _movi(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0)
        if (i0 & 0xffff)
            ORI(r0, r0, i0);
     }
+done:;
 }
 
 static jit_word_t
@@ -2323,6 +2834,17 @@ static void
 _ldi_i(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0)
 {
     jit_int32_t                reg;
+#if !PCREL_BROKEN
+    if (jit_mips6_p()) {
+       jit_word_t      w;
+       assert(!(i0 & 3));
+       w = (i0 - (_jit->pc.w + (_jitc->inst.pend ? 4 : 0))) >> 2;
+       if (can_sign_extend_i19_p(w)) {
+           LWPC(r0, w);
+           goto done;
+       }
+    }
+#endif
     if (can_sign_extend_short_p(i0))
        LW(r0, i0, _ZERO_REGNO);
     else {
@@ -2331,6 +2853,9 @@ _ldi_i(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0)
        ldr_i(r0, rn(reg));
        jit_unget_reg(reg);
     }
+#if !PCREL_BROKEN
+done:;
+#endif
 }
 
 #if __WORDSIZE == 64
@@ -2338,6 +2863,17 @@ static void
 _ldi_ui(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0)
 {
     jit_int32_t                reg;
+#if !PCREL_BROKEN
+    if (jit_mips6_p()) {
+       jit_word_t      w;
+       assert(!(i0 & 3));
+       w = (i0 - (_jit->pc.w + (_jitc->inst.pend ? 4 : 0))) >> 2;
+       if (can_sign_extend_i19_p(w)) {
+           LWUPC(r0, w);
+           goto done;
+       }
+    }
+#endif
     if (can_sign_extend_short_p(i0))
        LWU(r0, i0, _ZERO_REGNO);
     else {
@@ -2346,12 +2882,24 @@ _ldi_ui(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0)
        ldr_ui(r0, rn(reg));
        jit_unget_reg(reg);
     }
+#if !PCREL_BROKEN
+done:;
+#endif
 }
 
 static void
 _ldi_l(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0)
 {
+    jit_word_t         w;
     jit_int32_t                reg;
+    if (jit_mips6_p()) {
+       assert(!(i0 & 7));
+       w = (i0 - (_jit->pc.w + (_jitc->inst.pend ? 4 : 0))) >> 3;
+       if (can_sign_extend_i18_p(w)) {
+           LDPC(r0, w);
+           goto done;
+       }
+    }
     if (can_sign_extend_short_p(i0))
        LD(r0, i0, _ZERO_REGNO);
     else {
@@ -2360,6 +2908,7 @@ _ldi_l(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0)
        ldr_l(r0, rn(reg));
        jit_unget_reg(reg);
     }
+done:;
 }
 #endif
 
@@ -2491,6 +3040,201 @@ _ldxi_l(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
 }
 #endif
 
+#if __WORDSIZE == 32
+#  define LOAD_LEFT                    LWL
+#  define LOAD_RIGHT                   LWR
+#else
+#  define LOAD_LEFT                    LDL
+#  define LOAD_RIGHT                   LDR
+#endif
+static void
+_unldr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
+{
+    jit_int32_t                t0, r2;
+    if (jit_unaligned_p()) {
+       assert(i0 >= 1 && i0 <= sizeof(jit_word_t));
+       if (i0 == 1)
+           ldr_c(r0, r1);
+       else {
+           if (r0 == r1) {
+               t0 = jit_get_reg(jit_class_gpr);
+               r2 = rn(t0);
+               movr(r2, r1);
+           }
+           else
+               r2 = r1;
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+           LOAD_LEFT(r0, sizeof(jit_word_t) - 1, r2);
+           LOAD_RIGHT(r0, 0, r2);
+#else
+           LOAD_LEFT(r0, 0, r2);
+           LOAD_RIGHT(r0, sizeof(jit_word_t) - 1, r2);
+#endif
+           if (r0 == r1)
+               jit_unget_reg(t0);
+           switch (i0) {
+               case 2:
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+                   extr_s(r0, r0);
+#else
+                   rshi(r0, r0, __WORDSIZE - 16);
+#endif
+                   break;
+               case 3:
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+                   lshi(r0, r0, __WORDSIZE - 24);
+#endif
+                   rshi(r0, r0, __WORDSIZE - 24);
+                   break;
+#if __WORDSIZE == 32
+               default:
+#else
+               case 4:
+#  if __BYTE_ORDER == __LITTLE_ENDIAN
+                   extr_i(r0, r0);
+#  else
+                   rshi(r0, r0, __WORDSIZE - 32);
+#  endif
+#endif
+                   break;
+#if __WORDSIZE == 64
+               case 5:
+#  if __BYTE_ORDER == __LITTLE_ENDIAN
+                   lshi(r0, r0, __WORDSIZE - 40);
+#  endif
+                   rshi(r0, r0, __WORDSIZE - 40);
+                   break;
+               case 6:
+#  if __BYTE_ORDER == __LITTLE_ENDIAN
+                   lshi(r0, r0, __WORDSIZE - 48);
+#  endif
+                   rshi(r0, r0, __WORDSIZE - 48);
+                   break;
+               case 7:
+#  if __BYTE_ORDER == __LITTLE_ENDIAN
+                   lshi(r0, r0, __WORDSIZE - 56);
+#  endif
+                   rshi(r0, r0, __WORDSIZE - 56);
+                   break;
+               default:
+                   break;
+#endif
+           }
+       }
+    }
+    else
+       generic_unldr(r0, r1, i0);
+}
+
+static void
+_unldi(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0, jit_word_t i1)
+{
+    jit_int32_t                t0;
+    if (jit_unaligned_p()) {
+       t0 = jit_get_reg(jit_class_gpr);
+       movi(rn(t0), i0);
+       unldr(r0, rn(t0), i1);
+       jit_unget_reg(t0);
+    }
+    else
+       generic_unldi(r0, i0, i1);
+}
+
+static void
+_unldr_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
+{
+    jit_int32_t                t0, r2;
+    if (jit_unaligned_p()) {
+       assert(i0 >= 1 && i0 <= sizeof(jit_word_t));
+       if (i0 == 1)
+           ldr_uc(r0, r1);
+       else {
+           if (r0 == r1) {
+               t0 = jit_get_reg(jit_class_gpr);
+               r2 = rn(t0);
+               movr(r2, r1);
+           }
+           else
+               r2 = r1;
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+           LOAD_LEFT(r0, sizeof(jit_word_t) - 1, r2);
+           LOAD_RIGHT(r0, 0, r2);
+#else
+           LOAD_LEFT(r0, 0, r2);
+           LOAD_RIGHT(r0, sizeof(jit_word_t) - 1, r2);
+#endif
+           if (r0 == r1)
+               jit_unget_reg(t0);
+           switch (i0) {
+               case 2:
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+                   extr_us(r0, r0);
+#else
+                   rshi_u(r0, r0, __WORDSIZE - 16);
+#endif
+                   break;
+               case 3:
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+                   lshi(r0, r0, __WORDSIZE - 24);
+#endif
+                   rshi_u(r0, r0, __WORDSIZE - 24);
+                   break;
+#if __WORDSIZE == 32
+               default:
+#else
+               case 4:
+#  if __BYTE_ORDER == __LITTLE_ENDIAN
+                   extr_ui(r0, r0);
+#  else
+                   rshi_u(r0, r0, __WORDSIZE - 32);
+#  endif
+#endif
+                   break;
+#if __WORDSIZE == 64
+               case 5:
+#  if __BYTE_ORDER == __LITTLE_ENDIAN
+                   lshi(r0, r0, __WORDSIZE - 40);
+#  endif
+                   rshi_u(r0, r0, __WORDSIZE - 40);
+                   break;
+               case 6:
+#  if __BYTE_ORDER == __LITTLE_ENDIAN
+                   lshi(r0, r0, __WORDSIZE - 48);
+#  endif
+                   rshi_u(r0, r0, __WORDSIZE - 48);
+                   break;
+               case 7:
+#  if __BYTE_ORDER == __LITTLE_ENDIAN
+                   lshi(r0, r0, __WORDSIZE - 56);
+#  endif
+                   rshi_u(r0, r0, __WORDSIZE - 56);
+                   break;
+               default:
+                   break;
+#endif
+           }
+       }
+    }
+    else
+       generic_unldr_u(r0, r1, i0);
+}
+#undef LOAD_LEFT
+#undef LOAD_RIGHT
+
+static void
+_unldi_u(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0, jit_word_t i1)
+{
+    jit_int32_t                t0;
+    if (jit_unaligned_p()) {
+       t0 = jit_get_reg(jit_class_gpr);
+       movi(rn(t0), i0);
+       unldr_u(r0, rn(t0), i1);
+       jit_unget_reg(t0);
+    }
+    else
+       generic_unldi_u(r0, i0, i1);
+}
+
 static void
 _sti_c(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0)
 {
@@ -2647,6 +3391,63 @@ _stxi_l(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1)
 }
 #endif
 
+static void
+_unstr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
+{
+    assert(i0 > 0 && i0 <= sizeof(jit_word_t));
+    if (jit_unaligned_p()) {
+       switch (i0) {
+           case 4:
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+               SWL(r1, 3, r0);
+               SWR(r1, 0, r0);
+#else
+               SWL(r1, 0, r0);
+               SWR(r1, 3, r0);
+#endif
+               break;
+#if __WORDSIZE == 64
+           case 8:
+#  if __BYTE_ORDER == __LITTLE_ENDIAN
+               SDL(r1, 7, r0);
+               SDR(r1, 0, r0);
+#  else
+               SDL(r1, 0, r0);
+               SDR(r1, 7, r0);
+#  endif
+               break;
+#endif
+           default:
+               /* FIXME Cost of loading memory contents, creating masks,
+                * and'ing, and or'ing values to use SW* or SD* might
+                * larger than using fallback. */
+                /* FIXME Probably not, and would be without branches. */
+               fallback_unstr(r0, r1, i0);
+               break;
+       }
+    }
+    else
+       generic_unstr(r0, r1, i0);
+}
+
+static void
+_unsti(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_word_t i1)
+{
+    jit_int32_t                reg;
+    if (jit_unaligned_p()) {
+       if (i1 == 4 || i1 == 8) {
+           reg = jit_get_reg(jit_class_gpr);
+           movi(rn(reg), i0);
+           unstr(rn(reg), r0, i1);
+           jit_unget_reg(reg);
+       }
+       else
+           fallback_unsti(i0, r0, i1);
+    }
+    else
+       generic_unsti(i0, r0, i1);
+}
+
 static void
 _bswapr_us(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
 {
@@ -2666,7 +3467,7 @@ _bswapr_ui(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
             SLL(r0, r1, 0);
             WSBH(r0, r0);
             ROTR(r0, r0, 16);
-            extr(r0, r0, 0, 32);
+            mips_extr(r0, r0, 0, 32);
         } else {
             WSBH(r0, r1);
             ROTR(r0, r0, 16);
@@ -2676,6 +3477,79 @@ _bswapr_ui(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
     }
 }
 
+#if __WORDSIZE == 64
+static void
+_bswapr_ul(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
+{
+    if (jit_mips2_p()) {
+       DSBH(r0, r1);
+       DSHD(r0, r0);
+    }
+    else
+       generic_bswapr_ul(_jit, r0, r1);
+}
+#endif
+
+static void
+_extr_u(jit_state_t *_jit,
+       jit_int32_t r0, jit_int32_t r1, jit_word_t i0, jit_word_t i1)
+{
+    assert(i0 >= 0 && i1 >= 1 && i0 + i1 <= __WORDSIZE);
+    if (jit_mips2_p()) {
+       if (i1 == __WORDSIZE)
+           movr(r0, r1);
+       else {
+#  if __BYTE_ORDER == __BIG_ENDIAN
+           i0 = __WORDSIZE - (i0 + i1);
+#  endif
+#if __WORDSIZE == 32
+           EXT(r0, r1, i0, i1);
+#else
+           if (i0 < 32 && i1 <= 32)
+               DEXT(r0, r1, i0, i1);
+           else if (i0 < 32 && i1 > 32)
+               DEXTM(r0, r1, i0, i1);
+           else {
+               assert(i0 >= 32 && i1 <= 32);
+               DEXTU(r0, r1, i0, i1);
+           }
+#endif
+       }
+    }
+    else
+       fallback_ext_u(r0, r1, i0, i1);
+}
+
+static void
+_depr(jit_state_t *_jit,
+      jit_int32_t r0, jit_int32_t r1, jit_word_t i0, jit_word_t i1)
+{
+    assert(i0 >= 0 && i1 >= 1 && i0 + i1 <= __WORDSIZE);
+    if (jit_mips2_p()) {
+       if (i1 == __WORDSIZE)
+           movr(r0, r1);
+       else {
+#  if __BYTE_ORDER == __BIG_ENDIAN
+           i0 = __WORDSIZE - (i0 + i1);
+#  endif
+#if __WORDSIZE == 32
+           INS(r0, r1, i0, i1);
+#else
+           if (i0 < 32 && i1 <= 32 && (i0 + i1) <= 32)
+               DINS(r0, r1, i0, i1);
+           else if (i0 < 32 && i1 >= 2 && (i0 + i1) > 32)
+               DINSM(r0, r1, i0, i1);
+           else {
+               assert(i0 >= 32 && i1 >= 1 && i1 <= 32 && (i0 + i1) > 32);
+               DINSU(r0, r1, i0, i1);
+           }
+#endif
+       }
+    }
+    else
+       fallback_dep(r0, r1, i0, i1);
+}
+
 static void
 _extr_c(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
 {
@@ -3139,6 +4013,18 @@ _jmpi(jit_state_t *_jit, jit_word_t i0, jit_bool_t patch)
 {
     jit_int32_t                op, t0;
     jit_word_t         w, disp;
+#if !BALC_BROKEN
+    if (jit_mips6_p() && !(i0 & 3)) {
+       disp = ((i0 - (_jit->pc.w + (_jitc->inst.pend ? 4 : 0))) >> 2) - 1;
+       if (patch || can_sign_extend_i26_p(disp)) {
+           flush();
+           w = _jit->pc.w;
+           /* Compact branch instructions do not have a delay slot */
+           BC_R6(disp);
+           goto done_without_delay;
+       }
+    }
+#endif
     /* try to get a pending instruction before the jump */
     t0 = jit_get_reg_for_delay_slot(jit_class_gpr, _ZERO_REGNO, _ZERO_REGNO);
     op = pending();
@@ -3163,6 +4049,9 @@ _jmpi(jit_state_t *_jit, jit_word_t i0, jit_bool_t patch)
 done:
     delay(op);
     jit_unget_reg(t0);
+#if !BALC_BROKEN
+done_without_delay:
+#endif
     return (w);
 }
 
@@ -3171,9 +4060,9 @@ _jmpi_p(jit_state_t *_jit, jit_word_t i0)
 {
     jit_word_t         w;
     jit_int32_t                op, t0;
-    /* make sure delay slot does not use _T9_REGNO */
-    t0 = jit_get_reg_for_delay_slot(jit_class_gpr|jit_class_chk,
-                                   _T9_REGNO, _ZERO_REGNO);
+    /* Get a register without side effects in delay slot */
+    t0 = jit_get_reg_for_delay_slot(jit_class_gpr, _ZERO_REGNO, _ZERO_REGNO);
+    /* Check for a instruction that can be executed in the delay slot */
     op = pending();
     /* implicit flush() */
     w = _jit->pc.w;
@@ -3181,8 +4070,7 @@ _jmpi_p(jit_state_t *_jit, jit_word_t i0)
     flush();                   /* movi_p will be patched */
     JR(rn(t0));
     delay(op);
-    if (t0 != JIT_NOREG)
-       jit_unget_reg(t0);
+    jit_unget_reg(t0);
     return (w);
 }
 
@@ -3764,8 +4652,20 @@ _calli(jit_state_t *_jit, jit_word_t i0, jit_bool_t patch)
     jit_int32_t                op, t0;
     jit_word_t         w, disp;
     w = _jit->pc.w;
+#if !BALC_BROKEN
+    if (jit_mips6_p() && !(i0 & 3)) {
+       disp = ((i0 - (w + (_jitc->inst.pend ? 4 : 0))) >> 2) - 1;
+       if (patch || can_sign_extend_i26_p(disp)) {
+           flush();
+           w = _jit->pc.w;
+           /* Compact branch instructions do not have a delay slot */
+           BALC(disp);
+           goto done;
+       }
+    }
+#endif
     if (jit_mips2_p()) {
-       disp = ((i0 - w) >> 2) - 1;
+       disp = ((i0 - (w + _jitc->inst.pend ? 4 : 0)) >> 2) - 1;
        if (patch || can_sign_extend_short_p(disp)) {
            op = pending();
            BGEZAL(_ZERO_REGNO, disp);  /* Renamed to BAL in mips release 6 */
@@ -4112,6 +5012,12 @@ _patch_at(jit_state_t *_jit, jit_word_t instr, jit_word_t label)
            u.i[0] = i.op;
            break;
 
+       case MIPS_BALC:                 case MIPS_BC_R6:
+           assert(jit_mips6_p());
+           i.ii.b = ((label - instr) >> 2) - 1;
+           u.i[0] = i.op;
+           break;
+
        default:
            assert(!"unhandled branch opcode");
            break;