sh2 drc: drc exit, block linking and branch handling revised
author    kub <derkub@gmail.com>
Sat, 28 Sep 2019 14:39:26 +0000 (16:39 +0200)
committer kub <derkub@gmail.com>
Sat, 28 Sep 2019 14:39:26 +0000 (16:39 +0200)
cpu/drc/emit_arm.c
cpu/drc/emit_arm64.c
cpu/drc/emit_mips.c
cpu/drc/emit_x86.c
cpu/sh2/compiler.c

cpu/drc/emit_arm.c
index 71a1092..b8c6419 100644
@@ -631,8 +631,8 @@ static void emith_pool_commit(int jumpover)
 static inline void emith_pool_check(void)
 {
        // check if pool must be committed
-       if (literal_iindex > MAX_HOST_LITERALS-4 ||
-                   (u8 *)tcache_ptr - (u8 *)literal_insn[0] > 0xe00)
+       if (literal_iindex > MAX_HOST_LITERALS-4 || (literal_pindex &&
+                   (u8 *)tcache_ptr - (u8 *)literal_insn[0] > 0xe00))
                // pool full, or displacement is approaching the limit
                emith_pool_commit(1);
 }
@@ -889,11 +889,19 @@ static inline void emith_pool_adjust(int pool_index, int move_offs)
 #define emith_tst_r_imm_c(cond, r, imm) \
        emith_top_imm(cond, A_OP_TST, r, imm)
 
-#define emith_move_r_imm_s8(r, imm) do { \
+#define emith_move_r_imm_s8_patchable(r, imm) do { \
+       emith_flush(); \
        if ((s8)(imm) < 0) \
-               EOP_MVN_IMM(r, 0, ((u8)(imm) ^ 0xff)); \
+               EOP_MVN_IMM(r, 0, (u8)~(imm)); \
        else \
-               EOP_MOV_IMM(r, 0, (u8)imm); \
+               EOP_MOV_IMM(r, 0, (u8)(imm)); \
+} while (0)
+#define emith_move_r_imm_s8_patch(ptr, imm) do { \
+       u32 *ptr_ = (u32 *)ptr; u32 op_ = *ptr_ & 0xfe1ff000; \
+       if ((s8)(imm) < 0) \
+               EMIT_PTR(ptr_, op_ | (A_OP_MVN<<21) | (u8)~(imm));\
+       else \
+               EMIT_PTR(ptr_, op_ | (A_OP_MOV<<21) | (u8)(imm));\
 } while (0)
 
 #define emith_and_r_r_imm(d, s, imm) \
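
Note: the patchable s8 move introduced here always emits exactly one MOV or MVN with an 8-bit immediate, so the constant can later be rewritten in place by emith_move_r_imm_s8_patch. A minimal standalone sketch of the encode/patch pair (the A_OP_* values are assumed to match emit_arm.c; the real macros additionally handle flushing and EMIT bookkeeping):

    #include <stdint.h>

    enum { A_OP_MOV = 0xd, A_OP_MVN = 0xf };  /* assumed, as in emit_arm.c */

    /* encode MOV/MVN rd, #imm8 (cond=AL, I=1, rotate=0) */
    static uint32_t arm_mov_imm_s8(int rd, int8_t imm)
    {
        uint32_t op  = imm < 0 ? A_OP_MVN : A_OP_MOV;
        uint32_t val = imm < 0 ? (uint8_t)~imm : (uint8_t)imm;
        return 0xe0000000 | (1u << 25) | (op << 21) | ((uint32_t)rd << 12) | val;
    }

    /* rewrite in place: keep cond/S/Rn/Rd (mask 0xfe1ff000), replace
       opcode and imm8, exactly as emith_move_r_imm_s8_patch does */
    static void arm_patch_imm_s8(uint32_t *insn, int8_t imm)
    {
        uint32_t op  = imm < 0 ? A_OP_MVN : A_OP_MOV;
        uint32_t val = imm < 0 ? (uint8_t)~imm : (uint8_t)imm;
        *insn = (*insn & 0xfe1ff000) | (op << 21) | val;
    }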
@@ -1125,7 +1133,6 @@ static inline void emith_pool_adjust(int pool_index, int move_offs)
 
 #define emith_jump_patchable(target) \
        emith_jump(target)
-#define emith_jump_patchable_size() 4
 
 #define emith_jump_cond(cond, target) \
        emith_xbranch(cond, target, 0)
@@ -1135,18 +1142,19 @@ static inline void emith_pool_adjust(int pool_index, int move_offs)
        emith_jump_cond(cond, target)
 
 #define emith_jump_patch(ptr, target, pos) do { \
-       u32 *ptr_ = ptr; \
+       u32 *ptr_ = (u32 *)ptr; \
        u32 val_ = (u32 *)(target) - ptr_ - 2; \
        *ptr_ = (*ptr_ & 0xff000000) | (val_ & 0x00ffffff); \
        if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)ptr; \
 } while (0)
+#define emith_jump_patch_inrange(ptr, target) !0
 #define emith_jump_patch_size() 4
 
 #define emith_jump_at(ptr, target) do { \
        u32 val_ = (u32 *)(target) - (u32 *)(ptr) - 2; \
-       emith_flush(); \
        EOP_C_B_PTR(ptr, A_COND_AL, 0, val_ & 0xffffff); \
 } while (0)
+#define emith_jump_at_size() 4
 
 #define emith_jump_reg_c(cond, r) \
        EOP_C_BX(cond, r)
@@ -1187,8 +1195,8 @@ static inline void emith_pool_adjust(int pool_index, int move_offs)
 #define emith_ret_to_ctx(offs) \
        emith_ctx_write(LR, offs)
 
-#define emith_add_r_ret_imm(r, imm) \
-       emith_add_r_r_ptr_imm(r, LR, imm)
+#define emith_add_r_ret(r) \
+       emith_add_r_r_ptr(r, LR)
 
 /* pushes r12 for eabi alignment */
 #define emith_push_ret(r) do { \
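
Note: emith_jump_patch_inrange can simply return true on ARM because B/BL reach ±32MB, more than the translation cache spans; patching then only rewrites the signed 24-bit word displacement in place. A standalone sketch of that rewrite (displacement is relative to PC = insn + 8, hence the -2 words):

    #include <stdint.h>

    /* patch an ARM B/BL/Bcc at insn to jump to target; keeps the cond
       and opcode bits, replaces the 24-bit word displacement */
    static void arm_patch_branch(uint32_t *insn, void *target)
    {
        uint32_t disp = (uint32_t *)target - insn - 2;
        *insn = (*insn & 0xff000000) | (disp & 0x00ffffff);
    }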
cpu/drc/emit_arm64.c
index 72f53dd..688649b 100644
@@ -447,6 +447,8 @@ enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE };
 #define emith_eor_r_r_r(d, s1, s2) \
        emith_eor_r_r_r_lsl(d, s1, s2, 0)
 
+#define emith_add_r_r_r_ptr(d, s1, s2) \
+       emith_add_r_r_r_lsl_ptr(d, s1, s2, 0)
 #define emith_and_r_r_r(d, s1, s2) \
        emith_and_r_r_r_lsl(d, s1, s2, 0)
 
@@ -546,6 +548,20 @@ static void emith_move_imm64(int r, int wx, int64_t imm)
 #define emith_move_r_imm_c(cond, r, imm) \
        emith_move_r_imm(r, imm)
 
+#define emith_move_r_imm_s8_patchable(r, imm) do { \
+       if ((s8)(imm) < 0) \
+               EMIT(A64_MOVN_IMM(r, ~(s8)(imm), 0)); \
+       else \
+               EMIT(A64_MOVZ_IMM(r, (s8)(imm), 0)); \
+} while (0)
+#define emith_move_r_imm_s8_patch(ptr, imm) do { \
+       u32 *ptr_ = (u32 *)ptr; \
+       int r_ = *ptr_ & 0x1f; \
+       if ((s8)(imm) < 0) \
+               EMIT_PTR(ptr_, A64_MOVN_IMM(r_, ~(s8)(imm), 0)); \
+       else \
+               EMIT_PTR(ptr_, A64_MOVZ_IMM(r_, (s8)(imm), 0)); \
+} while (0)
 
 // arithmetic, immediate
 static void emith_arith_imm(int op, int wx, int rd, int rn, s32 imm)
@@ -995,16 +1011,6 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode)
        emith_move_r_imm(arg, imm)
 
 // branching; NB: A64 B.cond has only +/- 1MB range
-#define emith_bcond(ptr, patch, cond, target) do { \
-       u32 disp_ = (u8 *)target - (u8 *)ptr; \
-       if (disp_ >= 0xfff00000 || disp_ <= 0x000fffff) { /* can use near B.c */ \
-               EMIT_PTR(ptr, A64_BCOND(cond, disp_ & 0x001fffff)); \
-               if (patch) EMIT_PTR(ptr, A64_NOP); /* reserve space for far B */ \
-       } else { /* far branch if near branch isn't possible */ \
-               EMIT_PTR(ptr, A64_BCOND(emith_invert_cond(cond), 8)); \
-               EMIT_PTR(ptr, A64_B((disp_ - 4) & 0x0fffffff)); \
-       } \
-} while (0)
 
 #define emith_jump(target) do {\
        u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \
@@ -1013,30 +1019,37 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode)
 
 #define emith_jump_patchable(target) \
        emith_jump(target)
-#define emith_jump_patchable_size() 4
 
-#define emith_jump_cond(cond, target) \
-       emith_bcond(tcache_ptr, 0, cond, target)
+#define emith_jump_cond(cond, target) do { \
+       u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \
+       EMIT(A64_BCOND(cond, disp_ & 0x001fffff)); \
+} while (0)
 
 #define emith_jump_cond_patchable(cond, target) \
-       emith_bcond(tcache_ptr, 1, cond, target)
+       emith_jump_cond(cond, target)
 
 #define emith_jump_cond_inrange(target) \
        !(((u8 *)target - (u8 *)tcache_ptr + 0x100000) >> 21)
 
 #define emith_jump_patch(ptr, target, pos) do { \
        u32 *ptr_ = (u32 *)ptr; \
-       u32 disp_ = (u8 *)(target) - (u8 *)(ptr_); \
-       int cond_ = ptr_[0] & 0xf; \
-       if ((ptr_[0] & 0xff000000) == 0x54000000) { /* B.cond */ \
-               if (ptr_[1] != A64_NOP) cond_ = emith_invert_cond(cond_); \
-               emith_bcond(ptr_, 1, cond_, target); \
-       } else if (ptr_[0] & 0x80000000) \
-               EMIT_PTR(ptr_, A64_BL((disp_) & 0x0fffffff)); \
-       else    EMIT_PTR(ptr_, A64_B((disp_) & 0x0fffffff)); \
-       if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)ptr; \
+       u32 disp_ = (u8 *)target - (u8 *)ptr, mask_; \
+       if ((*ptr_ & 0xff000000) == 0x54000000) \
+               mask_ = 0xff00001f, disp_ <<= 5; /* B.cond, range 21 bit */ \
+       else    mask_ = 0xfc000000;              /* B[L], range 28 bit */ \
+       EMIT_PTR(ptr_, (*ptr_ & mask_) | ((disp_ >> 2) & ~mask_)); \
+       if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)(ptr_-1); \
+} while (0)
+
+#define emith_jump_patch_inrange(ptr, target) \
+       !(((u8 *)target - (u8 *)ptr + 0x100000) >> 21)
+#define emith_jump_patch_size()        4
+
+#define emith_jump_at(ptr, target) do { \
+       u32 disp_ = (u8 *)target - (u8 *)ptr; \
+       EMIT_PTR(ptr, A64_B(disp_ & 0x0fffffff)); \
 } while (0)
-#define emith_jump_patch_size()        8
+#define emith_jump_at_size() 4
 
 #define emith_jump_reg(r) \
        EMIT(A64_BR(r))
@@ -1079,8 +1092,8 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode)
 #define emith_ret_to_ctx(offs) \
        emith_ctx_write_ptr(LR, offs)
 
-#define emith_add_r_ret_imm(r, imm) \
-       emith_add_r_r_ptr_imm(r, LR, imm)
+#define emith_add_r_ret(r) \
+       emith_add_r_r_r_ptr(r, LR, r)
 
 // NB: pushes r or r18 for SP hardware alignment
 #define emith_push_ret(r) do { \
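
Note: with emith_bcond gone, a conditional branch is now always a single B.cond, and the compiler is expected to check reachability up front via emith_jump_cond_inrange/emith_jump_patch_inrange (falling back to the blx area otherwise). A sketch of both pieces, matching the masks used in the macros above:

    #include <stdint.h>

    /* B.cond reaches +/-1MB: signed 21-bit byte displacement */
    static int a64_bcond_inrange(void *from, void *to)
    {
        intptr_t disp = (uint8_t *)to - (uint8_t *)from;
        return disp >= -0x100000 && disp < 0x100000;
    }

    /* patch either B.cond (imm19 in bits 5..23) or B/BL (imm26 in bits 0..25) */
    static void a64_patch_branch(uint32_t *insn, void *target)
    {
        uint32_t disp = (uint8_t *)target - (uint8_t *)insn, mask;
        if ((*insn & 0xff000000) == 0x54000000)
            mask = 0xff00001f, disp <<= 5;   /* B.cond */
        else
            mask = 0xfc000000;               /* B[L] */
        *insn = (*insn & mask) | ((disp >> 2) & ~mask);
    }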
cpu/drc/emit_mips.c
index 6ff134d..ad02ff2 100644
@@ -285,7 +285,7 @@ static int emith_b_isswap(u32 bop, u32 lop)
                return bop;
        else if (emith_is_b(bop) &&  emith_rd(lop) != emith_rs(bop))
                if ((bop & 0xffff) != 0x7fff)   // displacement overflow?
-                       return (bop & 0xffff0000) | ((bop & 0xffff)+1);
+                       return (bop & 0xffff0000) | ((bop+1) & 0x0000ffff);
        return 0;
 }
 
@@ -332,14 +332,14 @@ static void *emith_branch(u32 op)
 
 #define JMP_EMIT(cond, ptr) { \
        u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr) - 4; \
-       EMIT_PTR(ptr, MIPS_BCONDZ(cond_m, cond_r, val_ & 0x0003ffff)); \
        emith_flush(); /* NO delay slot handling across jump targets */ \
+       EMIT_PTR(ptr, MIPS_BCONDZ(cond_m, cond_r, val_ & 0x0003ffff)); \
 }
 
 #define JMP_EMIT_NC(ptr) { \
        u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr) - 4; \
-       EMIT_PTR(ptr, MIPS_B(val_ & 0x0003ffff)); \
        emith_flush(); \
+       EMIT_PTR(ptr, MIPS_B(val_ & 0x0003ffff)); \
 }
 
 #define EMITH_JMP_START(cond) { \
@@ -645,6 +645,13 @@ static void emith_move_imm(int r, uintptr_t imm)
 #define emith_move_r_imm_c(cond, r, imm) \
        emith_move_r_imm(r, imm)
 
+#define emith_move_r_imm_s8_patchable(r, imm) \
+       EMIT(MIPS_ADD_IMM(r, Z0, (s8)(imm)))
+#define emith_move_r_imm_s8_patch(ptr, imm) do { \
+       u32 *ptr_ = (u32 *)ptr; \
+       while (*ptr_ >> 26 != OP_ADDIU) ptr_++; \
+       EMIT_PTR(ptr_, (*ptr_ & 0xffff0000) | (u16)(s8)(imm)); \
+} while (0)
 
 // arithmetic, immediate
 static void emith_arith_imm(int op, int rd, int rs, u32 imm)
@@ -1162,41 +1169,44 @@ static int emith_cond_check(int cond, int *r)
        emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff))
 #define emith_jump_patchable(target) \
        emith_jump(target)
-#define emith_jump_patchable_size() 8 /* J+delayslot */
 
 // NB: MIPS conditional branches have only +/- 128KB range
 #define emith_jump_cond(cond, target) do { \
        int r_, mcond_ = emith_cond_check(cond, &r_); \
        u32 disp_ = (u8 *)target - (u8 *)tcache_ptr - 4; \
-       if (disp_ >= 0xfffe0000 || disp_ <= 0x0001ffff) { /* can use near B */ \
-               emith_branch(MIPS_BCONDZ(mcond_,r_,disp_ & 0x0003ffff)); \
-       } else { /* far branch if near branch isn't possible */ \
-               mcond_ = emith_invert_branch(mcond_); \
-               u8 *bp = emith_branch(MIPS_BCONDZ(mcond_, r_, 0)); \
-               emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)); \
-               EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, (u8 *)tcache_ptr-bp-4)); \
-       } \
+       emith_branch(MIPS_BCONDZ(mcond_,r_,disp_ & 0x0003ffff)); \
 } while (0)
-#define emith_jump_cond_inrange(target) \
-       !(((u8 *)target - (u8 *)tcache_ptr + 0x20000) >> 18)
+#define emith_jump_cond_patchable(cond, target) \
+       emith_jump_cond(cond, target)
 
-#define emith_jump_cond_patchable(cond, target) do { \
-       int r_, mcond_ = emith_cond_check(cond, &r_); \
-       mcond_ = emith_invert_branch(mcond_); \
-       u8 *bp = emith_branch(MIPS_BCONDZ(mcond_, r_, 0));\
-       emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)); \
-       EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, (u8 *)tcache_ptr-bp-4)); \
-} while (0)
+#define emith_jump_cond_inrange(target) \
+       ((u8 *)target - (u8 *)tcache_ptr - 4 <  0x00020000U || \
+        (u8 *)target - (u8 *)tcache_ptr - 4 >= 0xfffe0010U) // mind cond_check
 
 // NB: returns position of patch for cache maintenance
 #define emith_jump_patch(ptr, target, pos) do { \
        u32 *ptr_ = (u32 *)ptr-1; /* must skip condition check code */ \
-       while ((ptr_[0] & 0xf8000000) != OP_J << 26) ptr_ ++; \
-       EMIT_PTR(ptr_, MIPS_J((uintptr_t)target & 0x0fffffff)); \
+       u32 disp_, mask_; \
+       while (!emith_is_j(*ptr_) && !emith_is_b(*ptr_)) ptr_ ++; \
+       if (emith_is_b(*ptr_)) \
+               mask_ = 0xffff0000, disp_ = (u8 *)target - (u8 *)ptr_ - 4; \
+       else    mask_ = 0xfc000000, disp_ = (uintptr_t)target; \
+       EMIT_PTR(ptr_, (*ptr_ & mask_) | ((disp_ >> 2) & ~mask_)); \
        if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)(ptr_-1); \
 } while (0)
+
+#define emith_jump_patch_inrange(ptr, target) \
+       ((u8 *)target - (u8 *)ptr - 4 <  0x00020000U || \
+        (u8 *)target - (u8 *)ptr - 4 >= 0xfffe0010U) // mind cond_check
 #define emith_jump_patch_size() 4
 
+#define emith_jump_at(ptr, target) do { \
+       u32 *ptr_ = (u32 *)ptr; \
+       EMIT_PTR(ptr_, MIPS_J((uintptr_t)target & 0x0fffffff)); \
+       EMIT_PTR(ptr_, MIPS_NOP); \
+} while (0)
+#define emith_jump_at_size() 8
+
 #define emith_jump_reg(r) \
        emith_branch(MIPS_JR(r))
 #define emith_jump_reg_c(cond, r) \
@@ -1232,8 +1242,8 @@ static int emith_cond_check(int cond, int *r)
 #define emith_ret_to_ctx(offs) \
        emith_ctx_write_ptr(LR, offs)
 
-#define emith_add_r_ret_imm(r, imm) \
-       emith_add_r_r_ptr_imm(r, LR, imm)
+#define emith_add_r_ret(r) \
+       emith_add_r_r_ptr(r, LR)
 
 // NB: ABI SP alignment is 8 for compatibility with MIPS IV
 #define emith_push_ret(r) do { \
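
Note: emith_jump_patch now handles both instruction forms that can sit at a branch site: PC-relative B* (signed 16-bit word offset from the delay slot) and region-absolute J/JAL (26-bit word index within the 256MB segment). A standalone sketch, with a plain opcode test standing in for emith_is_b/emith_is_j:

    #include <stdint.h>

    static void mips_patch_branch(uint32_t *insn, void *target)
    {
        uint32_t op = *insn >> 26, disp, mask;
        if (op == 0x02 || op == 0x03) {   /* J/JAL: 26-bit index, 256MB region */
            mask = 0xfc000000;
            disp = (uintptr_t)target;
        } else {                          /* B*: 16-bit offset from delay slot */
            mask = 0xffff0000;
            disp = (uint8_t *)target - (uint8_t *)insn - 4;
        }
        *insn = (*insn & mask) | ((disp >> 2) & ~mask);
    }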
cpu/drc/emit_x86.c
index d8b3a2d..451fa8d 100644
@@ -371,8 +371,16 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common
        } \
 } while (0)
 
-#define emith_move_r_imm_s8(r, imm) \
-       emith_move_r_imm(r, (u32)(signed int)(signed char)(imm))
+#define emith_move_r_imm_s8_patchable(r, imm) do { \
+       EMIT_REX_IF(0, 0, r); \
+       EMIT_OP(0xb8 + ((r)&7)); \
+       EMIT((s8)(imm), u32); \
+} while (0)
+#define emith_move_r_imm_s8_patch(ptr, imm) do { \
+       u8 *ptr_ = ptr; \
+       while ((*ptr_ & 0xf8) != 0xb8) ptr_++; \
+       EMIT_PTR(ptr_ + 1, (s8)(imm), u32); \
+} while (0)
 
 #define emith_arith_r_imm(op, r, imm) do { \
        EMIT_REX_IF(0, 0, r); \
@@ -851,7 +859,6 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI,  // x86-64,i386 common
 
 #define emith_jump_patchable(target) \
        emith_jump(target)
-#define emith_jump_patchable_size() 5 /* JMP rel32 */
 
 #define emith_jump_cond(cond, ptr) do { \
        u32 disp = (u8 *)(ptr) - ((u8 *)tcache_ptr + 6); \
@@ -867,15 +874,17 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI,        // x86-64,i386 common
        u32 disp_ = (u8 *)(target) - ((u8 *)(ptr) + 4); \
        u32 offs_ = (*(u8 *)(ptr) == 0x0f) ? 2 : 1; \
        EMIT_PTR((u8 *)(ptr) + offs_, disp_ - offs_, u32); \
-       if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)ptr; \
+       if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)ptr + offs_; \
 } while (0)
-#define emith_jump_patch_size() 6
+#define emith_jump_patch_size() 4
+#define emith_jump_patch_inrange(ptr, target) !0
 
 #define emith_jump_at(ptr, target) do { \
        u32 disp_ = (u8 *)(target) - ((u8 *)(ptr) + 5); \
        EMIT_PTR(ptr, 0xe9, u8); \
        EMIT_PTR((u8 *)(ptr) + 1, disp_, u32); \
 } while (0)
+#define emith_jump_at_size() 5
 
 #define emith_call(ptr) do { \
        u32 disp = (u8 *)(ptr) - ((u8 *)tcache_ptr + 5); \
@@ -900,9 +909,9 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI,  // x86-64,i386 common
 #define emith_ret() \
        EMIT_OP(0xc3)
 
-#define emith_add_r_ret_imm(r, imm) do { \
-       emith_read_r_r_offs_ptr(r, xSP, 0); \
-       emith_add_r_r_ptr_imm(r, r, imm); \
+#define emith_add_r_ret(r) do { \
+       EMIT_REX_IF(1, r, xSP); \
+       emith_deref_modrm(0x03, 0, r, xSP); /* add r, [xsp] */ \
 } while (0)
 
 #define emith_jump_reg(r) \
@@ -974,7 +983,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI,  // x86-64,i386 common
        emith_move_r_imm(rd, imm); \
 } while (0)
 
-#define host_instructions_updated(base, end)
+#define host_instructions_updated(base, end)   (void)(base),(void)(end)
 #define        emith_update_cache()    /**/
 
 #define emith_rw_offs_max()    0xffffffff
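
Note: on x86 the patchable s8 move is a plain MOV r32, imm32 (opcode B8+r, possibly behind a REX prefix), so patching scans past the prefix for the opcode byte and rewrites the 32-bit immediate, as the macro above does. A minimal sketch:

    #include <stdint.h>
    #include <string.h>

    static void x86_patch_mov_imm_s8(uint8_t *p, int8_t imm)
    {
        int32_t val = imm;                /* sign-extend to imm32 */
        while ((*p & 0xf8) != 0xb8)       /* skip a possible REX prefix */
            p++;
        memcpy(p + 1, &val, sizeof(val)); /* x86 tolerates unaligned stores */
    }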
cpu/sh2/compiler.c
index ec8554c..932f21c 100644
@@ -69,7 +69,7 @@
 // 800 - state dump on exit
 // {
 #ifndef DRC_DEBUG
-#define DRC_DEBUG 0//x8e7
+#define DRC_DEBUG 0//x8c7
 #endif
 
 #if DRC_DEBUG
@@ -288,15 +288,19 @@ static u8 *tcache_ptr;
 
 #define MAX_BLOCK_ENTRIES (BLOCK_INSN_LIMIT / 6)
 
+enum { BL_JMP=1, BL_LDJMP, BL_JCCBLX };
 struct block_link {
+  short tcache_id;
+  short type;                // BL_JMP et al
   u32 target_pc;
   void *jump;                // insn address
+  void *blx;                 // block link/exit  area if any
+  u8 jdisp[8];               // jump backup buffer
   struct block_link *next;   // either in block_entry->links or unresolved
   struct block_link *o_next; //     ...in block_entry->o_links
   struct block_link *prev;
   struct block_link *o_prev;
   struct block_entry *target;// target block this is linked in (be->links)
-  int tcache_id;
 };
 
 struct block_entry {
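
Note: the new jdisp buffer keeps a verbatim copy of the code originally emitted at the link site, so dr_block_unlink can restore the "load pc, jump to dispatcher" sequence without regenerating it. The mechanism reduces to two memcpys (sketch; the size is host-specific, emith_jump_at_size() in the real code):

    #include <stdint.h>
    #include <string.h>

    struct link_site {
        uint8_t *jump;     /* patch site in the translation cache */
        uint8_t  jdisp[8]; /* backup of the original code there */
    };

    /* taken right after emitting the exit code, before any linking */
    static void link_backup(struct link_site *bl, int jump_at_size)
    {
        memcpy(bl->jdisp, bl->jump, jump_at_size);
    }

    /* on unlink: put the original dispatcher exit back */
    static void link_restore(struct link_site *bl, int jump_at_size)
    {
        memcpy(bl->jump, bl->jdisp, jump_at_size);
    }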
@@ -686,18 +690,24 @@ static int dr_ctx_get_mem_ptr(SH2 *sh2, u32 a, u32 *mask)
   return poffs;
 }
 
-static struct block_entry *dr_get_entry(u32 pc, int is_slave, int *tcache_id)
+static int dr_get_tcache_id(u32 pc, int is_slave)
 {
-  struct block_entry *be;
   u32 tcid = 0;
  
   if ((pc & 0xe0000000) == 0xc0000000)
     tcid = 1 + is_slave; // data array
   if ((pc & ~0xfff) == 0)
     tcid = 1 + is_slave; // BIOS
-  *tcache_id = tcid;
+  return tcid;
+}
 
-  be = HASH_FUNC(hash_tables[tcid], pc, HASH_TABLE_SIZE(tcid) - 1);
+static struct block_entry *dr_get_entry(u32 pc, int is_slave, int *tcache_id)
+{
+  struct block_entry *be;
+  *tcache_id = dr_get_tcache_id(pc, is_slave);
+
+  be = HASH_FUNC(hash_tables[*tcache_id], pc, HASH_TABLE_SIZE(*tcache_id) - 1);
   if (be != NULL) // don't ask... gcc code generation hint
   for (; be != NULL; be = be->next)
     if (be->pc == pc)
@@ -1101,17 +1111,11 @@ static struct block_desc *dr_add_block(u32 addr, int size,
   bd->size_lit = size_lit;
   bd->tcache_ptr = tcache_ptr;
   bd->crc = crc;
-  bd->active = 1;
-
-  bd->entry_count = 1;
-  bd->entryp[0].pc = addr;
-  bd->entryp[0].tcache_ptr = tcache_ptr;
-  bd->entryp[0].links = bd->entryp[0].o_links = NULL;
+  bd->active = 0;
+  bd->entry_count = 0;
 #if (DRC_DEBUG & 2)
-  bd->entryp[0].block = bd;
   bd->refcount = 0;
 #endif
-  add_to_hashlist(&bd->entryp[0], tcache_id);
 
   *blk_id = *bcount;
   (*bcount)++;
@@ -1150,11 +1154,33 @@ static void dr_block_link(struct block_entry *be, struct block_link *bl, int emi
     bl->jump, bl->target_pc, be->tcache_ptr);
 
   if (emit_jump) {
-    u8 *jump;
-    emith_jump_patch(bl->jump, be->tcache_ptr, &jump);
+    u8 *jump = bl->jump;
+    int jsz = emith_jump_patch_size();
+    if (bl->type == BL_JMP) { // patch: jump @entry
+      // inlined: @jump far jump to target
+      emith_jump_patch(jump, be->tcache_ptr, &jump);
+    } else if (bl->type == BL_LDJMP) { // write: jump @entry
+      // inlined: @jump far jump to target
+      emith_jump_at(jump, be->tcache_ptr);
+      jsz = emith_jump_at_size();
+    } else if (bl->type == BL_JCCBLX) { // patch: jump cond -> jump @entry
+      if (emith_jump_patch_inrange(bl->jump, be->tcache_ptr)) {
+        // inlined: @jump near jumpcc to target
+        emith_jump_patch(jump, be->tcache_ptr, &jump);
+      } else { // dispatcher cond immediate
+        // via blx: @jump near jumpcc to blx; @blx far jump
+        emith_jump_patch(jump, bl->blx, &jump);
+        emith_jump_at(bl->blx, be->tcache_ptr);
+        if ((((uintptr_t)bl->blx & 0xf) + emith_jump_at_size()-1) > 0xf)
+          host_instructions_updated(bl->blx, bl->blx + emith_jump_at_size()-1);
+      }
+    } else {
+      printf("unknown BL type %d\n", bl->type);
+      exit(1);
+    }
     // only needs sync if patch is possibly crossing cacheline (assume 16 byte)
-    if ((uintptr_t)jump >>4 != ((uintptr_t)jump+emith_jump_patch_size()-1) >>4)
-      host_instructions_updated(jump, jump+emith_jump_patch_size());
+    if ((((uintptr_t)jump & 0xf) + jsz-1) > 0xf)
+      host_instructions_updated(jump, jump + jsz-1);
   }
 
   // move bl to block_entry
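
Note: the sync condition above replaces the old shift-based compare with an equivalent test for the patched bytes spilling past a (presumed 16-byte) cacheline boundary:

    #include <stdint.h>

    /* true if [jump, jump+jsz) is not contained in one 16-byte line,
       i.e. an icache/dcache sync may have to touch two lines */
    static int patch_crosses_line(void *jump, int jsz)
    {
        return (((uintptr_t)jump & 0xf) + jsz - 1) > 0xf;
    }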
@@ -1172,10 +1198,26 @@ static void dr_block_unlink(struct block_link *bl, int emit_jump)
 
   if (bl->target) {
     if (emit_jump) {
-      u8 *jump;
-      emith_jump_patch(bl->jump, sh2_drc_dispatcher, &jump);
+      u8 *jump = bl->jump;
+      int jsz = emith_jump_patch_size();
+      if (bl->type == BL_JMP) { // jump_patch @dispatcher
+        // inlined: @jump far jump to dispatcher
+        emith_jump_patch(jump, sh2_drc_dispatcher, &jump);
+      } else if (bl->type == BL_LDJMP) { // restore: load pc, jump @dispatcher
+        // inlined: @jump load target_pc, far jump to dispatcher
+        memcpy(jump, bl->jdisp, emith_jump_at_size());
+        jsz = emith_jump_at_size();
+      } else if (bl->type == BL_JCCBLX) { // jump cond @blx; @blx: load pc, jump
+        // via blx: @jump near jumpcc to blx; @blx load target_pc, far jump
+        emith_jump_patch(bl->jump, bl->blx, &jump);
+        memcpy(bl->blx, bl->jdisp, emith_jump_at_size());
+        host_instructions_updated(bl->blx, bl->blx + emith_jump_at_size()-1);
+      } else {
+        printf("unknown BL type %d\n", bl->type);
+        exit(1);
+      }
       // update cpu caches since the previous jump target doesn't exist anymore
-      host_instructions_updated(jump, jump+emith_jump_patch_size());
+      host_instructions_updated(jump, jump + jsz-1);
     }
 
     if (bl->prev)
@@ -1189,18 +1231,17 @@ static void dr_block_unlink(struct block_link *bl, int emit_jump)
 }
 #endif
 
-static void *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_slave, int tcache_id)
+static struct block_link *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_slave, int tcache_id)
 {
 #if LINK_BRANCHES
   struct block_link *bl = block_link_pool[tcache_id];
   int cnt = block_link_pool_counts[tcache_id];
-  struct block_entry *be = NULL;
   int target_tcache_id;
 
   // get the target block entry
-  be = dr_get_entry(pc, is_slave, &target_tcache_id);
+  target_tcache_id = dr_get_tcache_id(pc, is_slave);
   if (target_tcache_id && target_tcache_id != tcache_id)
-    return sh2_drc_dispatcher;
+    return NULL;
 
   // get a block link
   if (blink_free[tcache_id] != NULL) {
@@ -1208,29 +1249,24 @@ static void *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_sla
     blink_free[tcache_id] = bl->next;
   } else if (cnt >= BLOCK_LINK_MAX_COUNT(tcache_id)) {
     dbg(1, "bl overflow for tcache %d", tcache_id);
-    return sh2_drc_dispatcher;
+    return NULL;
   } else {
     bl += cnt;
     block_link_pool_counts[tcache_id] = cnt+1;
   }
 
-  // prepare link and add to ougoing list of owner
+  // prepare link and add to outgoing list of owner
   bl->tcache_id = tcache_id;
   bl->target_pc = pc;
   bl->jump = tcache_ptr;
+  bl->blx = NULL;
   bl->o_next = owner->o_links;
   owner->o_links = bl;
 
-  if (be != NULL) {
-    dr_block_link(be, bl, 0); // jump not yet emitted by translate()
-    return be->tcache_ptr;
-  }
-  else {
-    add_to_hashlist_unresolved(bl, tcache_id);
-    return sh2_drc_dispatcher;
-  }
+  add_to_hashlist_unresolved(bl, tcache_id);
+  return bl;
 #else
-  return sh2_drc_dispatcher;
+  return NULL;
 #endif
 }
 
@@ -1272,6 +1308,27 @@ static void dr_link_outgoing(struct block_entry *be, int tcache_id, int is_slave
 #endif
 }
 
+static void dr_activate_block(struct block_desc *bd, int tcache_id, int is_slave)
+{
+  int i;
+
+  // connect branches
+  for (i = 0; i < bd->entry_count; i++) {
+    struct block_entry *entry = &bd->entryp[i];
+    add_to_hashlist(entry, tcache_id);
+    // incoming branches
+    dr_link_blocks(entry, tcache_id);
+    if (!tcache_id)
+      dr_link_blocks(entry, is_slave?2:1);
+    // outgoing branches
+    dr_link_outgoing(entry, tcache_id, is_slave);
+  }
+
+  // mark memory for overwrite detection
+  dr_mark_memory(1, bd, tcache_id, 0);
+  bd->active = 1;
+}
+
 #define ADD_TO_ARRAY(array, count, item, failcode) { \
   if (count >= ARRAY_SIZE(array)) { \
     dbg(1, "warning: " #array " overflow"); \
@@ -2422,6 +2479,7 @@ static void rcache_invalidate(void)
 {
   int i;
   gconst_invalidate();
+  rcache_unlock_all();
 
   for (i = 0; i < ARRAY_SIZE(cache_regs); i++)
     rcache_free_vreg(i);
@@ -2446,7 +2504,6 @@ static void rcache_invalidate(void)
 
 static void rcache_flush(void)
 {
-  rcache_unlock_all();
   rcache_clean();
   rcache_invalidate();
 }
@@ -2916,13 +2973,22 @@ static void *dr_get_pc_base(u32 pc, SH2 *sh2);
 
 static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id)
 {
+  // branch targets in current block
   u32 branch_target_pc[MAX_LOCAL_BRANCHES];
   void *branch_target_ptr[MAX_LOCAL_BRANCHES];
   int branch_target_count = 0;
-  void *branch_patch_ptr[MAX_LOCAL_BRANCHES];
+  // unresolved local forward branches, for fixup at block end
   u32 branch_patch_pc[MAX_LOCAL_BRANCHES];
+  void *branch_patch_ptr[MAX_LOCAL_BRANCHES];
   int branch_patch_count = 0;
+  // external branch targets with a block link/exit area
+  u32 blx_target_pc[MAX_LOCAL_BRANCHES];
+  void *blx_target_ptr[MAX_LOCAL_BRANCHES];
+  struct block_link *blx_target_bl[MAX_LOCAL_BRANCHES];
+  int blx_target_count = 0;
+
   u8 op_flags[BLOCK_INSN_LIMIT];
+
   struct drcf {
     int delay_reg:8;
     u32 loop_type:8;
@@ -2931,9 +2997,12 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id)
     u32 pending_branch_direct:1;
     u32 pending_branch_indirect:1;
   } drcf = { 0, };
+
 #if LOOP_OPTIMIZER
-  void *pinned_loop_ptr[MAX_LOCAL_BRANCHES/16];
+  // loops with pinned registers for optimization
+  // pinned regs are like statics and don't need saving/restoring inside a loop
   u32 pinned_loop_pc[MAX_LOCAL_BRANCHES/16];
+  void *pinned_loop_ptr[MAX_LOCAL_BRANCHES/16];
   u32 pinned_loop_mask[MAX_LOCAL_BRANCHES/16];
   int pinned_loop_count = 0;
 #endif
@@ -2976,24 +3045,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id)
     base_literals, end_literals - base_literals);
 
   if (block) {
-    // connect branches
     dbg(2, "== %csh2 reuse block %08x-%08x,%08x-%08x -> %p", sh2->is_slave ? 's' : 'm',
       base_pc, end_pc, base_literals, end_literals, block->entryp->tcache_ptr);
-    for (i = 0; i < block->entry_count; i++) {
-      entry = &block->entryp[i];
-      add_to_hashlist(entry, tcache_id);
-#if LINK_BRANCHES
-      // incoming branches
-      dr_link_blocks(entry, tcache_id);
-      if (!tcache_id)
-        dr_link_blocks(entry, sh2->is_slave?2:1);
-      // outgoing branches
-      dr_link_outgoing(entry, tcache_id, sh2->is_slave);
-#endif
-    }
-    // mark memory for overwrite detection
-    dr_mark_memory(1, block, tcache_id, 0);
-    block->active = 1;
+    dr_activate_block(block, tcache_id, sh2->is_slave);
     emith_update_cache();
     return block->entryp[0].tcache_ptr;
   }
@@ -3069,7 +3123,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id)
         if (op_flags[v] & OF_BASIC_LOOP) {
           m3 &= ~rcache_regs_static & ~BITMASK4(SHR_PC, SHR_PR, SHR_SR, SHR_MEM);
           if (m3 && count_bits(m3) < count_bits(rcache_hregs_reg) &&
-              pinned_loop_count < ARRAY_SIZE(pinned_loop_pc)) {
+              pinned_loop_count < ARRAY_SIZE(pinned_loop_pc)-1) {
             pinned_loop_mask[pinned_loop_count] = m3;
             pinned_loop_pc[pinned_loop_count++] = base_pc + 2*v;
           } else
@@ -3080,6 +3134,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id)
     }
 #endif
   }
+  pinned_loop_pc[pinned_loop_count] = -1;
 
   if (branch_target_count > 0) {
     memset(branch_target_ptr, 0, sizeof(branch_target_ptr[0]) * branch_target_count);
@@ -3101,7 +3156,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id)
 
 
   // clear stale state after compile errors
-  rcache_unlock_all();
   rcache_invalidate();
   emith_invalidate_t();
   drcf = (struct drcf) { 0 };
@@ -3146,39 +3200,31 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id)
         emith_sync_t(sr);
         rcache_flush();
         emith_flush();
+      }
 
-        // make block entry
-        v = block->entry_count;
+      // make block entry
+      v = block->entry_count;
+      entry = &block->entryp[v];
+      if (v < ARRAY_SIZE(block->entryp))
+      {
         entry = &block->entryp[v];
-        if (v < ARRAY_SIZE(block->entryp))
-        {
-          entry = &block->entryp[v];
-          entry->pc = pc;
-          entry->tcache_ptr = tcache_ptr;
-          entry->links = entry->o_links = NULL;
+        entry->pc = pc;
+        entry->tcache_ptr = tcache_ptr;
+        entry->links = entry->o_links = NULL;
 #if (DRC_DEBUG & 2)
-          entry->block = block;
+        entry->block = block;
 #endif
-          add_to_hashlist(entry, tcache_id);
-          block->entry_count++;
+        block->entry_count++;
 
-          dbg(2, "-- %csh2 block #%d,%d entry %08x -> %p",
-            sh2->is_slave ? 's' : 'm', tcache_id, blkid_main,
-            pc, tcache_ptr);
-        }
-        else {
-          dbg(1, "too many entryp for block #%d,%d pc=%08x",
-            tcache_id, blkid_main, pc);
-          break;
-        }
-      } else {
-        entry = block->entryp;
+        dbg(2, "-- %csh2 block #%d,%d entry %08x -> %p",
+          sh2->is_slave ? 's' : 'm', tcache_id, blkid_main,
+          pc, tcache_ptr);
+      }
+      else {
+        dbg(1, "too many entryp for block #%d,%d pc=%08x",
+          tcache_id, blkid_main, pc);
+        break;
       }
-
-      // since we made a block entry, link any other blocks that jump to it
-      dr_link_blocks(entry, tcache_id);
-      if (!tcache_id) // can safely link from cpu-local to global memory
-        dr_link_blocks(entry, sh2->is_slave?2:1);
 
       v = find_in_sorted_array(branch_target_pc, branch_target_count, pc);
       if (v >= 0)
@@ -3220,29 +3266,35 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id)
 #endif
 
       // check cycles
-      tmp = rcache_get_tmp_arg(0);
       sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL);
       emith_cmp_r_imm(sr, 0);
+
 #if LOOP_OPTIMIZER
-      // on drc exit pinned registers must be saved
+      u8 *jp = NULL;
       if (op_flags[i] & OF_BASIC_LOOP) {
-        EMITH_JMP_START(DCOND_GT);
+        // if exiting a pinned loop pinned regs must be written back to ctx
+        // since they are reloaded in the loop entry code
+        jp = tcache_ptr;
+        emith_jump_cond_patchable(DCOND_GT, jp); // XXX need API for JMP_POS
         rcache_save_pinned();
-        emith_move_r_imm(tmp, pc);
-        emith_jump(sh2_drc_exit);
-        EMITH_JMP_END(DCOND_GT);
-      } else
+      }
 #endif
-      if (emith_jump_cond_inrange(sh2_drc_exit)) {
-        emith_move_r_imm_c(DCOND_LE, tmp, pc);
-        emith_jump_cond(DCOND_LE, sh2_drc_exit);
+      if (blx_target_count < ARRAY_SIZE(blx_target_pc)) {
+        // exit via stub in blx table (saves some 1-3 insns in the main flow)
+        blx_target_pc[blx_target_count] = pc|1;
+        blx_target_bl[blx_target_count] = NULL;
+        blx_target_ptr[blx_target_count++] = tcache_ptr;
       } else {
-        EMITH_JMP_START(DCOND_GT);
-        emith_move_r_imm(tmp, pc);
-        emith_jump(sh2_drc_exit);
-        EMITH_JMP_END(DCOND_GT);
+        // blx table full, must inline exit code
+        tmp = rcache_get_tmp_arg(0);
+        emith_move_r_imm_c(DCOND_LE, tmp, pc);
+        rcache_free_tmp(tmp);
       }
-      rcache_free_tmp(tmp);
+      emith_jump_cond_patchable(DCOND_LE, tcache_ptr);
+#if LOOP_OPTIMIZER
+      if (op_flags[i] & OF_BASIC_LOOP)
+        emith_jump_patch(jp, tcache_ptr, NULL);
+#endif
 
 #if (DRC_DEBUG & 32)
       // block hit counter
@@ -3880,7 +3932,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id)
         case 2: // SHAL Rn    0100nnnn00100000
           tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2);
           sr  = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL);
-          emith_sync_t(sr);
+          emith_invalidate_t();
           emith_tpop_carry(sr, 0); // dummy
           emith_lslf(tmp, tmp2, 1);
           emith_tpush_carry(sr, 0);
@@ -3909,7 +3961,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id)
         case 2: // SHAR Rn    0100nnnn00100001
           tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2);
           sr  = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL);
-          emith_sync_t(sr);
+          emith_invalidate_t();
           emith_tpop_carry(sr, 0); // dummy
           if (op & 0x20) {
             emith_asrf(tmp, tmp2, 1);
@@ -3967,7 +4019,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id)
         case 0x05: // ROTR   Rn          0100nnnn00000101
           tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2);
           sr  = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL);
-          emith_sync_t(sr);
+          emith_invalidate_t();
           emith_tpop_carry(sr, 0); // dummy
           if (op & 1) {
             emith_rorf(tmp, tmp2, 1);
@@ -4351,11 +4403,12 @@ end_op:
       int cond = -1;
       int ctaken = 0;
       void *target = NULL;
-      int patchable = 0;
+      struct block_link *bl = NULL;
 
       if (OP_ISBRACND(opd_b->op))
         ctaken = (op_flags[i] & OF_DELAY_OP) ? 1 : 2;
       cycles += ctaken; // assume branch taken
+
 #if LOOP_OPTIMIZER
       if ((drcf.loop_type == OF_IDLE_LOOP ||
           (drcf.loop_type == OF_DELAY_LOOP && drcf.delay_reg >= 0)))
@@ -4365,14 +4418,35 @@ end_op:
         emith_sh2_delay_loop(cycles, drcf.delay_reg);
         drcf.polling = drcf.loop_type = 0;
       }
+
+      if (target_pc < pc && pinned_loop_pc[pinned_loop_count] == target_pc) {
+        // backward jump at end of optimized loop
+        rcache_unpin_all();
+        target = pinned_loop_ptr[pinned_loop_count];
+        pinned_loop_count ++;
+      }
 #endif
 
       sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL);
       FLUSH_CYCLES(sr);
+      rcache_unlock_all();
       rcache_clean();
 
-      // emit condition test for conditional branch
+#if CALL_STACK
+      void *rtsadd = NULL, *rtsret = NULL;
+      if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) {
+        // BSR - save rts data
+        tmp = rcache_get_tmp_arg(1);
+        rtsadd = tcache_ptr;
+        emith_move_r_imm_s8_patchable(tmp, 0);
+        rcache_invalidate_tmp();
+        emith_call(sh2_drc_dispatcher_call);
+        rtsret = tcache_ptr;
+      }
+#endif
+
       if (OP_ISBRACND(opd_b->op)) {
+        // BT[S], BF[S] - emit condition test
         cond = (opd_b->op == OP_BRANCH_CF) ? DCOND_EQ : DCOND_NE;
         if (delay_dep_fw & BITMASK1(SHR_T)) {
           emith_sync_t(sr);
@@ -4396,61 +4470,118 @@ end_op:
       {
         // local branch
         if (branch_target_ptr[v]) {
-          // jumps back can be linked here since host PC is already known
+          // local backward jump, link here now since host PC is already known
           target = branch_target_ptr[v];
+          if (cond != -1)
+            emith_jump_cond(cond, target);
+          else {
+            emith_jump(target);
+            rcache_invalidate();
+          }
         } else if (branch_patch_count < MAX_LOCAL_BRANCHES) {
+          // local forward jump
           target = tcache_ptr;
           branch_patch_pc[branch_patch_count] = target_pc;
           branch_patch_ptr[branch_patch_count] = target;
           branch_patch_count++;
-          patchable = 1;
+          if (cond != -1)
+            emith_jump_cond_patchable(cond, target);
+          else {
+            emith_jump_patchable(target);
+            rcache_invalidate();
+          }
         } else
           dbg(1, "warning: too many local branches");
       }
 #endif
 
-      rcache_unlock_all();
-#if LOOP_OPTIMIZER
-      if (target && pinned_loop_pc[pinned_loop_count] == target_pc) {
-        rcache_unpin_all();
-        target = pinned_loop_ptr[pinned_loop_count];
-        pinned_loop_count ++;
-      }
-#endif
-
       if (target == NULL)
       {
         // can't resolve branch locally, make a block exit
-        rcache_clean();
-        tmp = rcache_get_tmp_arg(0);
-        emith_move_r_imm(tmp, target_pc);
-        rcache_free_tmp(tmp);
+        bl = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id);
+        if (cond != -1) {
+#if 1
+          if (bl) {
+            if (blx_target_count < ARRAY_SIZE(blx_target_pc)) {
+              // conditional jumps get a blx stub for the far jump
+              blx_target_pc[blx_target_count] = target_pc;
+              blx_target_bl[blx_target_count] = bl;
+              blx_target_ptr[blx_target_count++] = tcache_ptr;
+              bl->type = BL_JCCBLX;
+              target = tcache_ptr;
+            } else {
+              // blx table full, patch jump only
+              tmp = rcache_get_tmp_arg(0);
+              emith_move_r_imm(tmp, target_pc);
+              rcache_free_tmp(tmp);
+              bl->jump = tcache_ptr;
+              bl->type = BL_JMP;
+              target = sh2_drc_dispatcher;
+            }
+            emith_jump_cond_patchable(cond, target);
+          } else {
+            // cannot link, inline jump @dispatcher
+            EMITH_JMP_START(emith_invert_cond(cond));
+            tmp = rcache_get_tmp_arg(0);
+            emith_move_r_imm(tmp, target_pc);
+            rcache_free_tmp(tmp);
+            target = sh2_drc_dispatcher;
+
+            emith_jump(target);
+            EMITH_JMP_END(emith_invert_cond(cond));
+          }
+#elif 1
+          // jump @dispatcher - ARM 32bit version with conditional execution
+          EMITH_SJMP_START(emith_invert_cond(cond));
+          tmp = rcache_get_tmp_arg(0);
+          emith_move_r_imm_c(cond, tmp, target_pc);
+          rcache_free_tmp(tmp);
+          target = sh2_drc_dispatcher;
 
-#if CALL_STACK
-        if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) {
-          // BSR
-          emith_call(sh2_drc_dispatcher_call);
-        }
+          if (bl) {
+            bl->jump = tcache_ptr;
+            bl->type = BL_JMP;
+          }
+          emith_jump_cond_patchable(cond, target);
+          EMITH_SJMP_END(emith_invert_cond(cond));
+#else
+          // jump @dispatcher - generic version (jump !cond @over, jump @trgt)
+          EMITH_JMP_START(emith_invert_cond(cond));
+          if (bl) {
+            bl->jump = tcache_ptr;
+            bl->type = BL_LDJMP;
+          }
+          tmp = rcache_get_tmp_arg(0);
+          emith_move_r_imm(tmp, target_pc);
+          rcache_free_tmp(tmp);
+          target = sh2_drc_dispatcher;
+
+          emith_jump_patchable(target);
+          EMITH_JMP_END(emith_invert_cond(cond));
 #endif
+        } else {
+          // unconditional, has the far jump inlined
+          if (bl)
+            bl->type = BL_LDJMP;
 
-        target = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id);
-        patchable = 1;
-      }
+          tmp = rcache_get_tmp_arg(0);
+          emith_move_r_imm(tmp, target_pc);
+          rcache_free_tmp(tmp);
+          target = sh2_drc_dispatcher;
 
-      // create branch
-      if (cond != -1) {
-        if (patchable)
-          emith_jump_cond_patchable(cond, target);
-        else
-          emith_jump_cond(cond, target);
-      } else {
-        rcache_invalidate();
-        if (patchable)
           emith_jump_patchable(target);
-        else
-          emith_jump(target);
+          rcache_invalidate();
+        }
       }
 
+      emith_flush();
+      if (bl)
+        memcpy(bl->jdisp, bl->jump, emith_jump_at_size());
+#if CALL_STACK
+      if (rtsadd)
+        emith_move_r_imm_s8_patch(rtsadd, tcache_ptr - (u8 *)rtsret);
+#endif
+
       // branch not taken, correct cycle count
       if (ctaken)
         emith_add_r_imm(sr, ctaken << 12);
@@ -4463,35 +4594,57 @@ end_op:
         drcf.polling = drcf.loop_type = 0;
     }
     else if (drcf.pending_branch_indirect) {
-      void *target;
       u32 target_pc;
+      struct block_link *bl = NULL;
 
       sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL);
       FLUSH_CYCLES(sr);
       emith_sync_t(sr);
       rcache_clean();
+
       tmp = rcache_get_reg_arg(0, SHR_PC, NULL);
-      rcache_invalidate();
+
 #if CALL_STACK
       struct op_data *opd_b = (op_flags[i] & OF_DELAY_OP) ? opd-1 : opd;
+      void *rtsadd = NULL, *rtsret = NULL;
+
       if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) {
-        // JSR/BSRF
+        // JSR, BSRF - save rts data
+        tmp = rcache_get_tmp_arg(1);
+        rtsadd = tcache_ptr;
+        emith_move_r_imm_s8_patchable(tmp, 0);
+        rcache_invalidate_tmp();
         emith_call(sh2_drc_dispatcher_call);
+        rtsret = tcache_ptr;
       }
+#endif
 
+#if CALL_STACK
       if (opd_b->rm == SHR_PR) {
-        // RTS
+        // RTS - restore rts data, else jump to dispatcher
         emith_jump(sh2_drc_dispatcher_return);
       } else
 #endif
       if (gconst_get(SHR_PC, &target_pc)) {
-        // JMP const, treat like unconditional direct branch
-        target = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id);
-        emith_jump_patchable(target);
+        // JMP, JSR, BRAF, BSRF const - treat like unconditional direct branch
+        bl = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id);
+        if (bl) { // pc already loaded somewhere else, can patch jump only
+          bl->type = BL_JMP;
+          bl->jump = tcache_ptr;
+        }
+        emith_jump_patchable(sh2_drc_dispatcher);
       } else {
-        // JMP
+        // JMP, JSR, BRAF, BSRF not const
         emith_jump(sh2_drc_dispatcher);
       }
+      rcache_invalidate();
+
+      emith_flush();
+#if CALL_STACK
+      if (rtsadd)
+        emith_move_r_imm_s8_patch(rtsadd, tcache_ptr - (u8 *)rtsret);
+#endif
+
       drcf.pending_branch_indirect = 0;
       drcf.polling = drcf.loop_type = 0;
     }
@@ -4508,24 +4661,48 @@ end_op:
 
   if (! OP_ISBRAUC(opd->op))
   {
-    void *target;
+    struct block_link *bl;
 
     tmp = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL);
     FLUSH_CYCLES(tmp);
     emith_sync_t(tmp);
 
     rcache_clean();
+    bl = dr_prepare_ext_branch(block->entryp, pc, sh2->is_slave, tcache_id);
+    if (bl)
+      bl->type = BL_LDJMP;
     tmp = rcache_get_tmp_arg(0);
     emith_move_r_imm(tmp, pc);
-
-    target = dr_prepare_ext_branch(block->entryp, pc, sh2->is_slave, tcache_id);
-    if (target == NULL)
-      return NULL;
+    emith_jump_patchable(sh2_drc_dispatcher);
     rcache_invalidate();
-    emith_jump_patchable(target);
+    emith_flush();
+    if (bl)
+      memcpy(bl->jdisp, bl->jump, emith_jump_at_size());
   } else
     rcache_flush();
+
+  // emit blx area
+  for (i = 0; i < blx_target_count; i++) {
+    void *target = (blx_target_pc[i] & 1 ? sh2_drc_exit : sh2_drc_dispatcher);
+    struct block_link *bl = blx_target_bl[i];
+
+    emith_pool_check();
+    if (bl)
+      bl->blx = tcache_ptr;
+    emith_jump_patch(blx_target_ptr[i], tcache_ptr, NULL);
+    tmp = rcache_get_tmp_arg(0);
+    emith_move_r_imm(tmp, blx_target_pc[i] & ~1);
+    emith_jump(target);
+    rcache_invalidate();
+    emith_flush();
+    if (bl)
+      memcpy(bl->jdisp, bl->blx, emith_jump_at_size());
+  }
+
   emith_flush();
+  do_host_disasm(tcache_id);
+
+  emith_pool_commit(0);
 
   // link local branches
   for (i = 0; i < branch_patch_count; i++) {
@@ -4539,20 +4716,18 @@ end_op:
       target = tcache_ptr;
       tmp = rcache_get_tmp_arg(0);
       emith_move_r_imm(tmp, branch_patch_pc[i]);
-      rcache_flush();
       emith_jump(sh2_drc_dispatcher);
+      rcache_flush();
     }
     emith_jump_patch(branch_patch_ptr[i], target, NULL);
   }
 
-  emith_pool_commit(0);
-
-  dr_mark_memory(1, block, tcache_id, 0);
-
   tcache_ptrs[tcache_id] = tcache_ptr;
-
   host_instructions_updated(block_entry_ptr, tcache_ptr);
 
+  dr_activate_block(block, tcache_id, sh2->is_slave);
+  emith_update_cache();
+
   do_host_disasm(tcache_id);
 
   dbg(2, " block #%d,%d -> %p tcache %d/%d, insns %d -> %d %.3f",
@@ -4574,7 +4749,6 @@ end_op:
   fflush(stdout);
 #endif
 
-  emith_update_cache();
   return block_entry_ptr;
 }
 
@@ -4769,14 +4943,14 @@ static void sh2_generate_utils(void)
   // pc = sh2_drc_dispatcher_call(u32 pc)
   sh2_drc_dispatcher_call = (void *)tcache_ptr;
   emith_ctx_read(arg2, offsetof(SH2, rts_cache_idx));
-  emith_ctx_read(arg1, SHR_PR * 4);
   emith_add_r_imm(arg2, 2*sizeof(void *));
   emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *));
   emith_ctx_write(arg2, offsetof(SH2, rts_cache_idx));
   emith_add_r_r_r_lsl_ptr(arg2, CONTEXT_REG, arg2, 0);
-  emith_write_r_r_offs(arg1, arg2, offsetof(SH2, rts_cache));
-  emith_add_r_ret_imm(arg1, emith_jump_patchable_size()); // skip jump_patchable for rts host address
-  emith_write_r_r_offs_ptr(arg1, arg2, offsetof(SH2, rts_cache) + sizeof(void *));
+  emith_ctx_read(arg3, SHR_PR * 4);
+  emith_add_r_ret(arg1);
+  emith_write_r_r_offs_ptr(arg1, arg2, offsetof(SH2, rts_cache)+sizeof(void *));
+  emith_write_r_r_offs(arg3, arg2, offsetof(SH2, rts_cache));
   emith_ret();
   emith_flush();
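
Note: for context, sh2_drc_dispatcher_call pushes a (PR, host return address) pair into a small ring so sh2_drc_dispatcher_return can shortcut a matching RTS straight back into translated code. A rough C model of that ring (entry count and layout assumed; the real cache lives in the SH2 context struct):

    #include <stdint.h>

    struct rts_entry { uint32_t pr; void *host; };
    static struct rts_entry rts_cache[8];   /* size assumed */
    static unsigned rts_idx;

    static void rts_push(uint32_t pr, void *host_ret)
    {
        rts_idx = (rts_idx + 1) % 8;
        rts_cache[rts_idx].pr = pr;
        rts_cache[rts_idx].host = host_ret;
    }

    /* on RTS: if PR matches the top entry, return the host address to
       jump to; otherwise fall back to the generic dispatcher */
    static void *rts_pop(uint32_t pr)
    {
        struct rts_entry *e = &rts_cache[rts_idx];
        rts_idx = (rts_idx - 1) % 8;        /* unsigned wraps to 7 */
        return (e->pr == pr) ? e->host : NULL;
    }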
 
@@ -5378,10 +5552,8 @@ void sh2_drc_finish(SH2 *sh2)
   if (block_tables[0] == NULL)
     return;
 
-  sh2_drc_flush_all();
-
-  for (i = 0; i < TCACHE_BUFFERS; i++) {
 #if (DRC_DEBUG & 4)
+  for (i = 0; i < TCACHE_BUFFERS; i++) {
     printf("~~~ tcache %d\n", i);
 #if 0
     tcache_dsm_ptrs[i] = tcache_bases[i];
@@ -5394,8 +5566,12 @@ void sh2_drc_finish(SH2 *sh2)
     }
 #endif
     printf("max links: %d\n", block_link_pool_counts[i]);
+  }
 #endif
 
+  sh2_drc_flush_all();
+
+  for (i = 0; i < TCACHE_BUFFERS; i++) {
     if (block_tables[i] != NULL)
       free(block_tables[i]);
     block_tables[i] = NULL;