From 52055c13b253cce969a24fa2b95eb9c39ac7ea79 Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 17 Oct 2019 21:54:37 +0200 Subject: [PATCH] sh2 drc: reorganised block mgmt code, plus some small scale optimisations --- cpu/sh2/compiler.c | 691 ++++++++++++++++++------------------ cpu/sh2/compiler.h | 4 +- cpu/sh2/sh2.h | 2 +- pico/32x/memory.c | 56 +-- pico/32x/memory_arm.S | 15 +- pico/pico_int.h | 4 + platform/gp2x/PicoDrive.gpe | 2 + 7 files changed, 395 insertions(+), 379 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 86d4b85a..1acc7215 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -764,58 +764,16 @@ static void rm_from_block_lists(struct block_desc *block) block->list = NULL; } -static void rm_block_list(struct block_list **blist) +static void discard_block_list(struct block_list **blist) { - while (*blist != NULL) - rm_from_block_lists((*blist)->block); -} - -static void REGPARM(1) flush_tcache(int tcid) -{ - int i; -#if (DRC_DEBUG & 1) - int tc_used, bl_used; - - tc_used = tcache_sizes[tcid] - (tcache_limit[tcid] - tcache_ptrs[tcid]); - bl_used = BLOCK_MAX_COUNT(tcid) - (block_limit[tcid] - block_counts[tcid]); - elprintf(EL_STATUS, "tcache #%d flush! (%d/%d, bds %d/%d)", tcid, tc_used, - tcache_sizes[tcid], bl_used, BLOCK_MAX_COUNT(tcid)); -#endif - - block_counts[tcid] = 0; - block_limit[tcid] = BLOCK_MAX_COUNT(tcid) - 1; - block_link_pool_counts[tcid] = 0; - blink_free[tcid] = NULL; - memset(unresolved_links[tcid], 0, sizeof(*unresolved_links[0]) * HASH_TABLE_SIZE(tcid)); - memset(hash_tables[tcid], 0, sizeof(*hash_tables[0]) * HASH_TABLE_SIZE(tcid)); - tcache_ptrs[tcid] = tcache_bases[tcid]; - tcache_limit[tcid] = tcache_bases[tcid] + tcache_sizes[tcid]; - if (Pico32xMem->sdram != NULL) { - if (tcid == 0) { // ROM, RAM - memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram)); - memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); - memset(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)); - memset(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)); - memset(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)); - memset(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)); - sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0; - } else { - memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram)); - memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); - memset(Pico32xMem->drcblk_da[tcid - 1], 0, sizeof(Pico32xMem->drcblk_da[tcid - 1])); - memset(Pico32xMem->drclit_da[tcid - 1], 0, sizeof(Pico32xMem->drclit_da[tcid - 1])); - memset(sh2s[tcid - 1].branch_cache, -1, sizeof(sh2s[0].branch_cache)); - memset(sh2s[tcid - 1].rts_cache, -1, sizeof(sh2s[0].rts_cache)); - sh2s[tcid - 1].rts_cache_idx = 0; - } + struct block_list *next, *current = *blist; + while (current != NULL) { + next = current->next; + current->next = blist_free; + blist_free = current; + current = next; } -#if (DRC_DEBUG & 4) - tcache_dsm_ptrs[tcid] = tcache_bases[tcid]; -#endif - - for (i = 0; i < RAM_SIZE(tcid) / INVAL_PAGE_SIZE; i++) - rm_block_list(&inval_lookup[tcid][i]); - rm_block_list(&inactive_blocks[tcid]); + *blist = NULL; } static void add_to_hashlist(struct block_entry *be, int tcache_id) @@ -902,68 +860,127 @@ static void rm_from_hashlist_unresolved(struct block_link *bl, int tcache_id) bl->next->prev = bl->prev; } -static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nolit, int free); -static void dr_free_oldest_block(int tcache_id) +#if LINK_BRANCHES +static void dr_block_link(struct block_entry 
*be, struct block_link *bl, int emit_jump) { - struct block_desc *bd; + dbg(2, "- %slink from %p to pc %08x entry %p", emit_jump ? "":"early ", + bl->jump, bl->target_pc, be->tcache_ptr); - if (block_limit[tcache_id] >= BLOCK_MAX_COUNT(tcache_id)) { - // block desc wrap around - block_limit[tcache_id] = 0; + if (emit_jump) { + u8 *jump = bl->jump; + int jsz = emith_jump_patch_size(); + if (bl->type == BL_JMP) { // patch: jump @entry + // inlined: @jump far jump to target + emith_jump_patch(jump, be->tcache_ptr, &jump); + } else if (bl->type == BL_LDJMP) { // write: jump @entry + // inlined: @jump far jump to target + emith_jump_at(jump, be->tcache_ptr); + jsz = emith_jump_at_size(); + } else if (bl->type == BL_JCCBLX) { // patch: jump cond -> jump @entry + if (emith_jump_patch_inrange(bl->jump, be->tcache_ptr)) { + // inlined: @jump near jumpcc to target + emith_jump_patch(jump, be->tcache_ptr, &jump); + } else { // dispatcher cond immediate + // via blx: @jump near jumpcc to blx; @blx far jump + emith_jump_patch(jump, bl->blx, &jump); + emith_jump_at(bl->blx, be->tcache_ptr); + if ((((uintptr_t)bl->blx & 0xf) + emith_jump_at_size()-1) > 0xf) + host_instructions_updated(bl->blx, bl->blx + emith_jump_at_size()-1); + } + } else { + printf("unknown BL type %d\n", bl->type); + exit(1); + } + // only needs sync if patch is possibly crossing cacheline (assume 16 byte) + if ((((uintptr_t)jump & 0xf) + jsz-1) > 0xf) + host_instructions_updated(jump, jump + jsz-1); } - bd = &block_tables[tcache_id][block_limit[tcache_id]]; - if (bd->tcache_ptr && bd->tcache_ptr < tcache_ptrs[tcache_id]) { - // cache wrap around - tcache_ptrs[tcache_id] = bd->tcache_ptr; - } + // move bl to block_entry + bl->target = be; + bl->prev = NULL; + if (be->links) + be->links->prev = bl; + bl->next = be->links; + be->links = bl; +} - if (bd->addr && bd->entry_count) - sh2_smc_rm_block_entry(bd, tcache_id, 0, 1); +static void dr_block_unlink(struct block_link *bl, int emit_jump) +{ + dbg(2,"- unlink from %p to pc %08x", bl->jump, bl->target_pc); - block_limit[tcache_id]++; - if (block_limit[tcache_id] >= BLOCK_MAX_COUNT(tcache_id)) - block_limit[tcache_id] = 0; - bd = &block_tables[tcache_id][block_limit[tcache_id]]; - if (bd->tcache_ptr >= tcache_ptrs[tcache_id]) - tcache_limit[tcache_id] = bd->tcache_ptr; - else - tcache_limit[tcache_id] = tcache_bases[tcache_id] + tcache_sizes[tcache_id]; + if (bl->target) { + if (emit_jump) { + u8 *jump = bl->jump; + int jsz = emith_jump_patch_size(); + if (bl->type == BL_JMP) { // jump_patch @dispatcher + // inlined: @jump far jump to dispatcher + emith_jump_patch(jump, sh2_drc_dispatcher, &jump); + } else if (bl->type == BL_LDJMP) { // restore: load pc, jump @dispatcher + // inlined: @jump load target_pc, far jump to dispatcher + memcpy(jump, bl->jdisp, emith_jump_at_size()); + jsz = emith_jump_at_size(); + } else if (bl->type == BL_JCCBLX) { // jump cond @blx; @blx: load pc, jump + // via blx: @jump near jumpcc to blx; @blx load target_pc, far jump + emith_jump_patch(bl->jump, bl->blx, &jump); + memcpy(bl->blx, bl->jdisp, emith_jump_at_size()); + host_instructions_updated(bl->blx, bl->blx + emith_jump_at_size()-1); + } else { + printf("unknown BL type %d\n", bl->type); + exit(1); + } + // update cpu caches since the previous jump target doesn't exist anymore + host_instructions_updated(jump, jump + jsz-1); + } + + if (bl->prev) + bl->prev->next = bl->next; + else + bl->target->links = bl->next; + if (bl->next) + bl->next->prev = bl->prev; + bl->target = NULL; + } } +#endif -static u8 
*dr_prepare_cache(int tcache_id, int insn_count) +static struct block_link *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_slave, int tcache_id) { - u8 *limit = tcache_limit[tcache_id]; +#if LINK_BRANCHES + struct block_link *bl = block_link_pool[tcache_id]; + int cnt = block_link_pool_counts[tcache_id]; + int target_tcache_id; - // if no block desc available - if (block_counts[tcache_id] == block_limit[tcache_id]) - dr_free_oldest_block(tcache_id); + // get the target block entry + target_tcache_id = dr_get_tcache_id(pc, is_slave); + if (target_tcache_id && target_tcache_id != tcache_id) + return NULL; - // while not enough cache space left (limit - tcache_ptr < max space needed) - while (tcache_limit[tcache_id] - tcache_ptrs[tcache_id] < insn_count * 128) - dr_free_oldest_block(tcache_id); + // get a block link + if (blink_free[tcache_id] != NULL) { + bl = blink_free[tcache_id]; + blink_free[tcache_id] = bl->next; + } else if (cnt >= BLOCK_LINK_MAX_COUNT(tcache_id)) { + dbg(1, "bl overflow for tcache %d", tcache_id); + return NULL; + } else { + bl += cnt; + block_link_pool_counts[tcache_id] = cnt+1; + } - if (limit != tcache_limit[tcache_id]) { -#if BRANCH_CACHE - if (tcache_id) - memset32(sh2s[tcache_id-1].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); - else { - memset32(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); - memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4); - } -#endif -#if CALL_STACK - if (tcache_id) { - memset32(sh2s[tcache_id-1].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4); - sh2s[tcache_id-1].rts_cache_idx = 0; - } else { - memset32(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4); - memset32(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)/4); - sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0; - } + // prepare link and add to outgoing list of owner + bl->tcache_id = tcache_id; + bl->target_pc = pc; + bl->jump = tcache_ptr; + bl->blx = NULL; + bl->o_next = owner->o_links; + owner->o_links = bl; + + add_to_hashlist_unresolved(bl, tcache_id); + return bl; +#else + return NULL; #endif - } - return (u8 *)tcache_ptrs[tcache_id]; } static void dr_mark_memory(int mark, struct block_desc *block, int tcache_id, u32 nolit) @@ -1059,207 +1076,117 @@ static u32 dr_check_nolit(u32 start, u32 end, int tcache_id) return end; } -static struct block_desc *dr_find_inactive_block(int tcache_id, u16 crc, - u32 addr, int size, u32 addr_lit, int size_lit) +static void dr_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nolit, int free) { - struct block_list **head = &inactive_blocks[tcache_id]; - struct block_list *current; + struct block_link *bl; + u32 i; - for (current = *head; current != NULL; current = current->next) { - struct block_desc *block = current->block; - if (block->crc == crc && block->addr == addr && block->size == size && - block->addr_lit == addr_lit && block->size_lit == size_lit) - { - rm_from_block_lists(block); - return block; - } + free = free || nolit; // block is invalid if literals are overwritten + dbg(2," %sing block %08x-%08x,%08x-%08x, blkid %d,%d", free?"delet":"disabl", + bd->addr, bd->addr + bd->size, bd->addr_lit, bd->addr_lit + bd->size_lit, + tcache_id, bd - block_tables[tcache_id]); + if (bd->addr == 0 || bd->entry_count == 0) { + dbg(1, " killing dead block!? 
%08x", bd->addr); + return; } - return NULL; -} -static struct block_desc *dr_add_block(u32 addr, int size, - u32 addr_lit, int size_lit, u16 crc, int is_slave, int *blk_id) -{ - struct block_entry *be; - struct block_desc *bd; - int tcache_id; - int *bcount; +#if LINK_BRANCHES + // remove from hash table, make incoming links unresolved + if (bd->active) { + for (i = 0; i < bd->entry_count; i++) { + rm_from_hashlist(&bd->entryp[i], tcache_id); - // do a lookup to get tcache_id and override check - be = dr_get_entry(addr, is_slave, &tcache_id); - if (be != NULL) - dbg(1, "block override for %08x", addr); + while ((bl = bd->entryp[i].links) != NULL) { + dr_block_unlink(bl, 1); + add_to_hashlist_unresolved(bl, tcache_id); + } + } - bcount = &block_counts[tcache_id]; - if (*bcount == block_limit[tcache_id]) { - dbg(1, "bd overflow for tcache %d", tcache_id); - return NULL; + dr_mark_memory(-1, bd, tcache_id, nolit); + add_to_block_list(&inactive_blocks[tcache_id], bd); } - - bd = &block_tables[tcache_id][*bcount]; - bd->addr = addr; - bd->size = size; - bd->addr_lit = addr_lit; - bd->size_lit = size_lit; - bd->tcache_ptr = tcache_ptr; - bd->crc = crc; bd->active = 0; - bd->entry_count = 0; -#if (DRC_DEBUG & 2) - bd->refcount = 0; #endif - *blk_id = *bcount; - (*bcount)++; - if (*bcount >= BLOCK_MAX_COUNT(tcache_id)) - *bcount = 0; - - return bd; -} - -static void REGPARM(3) *dr_lookup_block(u32 pc, SH2 *sh2, int *tcache_id) -{ - struct block_entry *be = NULL; - void *block = NULL; - - be = dr_get_entry(pc, sh2->is_slave, tcache_id); - if (be != NULL) - block = be->tcache_ptr; - -#if (DRC_DEBUG & 2) - if (be != NULL) - be->block->refcount++; -#endif - return block; -} - -static void *dr_failure(void) -{ - lprintf("recompilation failed\n"); - exit(1); -} - + if (free) { #if LINK_BRANCHES -static void dr_block_link(struct block_entry *be, struct block_link *bl, int emit_jump) -{ - dbg(2, "- %slink from %p to pc %08x entry %p", emit_jump ? 
"":"early ", - bl->jump, bl->target_pc, be->tcache_ptr); - - if (emit_jump) { - u8 *jump = bl->jump; - int jsz = emith_jump_patch_size(); - if (bl->type == BL_JMP) { // patch: jump @entry - // inlined: @jump far jump to target - emith_jump_patch(jump, be->tcache_ptr, &jump); - } else if (bl->type == BL_LDJMP) { // write: jump @entry - // inlined: @jump far jump to target - emith_jump_at(jump, be->tcache_ptr); - jsz = emith_jump_at_size(); - } else if (bl->type == BL_JCCBLX) { // patch: jump cond -> jump @entry - if (emith_jump_patch_inrange(bl->jump, be->tcache_ptr)) { - // inlined: @jump near jumpcc to target - emith_jump_patch(jump, be->tcache_ptr, &jump); - } else { // dispatcher cond immediate - // via blx: @jump near jumpcc to blx; @blx far jump - emith_jump_patch(jump, bl->blx, &jump); - emith_jump_at(bl->blx, be->tcache_ptr); - if ((((uintptr_t)bl->blx & 0xf) + emith_jump_at_size()-1) > 0xf) - host_instructions_updated(bl->blx, bl->blx + emith_jump_at_size()-1); - } - } else { - printf("unknown BL type %d\n", bl->type); - exit(1); + // revoke outgoing links + for (bl = bd->entryp[0].o_links; bl != NULL; bl = bl->o_next) { + if (bl->target) + dr_block_unlink(bl, 0); + else + rm_from_hashlist_unresolved(bl, tcache_id); + bl->jump = NULL; + bl->next = blink_free[bl->tcache_id]; + blink_free[bl->tcache_id] = bl; } - // only needs sync if patch is possibly crossing cacheline (assume 16 byte) - if ((((uintptr_t)jump & 0xf) + jsz-1) > 0xf) - host_instructions_updated(jump, jump + jsz-1); + bd->entryp[0].o_links = NULL; +#endif + // invalidate block + rm_from_block_lists(bd); + bd->addr = bd->size = bd->addr_lit = bd->size_lit = 0; + bd->entry_count = 0; } - - // move bl to block_entry - bl->target = be; - bl->prev = NULL; - if (be->links) - be->links->prev = bl; - bl->next = be->links; - be->links = bl; + emith_update_cache(); } -static void dr_block_unlink(struct block_link *bl, int emit_jump) +static struct block_desc *dr_find_inactive_block(int tcache_id, u16 crc, + u32 addr, int size, u32 addr_lit, int size_lit) { - dbg(2,"- unlink from %p to pc %08x", bl->jump, bl->target_pc); + struct block_list **head = &inactive_blocks[tcache_id]; + struct block_list *current; - if (bl->target) { - if (emit_jump) { - u8 *jump = bl->jump; - int jsz = emith_jump_patch_size(); - if (bl->type == BL_JMP) { // jump_patch @dispatcher - // inlined: @jump far jump to dispatcher - emith_jump_patch(jump, sh2_drc_dispatcher, &jump); - } else if (bl->type == BL_LDJMP) { // restore: load pc, jump @dispatcher - // inlined: @jump load target_pc, far jump to dispatcher - memcpy(jump, bl->jdisp, emith_jump_at_size()); - jsz = emith_jump_at_size(); - } else if (bl->type == BL_JCCBLX) { // jump cond @blx; @blx: load pc, jump - // via blx: @jump near jumpcc to blx; @blx load target_pc, far jump - emith_jump_patch(bl->jump, bl->blx, &jump); - memcpy(bl->blx, bl->jdisp, emith_jump_at_size()); - host_instructions_updated(bl->blx, bl->blx + emith_jump_at_size()-1); - } else { - printf("unknown BL type %d\n", bl->type); - exit(1); - } - // update cpu caches since the previous jump target doesn't exist anymore - host_instructions_updated(jump, jump + jsz-1); + for (current = *head; current != NULL; current = current->next) { + struct block_desc *block = current->block; + if (block->crc == crc && block->addr == addr && block->size == size && + block->addr_lit == addr_lit && block->size_lit == size_lit) + { + rm_from_block_lists(block); + return block; } - - if (bl->prev) - bl->prev->next = bl->next; - else - bl->target->links 
= bl->next; - if (bl->next) - bl->next->prev = bl->prev; - bl->target = NULL; } + return NULL; } -#endif -static struct block_link *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_slave, int tcache_id) +static struct block_desc *dr_add_block(u32 addr, int size, + u32 addr_lit, int size_lit, u16 crc, int is_slave, int *blk_id) { -#if LINK_BRANCHES - struct block_link *bl = block_link_pool[tcache_id]; - int cnt = block_link_pool_counts[tcache_id]; - int target_tcache_id; + struct block_entry *be; + struct block_desc *bd; + int tcache_id; + int *bcount; - // get the target block entry - target_tcache_id = dr_get_tcache_id(pc, is_slave); - if (target_tcache_id && target_tcache_id != tcache_id) - return NULL; + // do a lookup to get tcache_id and override check + be = dr_get_entry(addr, is_slave, &tcache_id); + if (be != NULL) + dbg(1, "block override for %08x", addr); - // get a block link - if (blink_free[tcache_id] != NULL) { - bl = blink_free[tcache_id]; - blink_free[tcache_id] = bl->next; - } else if (cnt >= BLOCK_LINK_MAX_COUNT(tcache_id)) { - dbg(1, "bl overflow for tcache %d", tcache_id); + bcount = &block_counts[tcache_id]; + if (*bcount == block_limit[tcache_id]) { + dbg(1, "bd overflow for tcache %d", tcache_id); return NULL; - } else { - bl += cnt; - block_link_pool_counts[tcache_id] = cnt+1; } - // prepare link and add to outgoing list of owner - bl->tcache_id = tcache_id; - bl->target_pc = pc; - bl->jump = tcache_ptr; - bl->blx = NULL; - bl->o_next = owner->o_links; - owner->o_links = bl; - - add_to_hashlist_unresolved(bl, tcache_id); - return bl; -#else - return NULL; + bd = &block_tables[tcache_id][*bcount]; + bd->addr = addr; + bd->size = size; + bd->addr_lit = addr_lit; + bd->size_lit = size_lit; + bd->tcache_ptr = tcache_ptr; + bd->crc = crc; + bd->active = 0; + bd->list = NULL; + bd->entry_count = 0; +#if (DRC_DEBUG & 2) + bd->refcount = 0; #endif + + *blk_id = *bcount; + (*bcount)++; + if (*bcount >= BLOCK_MAX_COUNT(tcache_id)) + *bcount = 0; + + return bd; } static void dr_link_blocks(struct block_entry *be, int tcache_id) @@ -1321,6 +1248,139 @@ static void dr_activate_block(struct block_desc *bd, int tcache_id, int is_slave bd->active = 1; } +static void REGPARM(3) ALIGNED(32) *dr_lookup_block(u32 pc, SH2 *sh2, int *tcache_id) +{ + struct block_entry *be = NULL; + void *block = NULL; + + be = dr_get_entry(pc, sh2->is_slave, tcache_id); + if (be != NULL) + block = be->tcache_ptr; + +#if (DRC_DEBUG & 2) + if (be != NULL) + be->block->refcount++; +#endif + return block; +} + +static void dr_free_oldest_block(int tcache_id) +{ + struct block_desc *bd; + + if (block_limit[tcache_id] >= BLOCK_MAX_COUNT(tcache_id)) { + // block desc wrap around + block_limit[tcache_id] = 0; + } + bd = &block_tables[tcache_id][block_limit[tcache_id]]; + + if (bd->tcache_ptr && bd->tcache_ptr < tcache_ptrs[tcache_id]) { + // cache wrap around + tcache_ptrs[tcache_id] = bd->tcache_ptr; + } + + if (bd->addr && bd->entry_count) + dr_rm_block_entry(bd, tcache_id, 0, 1); + + block_limit[tcache_id]++; + if (block_limit[tcache_id] >= BLOCK_MAX_COUNT(tcache_id)) + block_limit[tcache_id] = 0; + bd = &block_tables[tcache_id][block_limit[tcache_id]]; + if (bd->tcache_ptr >= tcache_ptrs[tcache_id]) + tcache_limit[tcache_id] = bd->tcache_ptr; + else + tcache_limit[tcache_id] = tcache_bases[tcache_id] + tcache_sizes[tcache_id]; +} + +static u8 *dr_prepare_cache(int tcache_id, int insn_count) +{ + u8 *limit = tcache_limit[tcache_id]; + + // if no block desc available + if 
(block_counts[tcache_id] == block_limit[tcache_id]) + dr_free_oldest_block(tcache_id); + + // while not enough cache space left (limit - tcache_ptr < max space needed) + while (tcache_limit[tcache_id] - tcache_ptrs[tcache_id] < insn_count * 128) + dr_free_oldest_block(tcache_id); + + if (limit != tcache_limit[tcache_id]) { +#if BRANCH_CACHE + if (tcache_id) + memset32(sh2s[tcache_id-1].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + else { + memset32(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4); + } +#endif +#if CALL_STACK + if (tcache_id) { + memset32(sh2s[tcache_id-1].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4); + sh2s[tcache_id-1].rts_cache_idx = 0; + } else { + memset32(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4); + memset32(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)/4); + sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0; + } +#endif + } + return (u8 *)tcache_ptrs[tcache_id]; +} + +static void dr_flush_tcache(int tcid) +{ + int i; +#if (DRC_DEBUG & 1) + int tc_used, bl_used; + + tc_used = tcache_sizes[tcid] - (tcache_limit[tcid] - tcache_ptrs[tcid]); + bl_used = BLOCK_MAX_COUNT(tcid) - (block_limit[tcid] - block_counts[tcid]); + elprintf(EL_STATUS, "tcache #%d flush! (%d/%d, bds %d/%d)", tcid, tc_used, + tcache_sizes[tcid], bl_used, BLOCK_MAX_COUNT(tcid)); +#endif + + block_counts[tcid] = 0; + block_limit[tcid] = BLOCK_MAX_COUNT(tcid) - 1; + block_link_pool_counts[tcid] = 0; + blink_free[tcid] = NULL; + memset(unresolved_links[tcid], 0, sizeof(*unresolved_links[0]) * HASH_TABLE_SIZE(tcid)); + memset(hash_tables[tcid], 0, sizeof(*hash_tables[0]) * HASH_TABLE_SIZE(tcid)); + tcache_ptrs[tcid] = tcache_bases[tcid]; + tcache_limit[tcid] = tcache_bases[tcid] + tcache_sizes[tcid]; + if (Pico32xMem->sdram != NULL) { + if (tcid == 0) { // ROM, RAM + memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram)); + memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); + memset(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)); + memset(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)); + memset(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)); + memset(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)); + sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0; + } else { + memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram)); + memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); + memset(Pico32xMem->drcblk_da[tcid - 1], 0, sizeof(Pico32xMem->drcblk_da[tcid - 1])); + memset(Pico32xMem->drclit_da[tcid - 1], 0, sizeof(Pico32xMem->drclit_da[tcid - 1])); + memset(sh2s[tcid - 1].branch_cache, -1, sizeof(sh2s[0].branch_cache)); + memset(sh2s[tcid - 1].rts_cache, -1, sizeof(sh2s[0].rts_cache)); + sh2s[tcid - 1].rts_cache_idx = 0; + } + } +#if (DRC_DEBUG & 4) + tcache_dsm_ptrs[tcid] = tcache_bases[tcid]; +#endif + + for (i = 0; i < RAM_SIZE(tcid) / INVAL_PAGE_SIZE; i++) + discard_block_list(&inval_lookup[tcid][i]); + discard_block_list(&inactive_blocks[tcid]); +} + +static void *dr_failure(void) +{ + lprintf("recompilation failed\n"); + exit(1); +} + #define ADD_TO_ARRAY(array, count, item, failcode) { \ if (count >= ARRAY_SIZE(array)) { \ dbg(1, "warning: " #array " overflow"); \ @@ -5066,61 +5126,7 @@ static void sh2_generate_utils(void) #endif } -static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nolit, int free) -{ - struct block_link *bl; - u32 i; - - free = free || nolit; // block is invalid if literals are overwritten - 
dbg(2," %sing block %08x-%08x,%08x-%08x, blkid %d,%d", free?"delet":"disabl", - bd->addr, bd->addr + bd->size, bd->addr_lit, bd->addr_lit + bd->size_lit, - tcache_id, bd - block_tables[tcache_id]); - if (bd->addr == 0 || bd->entry_count == 0) { - dbg(1, " killing dead block!? %08x", bd->addr); - return; - } - -#if LINK_BRANCHES - // remove from hash table, make incoming links unresolved - if (bd->active) { - for (i = 0; i < bd->entry_count; i++) { - rm_from_hashlist(&bd->entryp[i], tcache_id); - - while ((bl = bd->entryp[i].links) != NULL) { - dr_block_unlink(bl, 1); - add_to_hashlist_unresolved(bl, tcache_id); - } - } - - dr_mark_memory(-1, bd, tcache_id, nolit); - add_to_block_list(&inactive_blocks[tcache_id], bd); - } - bd->active = 0; -#endif - - if (free) { -#if LINK_BRANCHES - // revoke outgoing links - for (bl = bd->entryp[0].o_links; bl != NULL; bl = bl->o_next) { - if (bl->target) - dr_block_unlink(bl, 0); - else - rm_from_hashlist_unresolved(bl, tcache_id); - bl->jump = NULL; - bl->next = blink_free[bl->tcache_id]; - blink_free[bl->tcache_id] = bl; - } - bd->entryp[0].o_links = NULL; -#endif - // invalidate block - rm_from_block_lists(bd); - bd->addr = bd->size = bd->addr_lit = bd->size_lit = 0; - bd->entry_count = 0; - } - emith_update_cache(); -} - -static void sh2_smc_rm_blocks(u32 a, int tcache_id, u32 shift) +static void sh2_smc_rm_blocks(u32 a, int len, int tcache_id, u32 shift) { struct block_list **blist, *entry, *next; u32 mask = RAM_SIZE(tcache_id) - 1; @@ -5146,12 +5152,12 @@ static void sh2_smc_rm_blocks(u32 a, int tcache_id, u32 shift) start_lit = block->addr_lit & wtmask; end_lit = start_lit + block->size_lit; // disable/delete block if it covers the modified address - if ((start_addr <= a && a < end_addr) || - (start_lit <= a && a < end_lit)) + if ((start_addr <= a+len && a < end_addr) || + (start_lit <= a+len && a < end_lit)) { dbg(2, "smc remove @%08x", a); - end_addr = (start_lit <= a && block->size_lit ? a : 0); - sh2_smc_rm_block_entry(block, tcache_id, end_addr, 0); + end_addr = (start_lit <= a+len && block->size_lit ? a : 0); + dr_rm_block_entry(block, tcache_id, end_addr, 0); #if (DRC_DEBUG & 2) removed = 1; #endif @@ -5182,17 +5188,20 @@ static void sh2_smc_rm_blocks(u32 a, int tcache_id, u32 shift) #endif } -void sh2_drc_wcheck_ram(unsigned int a, int val, SH2 *sh2) +void sh2_drc_wcheck_ram(unsigned int a, unsigned t, SH2 *sh2) { - dbg(2, "%csh2 smc check @%08x v=%d", sh2->is_slave ? 's' : 'm', a, val); - sh2_smc_rm_blocks(a, 0, SH2_DRCBLK_RAM_SHIFT); + int off = ((u16) t ? 0 : 2); + int len = ((u16) t ? 2 : 0) + (t >> 16 ? 2 : 0); + + sh2_smc_rm_blocks(a + off, len, 0, SH2_DRCBLK_RAM_SHIFT); } -void sh2_drc_wcheck_da(unsigned int a, int val, SH2 *sh2) +void sh2_drc_wcheck_da(unsigned int a, unsigned t, SH2 *sh2) { - int cpuid = sh2->is_slave; - dbg(2, "%csh2 smc check @%08x v=%d", cpuid ? 's' : 'm', a, val); - sh2_smc_rm_blocks(a, 1 + cpuid, SH2_DRCBLK_DA_SHIFT); + int off = ((u16) t ? 0 : 2); + int len = ((u16) t ? 2 : 0) + (t >> 16 ? 
2 : 0); + + sh2_smc_rm_blocks(a + off, len, 1 + sh2->is_slave, SH2_DRCBLK_DA_SHIFT); } int sh2_execute_drc(SH2 *sh2c, int cycles) @@ -5408,9 +5417,9 @@ void sh2_drc_flush_all(void) block_stats(); entry_stats(); bcache_stats(); - flush_tcache(0); - flush_tcache(1); - flush_tcache(2); + dr_flush_tcache(0); + dr_flush_tcache(1); + dr_flush_tcache(2); Pico32x.emu_flags &= ~P32XF_DRC_ROM_C; } diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 3565940d..94dff8c5 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -1,7 +1,7 @@ int sh2_drc_init(SH2 *sh2); void sh2_drc_finish(SH2 *sh2); -void sh2_drc_wcheck_ram(unsigned int a, int val, SH2 *sh2); -void sh2_drc_wcheck_da(unsigned int a, int val, SH2 *sh2); +void sh2_drc_wcheck_ram(unsigned int a, unsigned val, SH2 *sh2); +void sh2_drc_wcheck_da(unsigned int a, unsigned val, SH2 *sh2); #ifdef DRC_SH2 void sh2_drc_mem_setup(SH2 *sh2); diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index cf830dfc..57693ac1 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -80,7 +80,7 @@ typedef struct SH2_ unsigned char data_array[0x1000]; // cache (can be used as RAM) unsigned int peri_regs[0x200/4]; // periphereal regs -} SH2; +} SH2 ALIGNED(32); #define CYCLE_MULT_SHIFT 10 #define C_M68K_TO_SH2(xsh2, c) \ diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 06215a7c..39504416 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -231,7 +231,7 @@ static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2) for (idx = nrd = wr; idx != rd; ) { idx = (idx-1) % PFIFO_SZ; q = &fifo[idx]; - if (q->cpu != cpu && q->a == a) { q->a = -1; } + if (q->a == a && q->cpu != cpu) { q->a = -1; } if (q->a != -1) { nrd = idx; } } rd = nrd; @@ -825,7 +825,8 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) unsigned int cycles = sh2_cycles_done_m68k(sh2); Pico32x.sh2_regs[4 / 2] = d; p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); - sh2_end_run(sh2, 4); + if (p32x_sh2_ready(sh2->other_sh2, cycles+16)) + sh2_end_run(sh2, 4); sh2_poll_write(a & ~1, d, cycles, sh2); } return; @@ -851,7 +852,8 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) REG8IN16(r, a) = d; p32x_m68k_poll_event(P32XF_68KCPOLL); p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); - sh2_end_run(sh2, 1); + if (p32x_sh2_ready(sh2->other_sh2, cycles+16)) + sh2_end_run(sh2, 1); sh2_poll_write(a & ~1, r[a / 2], cycles, sh2); } return; @@ -943,7 +945,8 @@ static void p32x_sh2reg_write16(u32 a, u32 d, SH2 *sh2) Pico32x.regs[a / 2] = d; p32x_m68k_poll_event(P32XF_68KCPOLL); p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); - sh2_end_run(sh2, 1); + if (p32x_sh2_ready(sh2->other_sh2, cycles+16)) + sh2_end_run(sh2, 1); sh2_poll_write(a, d, cycles, sh2); } return; @@ -1569,7 +1572,7 @@ static u32 REGPARM(2) sh2_read32_rom(u32 a, SH2 *sh2) // writes #ifdef DRC_SH2 -static void NOINLINE sh2_sdram_poll(u32 a, u32 d, SH2 *sh2) +static void sh2_sdram_poll(u32 a, u32 d, SH2 *sh2) { unsigned cycles; @@ -1577,34 +1580,35 @@ static void NOINLINE sh2_sdram_poll(u32 a, u32 d, SH2 *sh2) cycles = sh2_cycles_done_m68k(sh2); sh2_poll_write(a, d, cycles, sh2); p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_RPOLL, cycles); - sh2_end_run(sh2, 1); + if (p32x_sh2_ready(sh2->other_sh2, cycles+16)) + sh2_end_run(sh2, 1); DRC_RESTORE_SR(sh2); } -void NOINLINE sh2_sdram_checks(u32 a, u32 d, SH2 *sh2, int t) +void sh2_sdram_checks(u32 a, u32 d, SH2 *sh2, u32 t) { - if (t & 0x80) - sh2_sdram_poll(a, d, sh2); - if (t & 0x7f) - sh2_drc_wcheck_ram(a, t & 0x7f, sh2); + if 
(t & 0x80) sh2_sdram_poll(a, d, sh2); + if (t & 0x7f) sh2_drc_wcheck_ram(a, t & 0x7f, sh2); } -void NOINLINE sh2_sdram_checks_l(u32 a, u32 d, SH2 *sh2, int t) +void sh2_sdram_checks_l(u32 a, u32 d, SH2 *sh2, u32 t) { - sh2_sdram_checks(a, d>>16, sh2, t); - sh2_sdram_checks(a+2, d, sh2, t>>16); + u32 m = 0x80 | 0x800000; + + if (t & 0x000080) sh2_sdram_poll(a, d>>16, sh2); + if (t & 0x800000) sh2_sdram_poll(a+2, d, sh2); + if (t & ~m) sh2_drc_wcheck_ram(a, t & ~m, sh2); } #ifndef _ASM_32X_MEMORY_C -static void sh2_da_checks(u32 a, int t, SH2 *sh2) +static void sh2_da_checks(u32 a, u32 t, SH2 *sh2) { sh2_drc_wcheck_da(a, t, sh2); } -static void NOINLINE sh2_da_checks_l(u32 a, int t, SH2 *sh2) +static void sh2_da_checks_l(u32 a, u32 t, SH2 *sh2) { - sh2_da_checks(a, t, sh2); - sh2_da_checks(a+2, t>>16, sh2); + sh2_drc_wcheck_da(a, t, sh2); } #endif #endif @@ -1667,7 +1671,7 @@ static void REGPARM(3) sh2_write8_sdram(u32 a, u32 d, SH2 *sh2) ((u8 *)sh2->p_sdram)[a1] = d; #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_ram; - int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; + u32 t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; if (t) sh2_sdram_checks(a & ~1, ((u16 *)sh2->p_sdram)[a1 / 2], sh2, t); #endif @@ -1679,7 +1683,7 @@ static void REGPARM(3) sh2_write8_da(u32 a, u32 d, SH2 *sh2) sh2->data_array[a1] = d; #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_da; - int t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; + u32 t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; if (t) sh2_da_checks(a, t, sh2); #endif @@ -1741,7 +1745,7 @@ static void REGPARM(3) sh2_write16_sdram(u32 a, u32 d, SH2 *sh2) ((u16 *)sh2->p_sdram)[a1 / 2] = d; #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_ram; - int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; + u32 t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; if (t) sh2_sdram_checks(a, d, sh2, t); #endif @@ -1753,7 +1757,7 @@ static void REGPARM(3) sh2_write16_da(u32 a, u32 d, SH2 *sh2) ((u16 *)sh2->data_array)[a1 / 2] = d; #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_da; - int t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; + u32 t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; if (t) sh2_da_checks(a, t, sh2); #endif @@ -1816,8 +1820,8 @@ static void REGPARM(3) sh2_write32_sdram(u32 a, u32 d, SH2 *sh2) *(u32 *)(sh2->p_sdram + a1) = (d << 16) | (d >> 16); #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_ram; - int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; - int u = p[(a1+2) >> SH2_DRCBLK_RAM_SHIFT]; + u32 t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; + u32 u = p[(a1+2) >> SH2_DRCBLK_RAM_SHIFT]; if (t|(u<<16)) sh2_sdram_checks_l(a, d, sh2, t|(u<<16)); #endif @@ -1829,8 +1833,8 @@ static void REGPARM(3) sh2_write32_da(u32 a, u32 d, SH2 *sh2) *((u32 *)sh2->data_array + a1/4) = (d << 16) | (d >> 16); #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_da; - int t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; - int u = p[(a1+2) >> SH2_DRCBLK_DA_SHIFT]; + u32 t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; + u32 u = p[(a1+2) >> SH2_DRCBLK_DA_SHIFT]; if (t|(u<<16)) sh2_da_checks_l(a, t|(u<<16), sh2); #endif diff --git a/pico/32x/memory_arm.S b/pico/32x/memory_arm.S index ba83a6bf..b3a94b62 100644 --- a/pico/32x/memory_arm.S +++ b/pico/32x/memory_arm.S @@ -17,6 +17,7 @@ .equ SH2_DRAM_OW, 1<<(32-SH2_DRAM_SHIFT) @ DRAM overwrite mode bit .text +.align 5 #if 0 @ u32 a, SH2 *sh2 @@ -142,11 +143,12 @@ sh2_write8_sdram: ldrb r3, [ip, r3, lsr #SH2_RAM_SHIFT+1] cmp r3, #0 bxeq lr + @ need to load aligned 16 bit data for check ldr ip, [r2, #OFS_SH2_p_sdram] bic r0, r0, #1 - mov r3, r0, lsl #SH2_RAM_SHIFT - mov r3, r3, lsr #SH2_RAM_SHIFT - ldrh r1, [ip, r3] + mov r1, r0, lsl #SH2_RAM_SHIFT + mov r1, r1, lsr #SH2_RAM_SHIFT + ldrh r1, [ip, r1] b sh2_sdram_checks #else bx lr @@ -252,13 +254,8 @@ 
sh2_write32_da: ldr ip, [r2, #OFS_SH2_p_drcblk_da] ldrb r1, [ip, r3, lsr #SH2_DA_SHIFT+1]! ldrb ip, [ip, #1] - orrs r3, r1, ip, lsl #16 + orrs r1, r1, ip, lsl #16 bxeq lr - stmfd sp!, {r0, r2, ip, lr} - bl sh2_drc_wcheck_da - ldmfd sp!, {r0, r2, ip, lr} - add r0, r0, #2 - mov r1, ip b sh2_drc_wcheck_da #else bx lr diff --git a/pico/pico_int.h b/pico/pico_int.h index 89acc4fb..0fc458ef 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -921,6 +921,10 @@ void p32x_event_schedule(unsigned int now, enum p32x_event event, int after); void p32x_event_schedule_sh2(SH2 *sh2, enum p32x_event event, int after); void p32x_schedule_hint(SH2 *sh2, unsigned int m68k_cycles); +#define p32x_sh2_ready(sh2, cycles) \ + (CYCLES_GT(cycles,sh2->m68krcycles_done) && \ + !(sh2->state&(SH2_STATE_CPOLL|SH2_STATE_VPOLL|SH2_STATE_RPOLL))) + // 32x/memory.c extern struct Pico32xMem *Pico32xMem; unsigned int PicoRead8_32x(unsigned int a); diff --git a/platform/gp2x/PicoDrive.gpe b/platform/gp2x/PicoDrive.gpe index 1c065185..59416d93 100644 --- a/platform/gp2x/PicoDrive.gpe +++ b/platform/gp2x/PicoDrive.gpe @@ -7,6 +7,8 @@ if ! [ -e /dev/accel ]; then export POLLUX_RAM_TIMINGS='ram_timings=2,9,4,1,1,1,1' export POLLUX_LCD_TIMINGS_NTSC='lcd_timings=397,1,37,277,341,0,17,337;clkdiv0=9' export POLLUX_LCD_TIMINGS_PAL='lcd_timings=428,1,37,277,341,0,17,337;clkdiv0=10' +else + export POLLUX_RAM_TIMINGS='ram_timings=3,9,4,1,1,1,1' fi ./PicoDrive "$@" -- 2.39.5
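
For illustration only (this block is not part of the patch): a minimal standalone sketch of how the reworked sh2_drc_wcheck_ram()/sh2_drc_wcheck_da() interface interprets the tag word built by the 32-bit SDRAM/data-array write handlers. The low 16 bits of `t` flag the halfword at `a`, the high 16 bits flag the halfword at `a+2`, and the (offset, length) expressions copied from the patch narrow the range handed to sh2_smc_rm_blocks(). The helper name `wcheck_window()` and the sample addresses are hypothetical.

```c
/* Illustrative sketch, not part of the patch. Shows how the per-halfword
 * drcblk tag from sh2_write32_sdram()/sh2_write32_da() maps to the
 * (address, length) window that sh2_smc_rm_blocks() checks blocks against. */
#include <stdio.h>
#include <stdint.h>

typedef uint16_t u16;
typedef uint32_t u32;

/* t: low 16 bits nonzero if the halfword at a hit DRC-tracked memory,
 *    high 16 bits nonzero if the halfword at a+2 did. */
static void wcheck_window(u32 a, u32 t, u32 *start, u32 *len)
{
  int off = ((u16)t ? 0 : 2);                   /* skip first halfword if untouched */
  *len = ((u16)t ? 2 : 0) + (t >> 16 ? 2 : 0);  /* 2 bytes per tagged halfword */
  *start = a + off;
}

int main(void)
{
  u32 cases[][2] = {
    { 0x06000100, 0x00000001 },  /* only low halfword tagged  -> a   .. a+2 */
    { 0x06000100, 0x00010000 },  /* only high halfword tagged -> a+2 .. a+4 */
    { 0x06000100, 0x00010001 },  /* both halfwords tagged     -> a   .. a+4 */
  };
  unsigned i;

  for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
    u32 start, len;
    wcheck_window(cases[i][0], cases[i][1], &start, &len);
    printf("a=%08x t=%08x -> check blocks against %08x..%08x\n",
           cases[i][0], cases[i][1], start, start + len);
  }
  return 0;
}
```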