frontend: update libpicofe, fix missed callbacks
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / emu_if.c
index cc0bcdf..cefadd2 100644 (file)
 #include "pcsxmem.h"
 #include "../psxhle.h"
 #include "../psxinterpreter.h"
 #include "pcsxmem.h"
 #include "../psxhle.h"
 #include "../psxinterpreter.h"
+#include "../psxcounters.h"
+#include "../psxevents.h"
+#include "../psxbios.h"
 #include "../r3000a.h"
 #include "../r3000a.h"
-#include "../cdrom.h"
-#include "../psxdma.h"
-#include "../mdec.h"
 #include "../gte_arm.h"
 #include "../gte_neon.h"
 #include "../gte_arm.h"
 #include "../gte_neon.h"
+#include "compiler_features.h"
+#include "arm_features.h"
 #define FLAGLESS
 #include "../gte.h"
 #define FLAGLESS
 #include "../gte.h"
+#if defined(NDRC_THREAD) && !defined(DRC_DISABLE) && !defined(LIGHTREC)
+#include "../../frontend/libretro-rthreads.h"
+#include "features/features_cpu.h"
+#include "retro_timers.h"
+#endif
+#ifdef _3DS
+#include <3ds_utils.h>
+#endif
 
 
+#ifndef ARRAY_SIZE
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
+#endif
 
 //#define evprintf printf
 #define evprintf(...)
 
 
 //#define evprintf printf
 #define evprintf(...)
 
-char invalid_code[0x100000];
-u32 event_cycles[PSXINT_COUNT];
-
-static void schedule_timeslice(void)
-{
-       u32 i, c = psxRegs.cycle;
-       u32 irqs = psxRegs.interrupt;
-       s32 min, dif;
-
-       min = PSXCLK;
-       for (i = 0; irqs != 0; i++, irqs >>= 1) {
-               if (!(irqs & 1))
-                       continue;
-               dif = event_cycles[i] - c;
-               //evprintf("  ev %d\n", dif);
-               if (0 < dif && dif < min)
-                       min = dif;
-       }
-       next_interupt = c + min;
-}
-
-typedef void (irq_func)();
-
-static irq_func * const irq_funcs[] = {
-       [PSXINT_SIO]    = sioInterrupt,
-       [PSXINT_CDR]    = cdrInterrupt,
-       [PSXINT_CDREAD] = cdrReadInterrupt,
-       [PSXINT_GPUDMA] = gpuInterrupt,
-       [PSXINT_MDECOUTDMA] = mdec1Interrupt,
-       [PSXINT_SPUDMA] = spuInterrupt,
-       [PSXINT_MDECINDMA] = mdec0Interrupt,
-       [PSXINT_GPUOTCDMA] = gpuotcInterrupt,
-       [PSXINT_CDRDMA] = cdrDmaInterrupt,
-       [PSXINT_CDRLID] = cdrLidSeekInterrupt,
-       [PSXINT_CDRPLAY] = cdrPlayInterrupt,
-       [PSXINT_SPU_UPDATE] = spuUpdate,
-       [PSXINT_RCNT] = psxRcntUpdate,
-};
-
-/* local dupe of psxBranchTest, using event_cycles */
-static void irq_test(void)
-{
-       u32 cycle = psxRegs.cycle;
-       u32 irq, irq_bits;
-
-       for (irq = 0, irq_bits = psxRegs.interrupt; irq_bits != 0; irq++, irq_bits >>= 1) {
-               if (!(irq_bits & 1))
-                       continue;
-               if ((s32)(cycle - event_cycles[irq]) >= 0) {
-                       // note: irq_funcs() also modify psxRegs.interrupt
-                       psxRegs.interrupt &= ~(1u << irq);
-                       irq_funcs[irq]();
-               }
-       }
-
-       if ((psxHu32(0x1070) & psxHu32(0x1074)) && (Status & 0x401) == 0x401) {
-               psxException(0x400, 0);
-               pending_exception = 1;
-       }
-}
-
-void gen_interupt()
-{
-       evprintf("  +ge %08x, %u->%u (%d)\n", psxRegs.pc, psxRegs.cycle,
-               next_interupt, next_interupt - psxRegs.cycle);
-
-       irq_test();
-       //psxBranchTest();
-       //pending_exception = 1;
-
-       schedule_timeslice();
-
-       evprintf("  -ge %08x, %u->%u (%d)\n", psxRegs.pc, psxRegs.cycle,
-               next_interupt, next_interupt - psxRegs.cycle);
-}
-
-// from interpreter
-extern void MTC0(int reg, u32 val);
-
-void pcsx_mtc0(u32 reg, u32 val)
-{
-       evprintf("MTC0 %d #%x @%08x %u\n", reg, val, psxRegs.pc, psxRegs.cycle);
-       MTC0(reg, val);
-       gen_interupt();
-       if (Cause & Status & 0x0300) // possible sw irq
-               pending_exception = 1;
-}
-
-void pcsx_mtc0_ds(u32 reg, u32 val)
-{
-       evprintf("MTC0 %d #%x @%08x %u\n", reg, val, psxRegs.pc, psxRegs.cycle);
-       MTC0(reg, val);
-}
-
-void new_dyna_before_save(void)
-{
-       psxRegs.interrupt &= ~(1 << PSXINT_RCNT); // old savestate compat
-
-       // psxRegs.intCycle is always maintained, no need to convert
-}
-
-void new_dyna_after_save(void)
-{
-       psxRegs.interrupt |= 1 << PSXINT_RCNT;
-}
-
-static void new_dyna_restore(void)
-{
-       int i;
-       for (i = 0; i < PSXINT_COUNT; i++)
-               event_cycles[i] = psxRegs.intCycle[i].sCycle + psxRegs.intCycle[i].cycle;
-
-       event_cycles[PSXINT_RCNT] = psxNextsCounter + psxNextCounter;
-       psxRegs.interrupt |=  1 << PSXINT_RCNT;
-       psxRegs.interrupt &= (1 << PSXINT_COUNT) - 1;
-
-       new_dyna_pcsx_mem_load_state();
-}
+static void ari64_thread_sync(void);
 
 
-void new_dyna_freeze(void *f, int mode)
+void ndrc_freeze(void *f, int mode)
 {
        const char header_save[8] = "ariblks";
        uint32_t addrs[1024 * 4];
 {
        const char header_save[8] = "ariblks";
        uint32_t addrs[1024 * 4];
@@ -152,6 +47,8 @@ void new_dyna_freeze(void *f, int mode)
        int bytes;
        char header[8];
 
        int bytes;
        char header[8];
 
+       ari64_thread_sync();
+
        if (mode != 0) { // save
                size = new_dynarec_save_blocks(addrs, sizeof(addrs));
                if (size == 0)
        if (mode != 0) { // save
                size = new_dynarec_save_blocks(addrs, sizeof(addrs));
                if (size == 0)
@@ -162,8 +59,6 @@ void new_dyna_freeze(void *f, int mode)
                SaveFuncs.write(f, addrs, size);
        }
        else {
                SaveFuncs.write(f, addrs, size);
        }
        else {
-               new_dyna_restore();
-
                bytes = SaveFuncs.read(f, header, sizeof(header));
                if (bytes != sizeof(header) || strcmp(header, header_save)) {
                        if (bytes > 0)
                bytes = SaveFuncs.read(f, header, sizeof(header));
                if (bytes != sizeof(header) || strcmp(header, header_save)) {
                        if (bytes > 0)
@@ -189,7 +84,30 @@ void new_dyna_freeze(void *f, int mode)
        //printf("drc: %d block info entries %s\n", size/8, mode ? "saved" : "loaded");
 }
 
        //printf("drc: %d block info entries %s\n", size/8, mode ? "saved" : "loaded");
 }
 
-#ifndef DRC_DISABLE
+void ndrc_clear_full(void)
+{
+       ari64_thread_sync();
+       new_dynarec_clear_full();
+}
+
+#if !defined(DRC_DISABLE) && !defined(LIGHTREC)
+#include "linkage_offsets.h"
+
+static void ari64_thread_init(void);
+static int  ari64_thread_check_range(unsigned int start, unsigned int end);
+
+void pcsx_mtc0(psxRegisters *regs, u32 reg, u32 val)
+{
+       evprintf("MTC0 %d #%x @%08x %u\n", reg, val, regs->pc, regs->cycle);
+       MTC0(regs, reg, val);
+       gen_interupt(&regs->CP0);
+}
+
+void pcsx_mtc0_ds(psxRegisters *regs, u32 reg, u32 val)
+{
+       evprintf("MTC0 %d #%x @%08x %u\n", reg, val, regs->pc, regs->cycle);
+       MTC0(regs, reg, val);
+}
 
 /* GTE stuff */
 void *gte_handlers[64];
 
 /* GTE stuff */
 void *gte_handlers[64];
@@ -292,118 +210,425 @@ const uint64_t gte_reg_writes[64] = {
        [GTE_NCCT]  = GDBITS9(9,10,11,20,21,22,25,26,27),
 };
 
        [GTE_NCCT]  = GDBITS9(9,10,11,20,21,22,25,26,27),
 };
 
-static int ari64_init()
-{
-       static u32 scratch_buf[8*8*2] __attribute__((aligned(64)));
-       extern void (*psxCP2[64])();
-       extern void psxNULL();
-       size_t i;
-
-       new_dynarec_init();
-       new_dyna_pcsx_mem_init();
-
-       for (i = 0; i < ARRAY_SIZE(gte_handlers); i++)
-               if (psxCP2[i] != psxNULL)
-                       gte_handlers[i] = psxCP2[i];
-
-#if defined(__arm__) && !defined(DRC_DBG)
-       gte_handlers[0x06] = gteNCLIP_arm;
-#ifdef HAVE_ARMV5
-       gte_handlers_nf[0x01] = gteRTPS_nf_arm;
-       gte_handlers_nf[0x30] = gteRTPT_nf_arm;
-#endif
-#ifdef __ARM_NEON__
-       // compiler's _nf version is still a lot slower than neon
-       // _nf_arm RTPS is roughly the same, RTPT slower
-       gte_handlers[0x01] = gte_handlers_nf[0x01] = gteRTPS_neon;
-       gte_handlers[0x30] = gte_handlers_nf[0x30] = gteRTPT_neon;
-#endif
-#endif
-#ifdef DRC_DBG
-       memcpy(gte_handlers_nf, gte_handlers, sizeof(gte_handlers_nf));
-#endif
-       psxH_ptr = psxH;
-       zeromem_ptr = zero_mem;
-       scratch_buf_ptr = scratch_buf;
-
-       return 0;
-}
-
 static void ari64_reset()
 {
 static void ari64_reset()
 {
-       printf("ari64_reset\n");
+       ari64_thread_sync();
        new_dyna_pcsx_mem_reset();
        new_dynarec_invalidate_all_pages();
        new_dyna_pcsx_mem_reset();
        new_dynarec_invalidate_all_pages();
-       new_dyna_restore();
-       pending_exception = 1;
+       new_dyna_pcsx_mem_load_state();
 }
 
 // execute until predefined leave points
 // (HLE softcall exit and BIOS fastboot end)
 }
 
 // execute until predefined leave points
 // (HLE softcall exit and BIOS fastboot end)
-static void ari64_execute_until()
+static void ari64_execute_until(psxRegisters *regs)
 {
 {
-       schedule_timeslice();
+       void *drc_local = (char *)regs - LO_psxRegs;
 
 
-       evprintf("ari64_execute %08x, %u->%u (%d)\n", psxRegs.pc,
-               psxRegs.cycle, next_interupt, next_interupt - psxRegs.cycle);
+       assert(drc_local == dynarec_local);
+       evprintf("+exec %08x, %u->%u (%d)\n", regs->pc, regs->cycle,
+               regs->next_interupt, regs->next_interupt - regs->cycle);
 
 
-       new_dyna_start(dynarec_local);
+       new_dyna_start(drc_local);
 
 
-       evprintf("ari64_execute end %08x, %u->%u (%d)\n", psxRegs.pc,
-               psxRegs.cycle, next_interupt, next_interupt - psxRegs.cycle);
+       evprintf("-exec %08x, %u->%u (%d) stop %d \n", regs->pc, regs->cycle,
+               regs->next_interupt, regs->next_interupt - regs->cycle, regs->stop);
 }
 
 }
 
-static void ari64_execute()
+static void ari64_execute(struct psxRegisters *regs)
 {
 {
-       while (!stop) {
-               ari64_execute_until();
-               evprintf("drc left @%08x\n", psxRegs.pc);
+       while (!regs->stop) {
+               schedule_timeslice(regs);
+               ari64_execute_until(regs);
+               evprintf("drc left @%08x\n", regs->pc);
        }
 }
 
        }
 }
 
+static void ari64_execute_block(struct psxRegisters *regs, enum blockExecCaller caller)
+{
+       if (caller == EXEC_CALLER_BOOT)
+               regs->stop++;
+
+       regs->next_interupt = regs->cycle + 1;
+       ari64_execute_until(regs);
+
+       if (caller == EXEC_CALLER_BOOT)
+               regs->stop--;
+}
+
 static void ari64_clear(u32 addr, u32 size)
 {
 static void ari64_clear(u32 addr, u32 size)
 {
-       size *= 4; /* PCSX uses DMA units (words) */
+       u32 end = addr + size * 4; /* PCSX uses DMA units (words) */
 
 
-       evprintf("ari64_clear %08x %04x\n", addr, size);
+       evprintf("ari64_clear %08x %04x\n", addr, size * 4);
 
 
-       new_dynarec_invalidate_range(addr, addr + size);
+       if (!new_dynarec_quick_check_range(addr, end) &&
+           !ari64_thread_check_range(addr, end))
+               return;
+
+       ari64_thread_sync();
+       new_dynarec_invalidate_range(addr, end);
+}
+
+static void ari64_on_ext_change(int ram_replaced, int other_cpu_emu_exec)
+{
+       if (ram_replaced)
+               ari64_reset();
+       else if (other_cpu_emu_exec)
+               new_dyna_pcsx_mem_load_state();
 }
 
 }
 
-static void ari64_notify(int note, void *data) {
-       /*
-       Should be fixed when ARM dynarec has proper icache emulation.
+static void ari64_notify(enum R3000Anote note, void *data) {
        switch (note)
        {
        switch (note)
        {
-               case R3000ACPU_NOTIFY_CACHE_UNISOLATED:
-                       break;
-               case R3000ACPU_NOTIFY_CACHE_ISOLATED:
-               Sent from psxDma3().
-               case R3000ACPU_NOTIFY_DMA3_EXE_LOAD:
-               default:
-                       break;
+       case R3000ACPU_NOTIFY_CACHE_UNISOLATED:
+       case R3000ACPU_NOTIFY_CACHE_ISOLATED:
+               new_dyna_pcsx_mem_isolate(note == R3000ACPU_NOTIFY_CACHE_ISOLATED);
+               break;
+       case R3000ACPU_NOTIFY_BEFORE_SAVE:
+               break;
+       case R3000ACPU_NOTIFY_AFTER_LOAD:
+               ari64_on_ext_change(data == NULL, 0);
+               psxInt.Notify(note, data);
+               break;
        }
        }
-       */
 }
 
 static void ari64_apply_config()
 {
 }
 
 static void ari64_apply_config()
 {
+       int thread_changed;
+
+       ari64_thread_sync();
        intApplyConfig();
 
        if (Config.DisableStalls)
        intApplyConfig();
 
        if (Config.DisableStalls)
-               new_dynarec_hacks |= NDHACK_NO_STALLS;
+               ndrc_g.hacks |= NDHACK_NO_STALLS;
        else
        else
-               new_dynarec_hacks &= ~NDHACK_NO_STALLS;
+               ndrc_g.hacks &= ~NDHACK_NO_STALLS;
 
 
-       if (cycle_multiplier != cycle_multiplier_old
-           || new_dynarec_hacks != new_dynarec_hacks_old)
+       thread_changed = ((ndrc_g.hacks | ndrc_g.hacks_pergame) ^ ndrc_g.hacks_old)
+               & (NDHACK_THREAD_FORCE | NDHACK_THREAD_FORCE_ON);
+       if (Config.cycle_multiplier != ndrc_g.cycle_multiplier_old
+           || (ndrc_g.hacks | ndrc_g.hacks_pergame) != ndrc_g.hacks_old)
        {
                new_dynarec_clear_full();
        }
        {
                new_dynarec_clear_full();
        }
+       if (thread_changed)
+               ari64_thread_init();
+}
+
+#ifdef NDRC_THREAD
+static void clear_local_cache(void)
+{
+#if defined(__arm__) || defined(__aarch64__)
+       if (ndrc_g.thread.dirty_start) {
+               // see "Ensuring the visibility of updates to instructions"
+               // in v7/v8 reference manuals (DDI0406, DDI0487 etc.)
+#if defined(__aarch64__) || defined(HAVE_ARMV8)
+               // the actual clean/invalidate is broadcast to all cores,
+               // the manual only prescribes an isb
+               __asm__ volatile("isb");
+//#elif defined(_3DS)
+//             ctr_invalidate_icache();
+#else
+               // while on v6 this is always required, on v7 it depends on
+               // "Multiprocessing Extensions" being present, but that is difficult
+               // to detect so do it always for now
+               new_dyna_clear_cache(ndrc_g.thread.dirty_start, ndrc_g.thread.dirty_end);
+#endif
+               ndrc_g.thread.dirty_start = ndrc_g.thread.dirty_end = 0;
+       }
+#endif
+}
+
+static void mixed_execute_block(struct psxRegisters *regs, enum blockExecCaller caller)
+{
+       psxInt.ExecuteBlock(regs, caller);
+}
+
+static void mixed_clear(u32 addr, u32 size)
+{
+       ari64_clear(addr, size);
+       psxInt.Clear(addr, size);
+}
+
+static void mixed_notify(enum R3000Anote note, void *data)
+{
+       ari64_notify(note, data);
+       psxInt.Notify(note, data);
+}
+
+static R3000Acpu psxMixedCpu = {
+       NULL /* Init */, NULL /* Reset */, NULL /* Execute */,
+       mixed_execute_block,
+       mixed_clear,
+       mixed_notify,
+       NULL /* ApplyConfig */, NULL /* Shutdown */
+};
+
+static noinline void ari64_execute_threaded_slow(struct psxRegisters *regs,
+       enum blockExecCaller block_caller)
+{
+       if (ndrc_g.thread.busy_addr == ~0u) {
+               memcpy(ndrc_smrv_regs, regs->GPR.r, sizeof(ndrc_smrv_regs));
+               slock_lock(ndrc_g.thread.lock);
+               ndrc_g.thread.busy_addr = regs->pc;
+               slock_unlock(ndrc_g.thread.lock);
+               scond_signal(ndrc_g.thread.cond);
+       }
+
+       //ari64_notify(R3000ACPU_NOTIFY_BEFORE_SAVE, NULL);
+       psxInt.Notify(R3000ACPU_NOTIFY_AFTER_LOAD, NULL);
+       assert(psxCpu == &psxRec);
+       psxCpu = &psxMixedCpu;
+       for (;;)
+       {
+               mixed_execute_block(regs, block_caller);
+
+               if (ndrc_g.thread.busy_addr == ~0u)
+                       break;
+               if (block_caller == EXEC_CALLER_HLE) {
+                       if (!psxBiosSoftcallEnded())
+                               continue;
+                       break;
+               }
+               else if (block_caller == EXEC_CALLER_BOOT) {
+                       if (!psxExecuteBiosEnded())
+                               continue;
+                       break;
+               }
+               if (regs->stop)
+                       break;
+       }
+       psxCpu = &psxRec;
+
+       psxInt.Notify(R3000ACPU_NOTIFY_BEFORE_SAVE, NULL);
+       //ari64_notify(R3000ACPU_NOTIFY_AFTER_LOAD, NULL);
+       ari64_on_ext_change(0, 1);
+}
+
+static void ari64_execute_threaded_once(struct psxRegisters *regs,
+       enum blockExecCaller block_caller)
+{
+       void *drc_local = (char *)regs - LO_psxRegs;
+       struct ht_entry *hash_table =
+               *(void **)((char *)drc_local + LO_hash_table_ptr);
+       void *target;
+
+       if (likely(ndrc_g.thread.busy_addr == ~0u)) {
+               target = ndrc_get_addr_ht_param(hash_table, regs->pc,
+                               ndrc_cm_no_compile);
+               if (target) {
+                       clear_local_cache();
+                       new_dyna_start_at(drc_local, target);
+                       return;
+               }
+       }
+       ari64_execute_threaded_slow(regs, block_caller);
+}
+
+static void ari64_execute_threaded(struct psxRegisters *regs)
+{
+       schedule_timeslice(regs);
+       while (!regs->stop)
+       {
+               ari64_execute_threaded_once(regs, EXEC_CALLER_OTHER);
+
+               if ((s32)(regs->cycle - regs->next_interupt) >= 0)
+                       schedule_timeslice(regs);
+       }
+}
+
+static void ari64_execute_threaded_block(struct psxRegisters *regs,
+       enum blockExecCaller caller)
+{
+       if (caller == EXEC_CALLER_BOOT)
+               regs->stop++;
+
+       regs->next_interupt = regs->cycle + 1;
+
+       ari64_execute_threaded_once(regs, caller);
+       if (regs->cpuInRecursion) {
+               // must sync since we are returning to compiled code
+               ari64_thread_sync();
+       }
+
+       if (caller == EXEC_CALLER_BOOT)
+               regs->stop--;
+}
+
+static void ari64_thread_sync(void)
+{
+       if (!ndrc_g.thread.lock || ndrc_g.thread.busy_addr == ~0u)
+               return;
+       for (;;) {
+               slock_lock(ndrc_g.thread.lock);
+               slock_unlock(ndrc_g.thread.lock);
+               if (ndrc_g.thread.busy_addr == ~0)
+                       break;
+               retro_sleep(0);
+       }
+}
+
+static int ari64_thread_check_range(unsigned int start, unsigned int end)
+{
+       u32 addr = ndrc_g.thread.busy_addr;
+       if (addr == ~0u)
+               return 0;
+
+       addr &= 0x1fffffff;
+       start &= 0x1fffffff;
+       end &= 0x1fffffff;
+       if (addr >= end)
+               return 0;
+       if (addr + MAXBLOCK * 4 <= start)
+               return 0;
+
+       //SysPrintf("%x hits %x-%x\n", addr, start, end);
+       return 1;
+}
+
+static void ari64_compile_thread(void *unused)
+{
+       struct ht_entry *hash_table =
+               *(void **)((char *)dynarec_local + LO_hash_table_ptr);
+       void *target;
+       u32 addr;
+
+       slock_lock(ndrc_g.thread.lock);
+       while (!ndrc_g.thread.exit)
+       {
+               addr = *(volatile unsigned int *)&ndrc_g.thread.busy_addr;
+               if (addr == ~0u)
+                       scond_wait(ndrc_g.thread.cond, ndrc_g.thread.lock);
+               addr = *(volatile unsigned int *)&ndrc_g.thread.busy_addr;
+               if (addr == ~0u || ndrc_g.thread.exit)
+                       continue;
+
+               target = ndrc_get_addr_ht_param(hash_table, addr,
+                               ndrc_cm_compile_in_thread);
+               //printf("c  %08x -> %p\n", addr, target);
+               ndrc_g.thread.busy_addr = ~0u;
+       }
+       slock_unlock(ndrc_g.thread.lock);
+       (void)target;
+}
+
+static void ari64_thread_shutdown(void)
+{
+       psxRec.Execute = ari64_execute;
+       psxRec.ExecuteBlock = ari64_execute_block;
+
+       if (ndrc_g.thread.lock)
+               slock_lock(ndrc_g.thread.lock);
+       ndrc_g.thread.exit = 1;
+       if (ndrc_g.thread.lock)
+               slock_unlock(ndrc_g.thread.lock);
+       if (ndrc_g.thread.cond)
+               scond_signal(ndrc_g.thread.cond);
+       if (ndrc_g.thread.handle) {
+               sthread_join(ndrc_g.thread.handle);
+               ndrc_g.thread.handle = NULL;
+       }
+       if (ndrc_g.thread.cond) {
+               scond_free(ndrc_g.thread.cond);
+               ndrc_g.thread.cond = NULL;
+       }
+       if (ndrc_g.thread.lock) {
+               slock_free(ndrc_g.thread.lock);
+               ndrc_g.thread.lock = NULL;
+       }
+       ndrc_g.thread.busy_addr = ~0u;
+}
+
+static void ari64_thread_init(void)
+{
+       int enable;
+
+       if (ndrc_g.hacks_pergame & NDHACK_THREAD_FORCE)
+               enable = 0;
+       else if (ndrc_g.hacks & NDHACK_THREAD_FORCE)
+               enable = ndrc_g.hacks & NDHACK_THREAD_FORCE_ON;
+       else {
+               u32 cpu_count = cpu_features_get_core_amount();
+               enable = cpu_count > 1;
+#ifdef _3DS
+               // bad for old3ds, reprotedly no improvement for new3ds
+               enable = 0;
+#endif
+       }
+
+       if (!ndrc_g.thread.handle == !enable)
+               return;
+
+       ari64_thread_shutdown();
+       ndrc_g.thread.exit = 0;
+       ndrc_g.thread.busy_addr = ~0u;
+
+       if (enable) {
+               ndrc_g.thread.lock = slock_new();
+               ndrc_g.thread.cond = scond_new();
+       }
+       if (ndrc_g.thread.lock && ndrc_g.thread.cond)
+               ndrc_g.thread.handle = pcsxr_sthread_create(ari64_compile_thread, PCSXRT_DRC);
+       if (ndrc_g.thread.handle) {
+               psxRec.Execute = ari64_execute_threaded;
+               psxRec.ExecuteBlock = ari64_execute_threaded_block;
+       }
+       else {
+               // clean up potential incomplete init
+               ari64_thread_shutdown();
+       }
+       SysPrintf("compiler thread %sabled\n", ndrc_g.thread.handle ? "en" : "dis");
+}
+#else // if !NDRC_THREAD
+static void ari64_thread_init(void) {}
+static void ari64_thread_shutdown(void) {}
+static int ari64_thread_check_range(unsigned int start, unsigned int end) { return 0; }
+#endif
+
+static int ari64_init()
+{
+       static u32 scratch_buf[8*8*2] __attribute__((aligned(64)));
+       size_t i;
+
+       new_dynarec_init();
+       new_dyna_pcsx_mem_init();
+
+       for (i = 0; i < ARRAY_SIZE(gte_handlers); i++)
+               if (psxCP2[i] != gteNULL)
+                       gte_handlers[i] = psxCP2[i];
+
+#if defined(__arm__) && !defined(DRC_DBG)
+       gte_handlers[0x06] = gteNCLIP_arm;
+#ifdef HAVE_ARMV5
+       gte_handlers_nf[0x01] = gteRTPS_nf_arm;
+       gte_handlers_nf[0x30] = gteRTPT_nf_arm;
+#endif
+#ifdef __ARM_NEON__
+       // compiler's _nf version is still a lot slower than neon
+       // _nf_arm RTPS is roughly the same, RTPT slower
+       gte_handlers[0x01] = gte_handlers_nf[0x01] = gteRTPS_neon;
+       gte_handlers[0x30] = gte_handlers_nf[0x30] = gteRTPT_neon;
+#endif
+#endif
+#ifdef DRC_DBG
+       memcpy(gte_handlers_nf, gte_handlers, sizeof(gte_handlers_nf));
+#endif
+       psxH_ptr = psxH;
+       zeromem_ptr = zero_mem;
+       scratch_buf_ptr = scratch_buf; // for gte_neon.S
+
+       ndrc_g.cycle_multiplier_old = Config.cycle_multiplier;
+       ndrc_g.hacks_old = ndrc_g.hacks | ndrc_g.hacks_pergame;
+       ari64_apply_config();
+       ari64_thread_init();
+
+       return 0;
 }
 
 static void ari64_shutdown()
 {
 }
 
 static void ari64_shutdown()
 {
+       ari64_thread_shutdown();
        new_dynarec_cleanup();
        new_dyna_pcsx_mem_shutdown();
 }
        new_dynarec_cleanup();
        new_dyna_pcsx_mem_shutdown();
 }
@@ -412,7 +637,7 @@ R3000Acpu psxRec = {
        ari64_init,
        ari64_reset,
        ari64_execute,
        ari64_init,
        ari64_reset,
        ari64_execute,
-       ari64_execute_until,
+       ari64_execute_block,
        ari64_clear,
        ari64_notify,
        ari64_apply_config,
        ari64_clear,
        ari64_notify,
        ari64_apply_config,
@@ -421,21 +646,7 @@ R3000Acpu psxRec = {
 
 #else // if DRC_DISABLE
 
 
 #else // if DRC_DISABLE
 
-unsigned int address;
-int pending_exception, stop;
-unsigned int next_interupt;
-int new_dynarec_did_compile;
-int cycle_multiplier;
-int cycle_multiplier_override;
-int cycle_multiplier_old;
-int new_dynarec_hacks_pergame;
-int new_dynarec_hacks_old;
-int new_dynarec_hacks;
-void *psxH_ptr;
-void *zeromem_ptr;
-u8 zero_mem[0x1000];
-void *mem_rtab;
-void *scratch_buf_ptr;
+struct ndrc_globals ndrc_g; // dummy
 void new_dynarec_init() {}
 void new_dyna_start(void *context) {}
 void new_dynarec_cleanup() {}
 void new_dynarec_init() {}
 void new_dyna_start(void *context) {}
 void new_dynarec_cleanup() {}
@@ -445,9 +656,15 @@ void new_dynarec_invalidate_range(unsigned int start, unsigned int end) {}
 void new_dyna_pcsx_mem_init(void) {}
 void new_dyna_pcsx_mem_reset(void) {}
 void new_dyna_pcsx_mem_load_state(void) {}
 void new_dyna_pcsx_mem_init(void) {}
 void new_dyna_pcsx_mem_reset(void) {}
 void new_dyna_pcsx_mem_load_state(void) {}
+void new_dyna_pcsx_mem_isolate(int enable) {}
 void new_dyna_pcsx_mem_shutdown(void) {}
 int  new_dynarec_save_blocks(void *save, int size) { return 0; }
 void new_dynarec_load_blocks(const void *save, int size) {}
 void new_dyna_pcsx_mem_shutdown(void) {}
 int  new_dynarec_save_blocks(void *save, int size) { return 0; }
 void new_dynarec_load_blocks(const void *save, int size) {}
+
+#endif // DRC_DISABLE
+
+#ifndef NDRC_THREAD
+static void ari64_thread_sync(void) {}
 #endif
 
 #ifdef DRC_DBG
 #endif
 
 #ifdef DRC_DBG
@@ -508,12 +725,12 @@ void do_insn_trace(void)
        }
        // log event changes
        for (i = 0; i < PSXINT_COUNT; i++) {
        }
        // log event changes
        for (i = 0; i < PSXINT_COUNT; i++) {
-               if (event_cycles[i] != event_cycles_o[i]) {
+               if (psxRegs.event_cycles[i] != event_cycles_o[i]) {
                        byte = 0xf8;
                        fwrite(&byte, 1, 1, f);
                        fwrite(&i, 1, 1, f);
                        byte = 0xf8;
                        fwrite(&byte, 1, 1, f);
                        fwrite(&i, 1, 1, f);
-                       fwrite(&event_cycles[i], 1, 4, f);
-                       event_cycles_o[i] = event_cycles[i];
+                       fwrite(&psxRegs.event_cycles[i], 1, 4, f);
+                       event_cycles_o[i] = psxRegs.event_cycles[i];
                }
        }
        #define SAVE_IF_CHANGED(code_, name_) { \
                }
        }
        #define SAVE_IF_CHANGED(code_, name_) { \
@@ -598,7 +815,9 @@ void do_insn_cmp(void)
        static u32 handler_cycle_intr;
        u32 *allregs_p = (void *)&psxRegs;
        u32 *allregs_e = (void *)&rregs;
        static u32 handler_cycle_intr;
        u32 *allregs_p = (void *)&psxRegs;
        u32 *allregs_e = (void *)&rregs;
+       u32 badregs_mask = 0;
        static u32 ppc, failcount;
        static u32 ppc, failcount;
+       static u32 badregs_mask_prev;
        int i, ret, bad = 0, fatal = 0, which_event = -1;
        u32 ev_cycles = 0;
        u8 code;
        int i, ret, bad = 0, fatal = 0, which_event = -1;
        u32 ev_cycles = 0;
        u8 code;
@@ -648,9 +867,9 @@ void do_insn_cmp(void)
 
        //if (psxRegs.cycle == 166172) breakme();
 
 
        //if (psxRegs.cycle == 166172) breakme();
 
-       if (which_event >= 0 && event_cycles[which_event] != ev_cycles) {
+       if (which_event >= 0 && psxRegs.event_cycles[which_event] != ev_cycles) {
                printf("bad ev_cycles #%d: %u %u / %u\n", which_event,
                printf("bad ev_cycles #%d: %u %u / %u\n", which_event,
-                       event_cycles[which_event], ev_cycles, psxRegs.cycle);
+                       psxRegs.event_cycles[which_event], ev_cycles, psxRegs.cycle);
                fatal = 1;
        }
 
                fatal = 1;
        }
 
@@ -678,18 +897,24 @@ void do_insn_cmp(void)
                if (allregs_p[i] != allregs_e[i]) {
                        miss_log_add(i, allregs_p[i], allregs_e[i], psxRegs.pc, psxRegs.cycle);
                        bad++;
                if (allregs_p[i] != allregs_e[i]) {
                        miss_log_add(i, allregs_p[i], allregs_e[i], psxRegs.pc, psxRegs.cycle);
                        bad++;
-                       if (i > 32+2)
+                       if (i >= 32)
                                fatal = 1;
                                fatal = 1;
+                       else
+                               badregs_mask |= 1u << i;
                }
        }
 
                }
        }
 
-       if (!fatal && psxRegs.pc == rregs.pc && bad < 6 && failcount < 32) {
+       if (badregs_mask_prev & badregs_mask)
+               failcount++;
+       else
+               failcount = 0;
+
+       if (!fatal && psxRegs.pc == rregs.pc && bad < 6 && failcount < 24) {
                static int last_mcycle;
                if (last_mcycle != psxRegs.cycle >> 20) {
                        printf("%u\n", psxRegs.cycle);
                        last_mcycle = psxRegs.cycle >> 20;
                }
                static int last_mcycle;
                if (last_mcycle != psxRegs.cycle >> 20) {
                        printf("%u\n", psxRegs.cycle);
                        last_mcycle = psxRegs.cycle >> 20;
                }
-               failcount++;
                goto ok;
        }
 
                goto ok;
        }
 
@@ -701,13 +926,15 @@ void do_insn_cmp(void)
        for (i = 0; i < 8; i++)
                printf("r%d=%08x r%2d=%08x r%2d=%08x r%2d=%08x\n", i, allregs_p[i],
                        i+8, allregs_p[i+8], i+16, allregs_p[i+16], i+24, allregs_p[i+24]);
        for (i = 0; i < 8; i++)
                printf("r%d=%08x r%2d=%08x r%2d=%08x r%2d=%08x\n", i, allregs_p[i],
                        i+8, allregs_p[i+8], i+16, allregs_p[i+16], i+24, allregs_p[i+24]);
-       printf("PC: %08x/%08x, cycle %u, next %u\n", psxRegs.pc, ppc, psxRegs.cycle, next_interupt);
+       printf("PC: %08x/%08x, cycle %u, next %u\n", psxRegs.pc, ppc,
+               psxRegs.cycle, psxRegs.next_interupt);
        //dump_mem("/tmp/psxram.dump", psxM, 0x200000);
        //dump_mem("/mnt/ntz/dev/pnd/tmp/psxregs.dump", psxH, 0x10000);
        exit(1);
 ok:
        //psxRegs.cycle = rregs.cycle + 2; // sync timing
        ppc = psxRegs.pc;
        //dump_mem("/tmp/psxram.dump", psxM, 0x200000);
        //dump_mem("/mnt/ntz/dev/pnd/tmp/psxregs.dump", psxH, 0x10000);
        exit(1);
 ok:
        //psxRegs.cycle = rregs.cycle + 2; // sync timing
        ppc = psxRegs.pc;
+       badregs_mask_prev = badregs_mask;
 }
 
 }
 
-#endif
+#endif // DRC_DBG