+ if (thread_changed)
+ ari64_thread_init();
+}
+
+#ifdef NDRC_THREAD
// Make code emitted by the compile thread visible to this core's
// instruction stream before jumping into it. No-op on non-ARM builds.
static void clear_local_cache(void)
{
#if defined(__arm__) || defined(__aarch64__)
	// dirty_start/dirty_end delimit the range written by the worker;
	// nonzero dirty_start means there is something to flush
	if (ndrc_g.thread.dirty_start) {
		// see "Ensuring the visibility of updates to instructions"
		// in v7/v8 reference manuals (DDI0406, DDI0487 etc.)
#if defined(__aarch64__) || defined(HAVE_ARMV8)
		// the actual clean/invalidate is broadcast to all cores,
		// the manual only prescribes an isb
		__asm__ volatile("isb");
//#elif defined(_3DS)
//		ctr_invalidate_icache();
#else
		// while on v6 this is always required, on v7 it depends on
		// "Multiprocessing Extensions" being present, but that is difficult
		// to detect so do it always for now
		new_dyna_clear_cache(ndrc_g.thread.dirty_start, ndrc_g.thread.dirty_end);
#endif
		// reset the range; the writer will extend it again as it emits code
		ndrc_g.thread.dirty_start = ndrc_g.thread.dirty_end = 0;
	}
#endif
}
+
// Run one block on the interpreter; used by psxMixedCpu while the compile
// thread owns the recompiler (see ari64_execute_threaded_slow()).
static void mixed_execute_block(struct psxRegisters *regs, enum blockExecCaller caller)
{
	psxInt.ExecuteBlock(regs, caller);
}
+
// Forward a code-invalidation request to both backends so neither keeps
// stale state for the modified range while running in mixed mode.
static void mixed_clear(u32 addr, u32 size)
{
	ari64_clear(addr, size);
	psxInt.Clear(addr, size);
}
+
// Forward notifications to both backends while running in mixed mode,
// keeping dynarec and interpreter state coherent.
static void mixed_notify(enum R3000Anote note, void *data)
{
	ari64_notify(note, data);
	psxInt.Notify(note, data);
}
+
+static R3000Acpu psxMixedCpu = {
+ NULL /* Init */, NULL /* Reset */, NULL /* Execute */,
+ mixed_execute_block,
+ mixed_clear,
+ mixed_notify,
+ NULL /* ApplyConfig */, NULL /* Shutdown */
+};
+
// Slow path for threaded mode: the block at regs->pc is not compiled yet.
// Hand the address to the compile thread, then run the interpreter (via
// psxMixedCpu) until the compiled code is ready or the nested execution
// (HLE softcall / BIOS boot) completes.
static noinline void ari64_execute_threaded_slow(struct psxRegisters *regs,
	enum blockExecCaller block_caller)
{
	// busy_addr == ~0u means the worker is idle; give it a new job.
	if (ndrc_g.thread.busy_addr == ~0u) {
		// capture the current GPR values for the compile thread
		// (NOTE(review): presumably consumed by the compiler for value
		// tracking — taken before signaling so it sees a consistent set)
		memcpy(ndrc_smrv_regs, regs->GPR.r, sizeof(ndrc_smrv_regs));
		// write busy_addr under the lock, then wake the worker
		slock_lock(ndrc_g.thread.lock);
		ndrc_g.thread.busy_addr = regs->pc;
		slock_unlock(ndrc_g.thread.lock);
		scond_signal(ndrc_g.thread.cond);
	}

	// prime the interpreter and swap in the mixed backend for the wait
	//ari64_notify(R3000ACPU_NOTIFY_BEFORE_SAVE, NULL);
	psxInt.Notify(R3000ACPU_NOTIFY_AFTER_LOAD, NULL);
	assert(psxCpu == &psxRec);
	psxCpu = &psxMixedCpu;
	for (;;)
	{
		mixed_execute_block(regs, block_caller);

		// worker finished compiling — back to the fast path
		if (ndrc_g.thread.busy_addr == ~0u)
			break;
		// for nested callers, also stop once the sub-execution is done,
		// regardless of compilation progress
		if (block_caller == EXEC_CALLER_HLE) {
			if (!psxBiosSoftcallEnded())
				continue;
			break;
		}
		else if (block_caller == EXEC_CALLER_BOOT) {
			if (!psxExecuteBiosEnded())
				continue;
			break;
		}
		if (regs->stop)
			break;
	}
	psxCpu = &psxRec;

	// flush interpreter state and tell the dynarec registers changed
	psxInt.Notify(R3000ACPU_NOTIFY_BEFORE_SAVE, NULL);
	//ari64_notify(R3000ACPU_NOTIFY_AFTER_LOAD, NULL);
	ari64_on_ext_change(0, 1);
}
+
// Fast path for threaded mode: look the block up in the hash table without
// compiling; on a miss (or while the worker is busy) fall back to the
// interpreter-backed slow path.
static void ari64_execute_threaded_once(struct psxRegisters *regs,
	enum blockExecCaller block_caller)
{
	// dynarec_local sits at a fixed offset from psxRegs; the LO_* linkage
	// offsets describe that layout (shared with the asm side)
	void *drc_local = (char *)regs - LO_psxRegs;
	struct ht_entry *hash_table =
		*(void **)((char *)drc_local + LO_hash_table_ptr);
	void *target;

	if (likely(ndrc_g.thread.busy_addr == ~0u)) {
		// lookup only — compilation is the worker thread's job
		target = ndrc_get_addr_ht_param(hash_table, regs->pc,
			ndrc_cm_no_compile);
		if (target) {
			// make worker-written code visible to this core first
			clear_local_cache();
			new_dyna_start_at(drc_local, target);
			return;
		}
	}
	ari64_execute_threaded_slow(regs, block_caller);
}
+
+static void ari64_execute_threaded(struct psxRegisters *regs)
+{
+ schedule_timeslice(regs);
+ while (!regs->stop)
+ {
+ ari64_execute_threaded_once(regs, EXEC_CALLER_OTHER);
+
+ if ((s32)(regs->cycle - regs->next_interupt) >= 0)
+ schedule_timeslice(regs);
+ }
+}
+
+static void ari64_execute_threaded_block(struct psxRegisters *regs,
+ enum blockExecCaller caller)
+{
+ if (caller == EXEC_CALLER_BOOT)
+ regs->stop++;
+
+ regs->next_interupt = regs->cycle + 1;
+
+ ari64_execute_threaded_once(regs, caller);
+ if (regs->cpuInRecursion) {
+ // must sync since we are returning to compiled code
+ ari64_thread_sync();
+ }
+
+ if (caller == EXEC_CALLER_BOOT)
+ regs->stop--;
+}
+
+static void ari64_thread_sync(void)
+{
+ if (!ndrc_g.thread.lock || ndrc_g.thread.busy_addr == ~0u)
+ return;
+ for (;;) {
+ slock_lock(ndrc_g.thread.lock);
+ slock_unlock(ndrc_g.thread.lock);
+ if (ndrc_g.thread.busy_addr == ~0)
+ break;
+ retro_sleep(0);
+ }
+}
+
+static int ari64_thread_check_range(unsigned int start, unsigned int end)
+{
+ u32 addr = ndrc_g.thread.busy_addr;
+ if (addr == ~0u)
+ return 0;
+
+ addr &= 0x1fffffff;
+ start &= 0x1fffffff;
+ end &= 0x1fffffff;
+ if (addr >= end)
+ return 0;
+ if (addr + MAXBLOCK * 4 <= start)
+ return 0;
+
+ //SysPrintf("%x hits %x-%x\n", addr, start, end);
+ return 1;
+}
+
// Compile-thread main loop: wait for an address in thread.busy_addr,
// compile that block, then mark the slot idle (~0u) again.
// The lock is held for the whole loop except while blocked in scond_wait(),
// which is what lets ari64_thread_sync() use lock+unlock as a barrier.
static void ari64_compile_thread(void *unused)
{
	struct ht_entry *hash_table =
		*(void **)((char *)dynarec_local + LO_hash_table_ptr);
	void *target;
	u32 addr;

	slock_lock(ndrc_g.thread.lock);
	while (!ndrc_g.thread.exit)
	{
		// volatile read: busy_addr is written by the emu thread
		addr = *(volatile unsigned int *)&ndrc_g.thread.busy_addr;
		if (addr == ~0u)
			scond_wait(ndrc_g.thread.cond, ndrc_g.thread.lock);
		// re-read after waking: the wakeup may be spurious or may be
		// the shutdown signal rather than a new job
		addr = *(volatile unsigned int *)&ndrc_g.thread.busy_addr;
		if (addr == ~0u || ndrc_g.thread.exit)
			continue;

		// compile while still holding the lock
		target = ndrc_get_addr_ht_param(hash_table, addr,
			ndrc_cm_compile_in_thread);
		//printf("c %08x -> %p\n", addr, target);
		ndrc_g.thread.busy_addr = ~0u; // job done, back to idle
	}
	slock_unlock(ndrc_g.thread.lock);
	(void)target; // only consumed by the commented-out debug print
}
+
// Stop and tear down the compile thread, restoring the single-threaded
// entry points. Safe to call when the thread was never (or only partially)
// created: every resource is NULL-checked before use.
static void ari64_thread_shutdown(void)
{
	psxRec.Execute = ari64_execute;
	psxRec.ExecuteBlock = ari64_execute_block;

	// set exit under the lock so the worker cannot miss it between its
	// check and scond_wait()
	if (ndrc_g.thread.lock)
		slock_lock(ndrc_g.thread.lock);
	ndrc_g.thread.exit = 1;
	if (ndrc_g.thread.lock)
		slock_unlock(ndrc_g.thread.lock);
	if (ndrc_g.thread.cond)
		scond_signal(ndrc_g.thread.cond);
	// join before freeing the primitives the worker is still using
	if (ndrc_g.thread.handle) {
		sthread_join(ndrc_g.thread.handle);
		ndrc_g.thread.handle = NULL;
	}
	if (ndrc_g.thread.cond) {
		scond_free(ndrc_g.thread.cond);
		ndrc_g.thread.cond = NULL;
	}
	if (ndrc_g.thread.lock) {
		slock_free(ndrc_g.thread.lock);
		ndrc_g.thread.lock = NULL;
	}
	// leave the idle marker in place for the threaded fast path checks
	ndrc_g.thread.busy_addr = ~0u;
}
+
// Create or destroy the compile thread to match the current settings:
// per-game hack override wins, then the global force flag, otherwise the
// thread is enabled only on multi-core hardware.
static void ari64_thread_init(void)
{
	int enable;

	if (ndrc_g.hacks_pergame & NDHACK_THREAD_FORCE)
		enable = 0;
	else if (ndrc_g.hacks & NDHACK_THREAD_FORCE)
		enable = ndrc_g.hacks & NDHACK_THREAD_FORCE_ON;
	else {
		u32 cpu_count = cpu_features_get_core_amount();
		enable = cpu_count > 1;
#ifdef _3DS
		// bad for old3ds, reportedly no improvement for new3ds
		enable = 0;
#endif
	}

	// already in the requested state (both-off or both-on)? nothing to do
	if (!ndrc_g.thread.handle == !enable)
		return;

	ari64_thread_shutdown();
	ndrc_g.thread.exit = 0;
	ndrc_g.thread.busy_addr = ~0u;

	if (enable) {
		ndrc_g.thread.lock = slock_new();
		ndrc_g.thread.cond = scond_new();
	}
	// only start the thread if both primitives were created
	if (ndrc_g.thread.lock && ndrc_g.thread.cond)
		ndrc_g.thread.handle = pcsxr_sthread_create(ari64_compile_thread, PCSXRT_DRC);
	if (ndrc_g.thread.handle) {
		// switch the recompiler entry points to the threaded variants
		psxRec.Execute = ari64_execute_threaded;
		psxRec.ExecuteBlock = ari64_execute_threaded_block;
	}
	else {
		// clean up potential incomplete init
		ari64_thread_shutdown();
	}
	SysPrintf("compiler thread %sabled\n", ndrc_g.thread.handle ? "en" : "dis");
}
+#else // if !NDRC_THREAD
// Stubs for builds without NDRC_THREAD: threading is never enabled and
// no address range is ever being compiled in the background.
static void ari64_thread_init(void) {}
static void ari64_thread_shutdown(void) {}
static int ari64_thread_check_range(unsigned int start, unsigned int end) { return 0; }
+#endif
+
+static int ari64_init()
+{
+ static u32 scratch_buf[8*8*2] __attribute__((aligned(64)));
+ size_t i;
+
+ new_dynarec_init();
+ new_dyna_pcsx_mem_init();
+
+ for (i = 0; i < ARRAY_SIZE(gte_handlers); i++)
+ if (psxCP2[i] != gteNULL)
+ gte_handlers[i] = psxCP2[i];
+
+#if defined(__arm__) && !defined(DRC_DBG)
+ gte_handlers[0x06] = gteNCLIP_arm;
+#ifdef HAVE_ARMV5
+ gte_handlers_nf[0x01] = gteRTPS_nf_arm;
+ gte_handlers_nf[0x30] = gteRTPT_nf_arm;
+#endif
+#ifdef __ARM_NEON__
+ // compiler's _nf version is still a lot slower than neon
+ // _nf_arm RTPS is roughly the same, RTPT slower
+ gte_handlers[0x01] = gte_handlers_nf[0x01] = gteRTPS_neon;
+ gte_handlers[0x30] = gte_handlers_nf[0x30] = gteRTPT_neon;
+#endif
+#endif
+#ifdef DRC_DBG
+ memcpy(gte_handlers_nf, gte_handlers, sizeof(gte_handlers_nf));
+#endif
+ psxH_ptr = psxH;
+ zeromem_ptr = zero_mem;
+ scratch_buf_ptr = scratch_buf; // for gte_neon.S
+
+ ndrc_g.cycle_multiplier_old = Config.cycle_multiplier;
+ ndrc_g.hacks_old = ndrc_g.hacks | ndrc_g.hacks_pergame;
+ ari64_apply_config();
+ ari64_thread_init();
+
+ return 0;