X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?p=pcsx_rearmed.git;a=blobdiff_plain;f=libpcsxcore%2Fnew_dynarec%2Fnew_dynarec.c;h=ef9bec7b886467af78cc9a90ec6061dbbeddac3c;hp=9ce1f069a92bdd5766adb1db2e175aedf489c746;hb=919981d0bca7a0898133362a91405395678612e3;hpb=3968e69e7fa8f9cb0d44ac79477d5929b9649271 diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c index 9ce1f069..ef9bec7b 100644 --- a/libpcsxcore/new_dynarec/new_dynarec.c +++ b/libpcsxcore/new_dynarec/new_dynarec.c @@ -66,6 +66,23 @@ static int sceBlock; #define MAXBLOCK 4096 #define MAX_OUTPUT_BLOCK_SIZE 262144 +struct ndrc_mem +{ + u_char translation_cache[1 << TARGET_SIZE_2]; + struct + { + struct tramp_insns ops[2048 / sizeof(struct tramp_insns)]; + const void *f[2048 / sizeof(void *)]; + } tramp; +}; + +#ifdef BASE_ADDR_DYNAMIC +static struct ndrc_mem *ndrc; +#else +static struct ndrc_mem ndrc_ __attribute__((aligned(4096))); +static struct ndrc_mem *ndrc = &ndrc_; +#endif + // stubs enum stub_type { CC_STUB = 1, @@ -308,6 +325,8 @@ static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override); static void *get_direct_memhandler(void *table, u_int addr, enum stub_type type, uintptr_t *addr_host); static void pass_args(int a0, int a1); +static void emit_far_jump(const void *f); +static void emit_far_call(const void *f); static void mprotect_w_x(void *start, void *end, int is_x) { @@ -336,7 +355,7 @@ static void start_tcache_write(void *start, void *end) static void end_tcache_write(void *start, void *end) { -#ifdef __arm__ +#if defined(__arm__) || defined(__aarch64__) size_t len = (char *)end - (char *)start; #if defined(__BLACKBERRY_QNX__) msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE); @@ -346,12 +365,14 @@ static void end_tcache_write(void *start, void *end) sceKernelSyncVMDomain(sceBlock, start, len); #elif defined(_3DS) ctr_flush_invalidate_cache(); + #elif defined(__aarch64__) + // as of 2021, __clear_cache() is still broken on arm64 + // so here is a custom one :( + clear_cache_arm64(start, end); #else __clear_cache(start, end); #endif (void)len; -#else - __clear_cache(start, end); #endif mprotect_w_x(start, end, 1); @@ -360,8 +381,8 @@ static void end_tcache_write(void *start, void *end) static void *start_block(void) { u_char *end = out + MAX_OUTPUT_BLOCK_SIZE; - if (end > translation_cache + (1< ndrc->translation_cache + sizeof(ndrc->translation_cache)) + end = ndrc->translation_cache + sizeof(ndrc->translation_cache); start_tcache_write(out, end); return out; } @@ -371,6 +392,49 @@ static void end_block(void *start) end_tcache_write(start, out); } +// also takes care of w^x mappings when patching code +static u_int needs_clear_cache[1<<(TARGET_SIZE_2-17)]; + +static void mark_clear_cache(void *target) +{ + uintptr_t offset = (u_char *)target - ndrc->translation_cache; + u_int mask = 1u << ((offset >> 12) & 31); + if (!(needs_clear_cache[offset >> 17] & mask)) { + char *start = (char *)((uintptr_t)target & ~4095l); + start_tcache_write(start, start + 4095); + needs_clear_cache[offset >> 17] |= mask; + } +} + +// Clearing the cache is rather slow on ARM Linux, so mark the areas +// that need to be cleared, and then only clear these areas once. +static void do_clear_cache(void) +{ + int i, j; + for (i = 0; i < (1<<(TARGET_SIZE_2-17)); i++) + { + u_int bitmap = needs_clear_cache[i]; + if (!bitmap) + continue; + for (j = 0; j < 32; j++) + { + u_char *start, *end; + if (!(bitmap & (1<translation_cache + i*131072 + j*4096; + end = start + 4095; + for (j++; j < 32; j++) { + if (!(bitmap & (1<tramp.f); i++) { + if (ndrc->tramp.f[i] == f || ndrc->tramp.f[i] == NULL) + break; + } + if (i == ARRAY_SIZE(ndrc->tramp.f)) { + SysPrintf("trampoline table is full, last func %p\n", f); + abort(); + } + if (ndrc->tramp.f[i] == NULL) { + start_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]); + ndrc->tramp.f[i] = f; + end_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]); + } + return &ndrc->tramp.ops[i]; +} + +static void emit_far_jump(const void *f) +{ + if (can_jump_or_call(f)) { + emit_jmp(f); + return; + } + + f = get_trampoline(f); + emit_jmp(f); +} + +static void emit_far_call(const void *f) +{ + if (can_jump_or_call(f)) { + emit_call(f); + return; + } + + f = get_trampoline(f); + emit_call(f); +} + // Add virtual address mapping to linked list void ll_add(struct ll_entry **head,int vaddr,void *addr) { @@ -993,9 +1099,7 @@ static void ll_kill_pointers(struct ll_entry *head,uintptr_t addr,int shift) { inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr); void *host_addr=find_extjump_insn(head->addr); - #if defined(__arm__) || defined(__aarch64__) - mark_clear_cache(host_addr); - #endif + mark_clear_cache(host_addr); set_jump_target(host_addr, head->addr); } head=head->next; @@ -1021,9 +1125,7 @@ static void invalidate_page(u_int page) while(head!=NULL) { inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr); void *host_addr=find_extjump_insn(head->addr); - #if defined(__arm__) || defined(__aarch64__) - mark_clear_cache(host_addr); - #endif + mark_clear_cache(host_addr); set_jump_target(host_addr, head->addr); next=head->next; free(head); @@ -1046,9 +1148,7 @@ static void invalidate_block_range(u_int block, u_int first, u_int last) for(first=page+1;firsttranslation_cache; beginning = start_block(); emit_movimm(DRC_TEST_VAL + i, 0); // test emit_ret(); @@ -6412,15 +6511,15 @@ static void new_dynarec_test(void) SysPrintf("test passed.\n"); else SysPrintf("test failed, will likely crash soon (r=%08x %08x)\n", ret[0], ret[1]); - out = translation_cache; + out = ndrc->translation_cache; } // clear the state completely, instead of just marking // things invalid like invalidate_all_pages() does -void new_dynarec_clear_full() +void new_dynarec_clear_full(void) { int n; - out = translation_cache; + out = ndrc->translation_cache; memset(invalid_code,1,sizeof(invalid_code)); memset(hash_table,0xff,sizeof(hash_table)); memset(mini_ht,-1,sizeof(mini_ht)); @@ -6438,34 +6537,28 @@ void new_dynarec_clear_full() for(n=0;n<4096;n++) ll_clear(jump_dirty+n); } -void new_dynarec_init() +void new_dynarec_init(void) { SysPrintf("Init new dynarec\n"); - // allocate/prepare a buffer for translation cache - // see assem_arm.h for some explanation -#if defined(BASE_ADDR_FIXED) - if (mmap(translation_cache, 1 << TARGET_SIZE_2, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_PRIVATE | MAP_ANONYMOUS, - -1, 0) != translation_cache) { - SysPrintf("mmap() failed: %s\n", strerror(errno)); - SysPrintf("disable BASE_ADDR_FIXED and recompile\n"); - abort(); - } -#elif defined(BASE_ADDR_DYNAMIC) +#ifdef BASE_ADDR_DYNAMIC #ifdef VITA sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2); if (sceBlock < 0) SysPrintf("sceKernelAllocMemBlockForVM failed\n"); - int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache); + int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&ndrc); if (ret < 0) SysPrintf("sceKernelGetMemBlockBase failed\n"); #else - translation_cache = mmap (NULL, 1 << TARGET_SIZE_2, + uintptr_t desired_addr = 0; + #ifdef __ELF__ + extern char _end; + desired_addr = ((uintptr_t)&_end + 0xffffff) & ~0xffffffl; + #endif + ndrc = mmap((void *)desired_addr, sizeof(*ndrc), PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (translation_cache == MAP_FAILED) { + if (ndrc == MAP_FAILED) { SysPrintf("mmap() failed: %s\n", strerror(errno)); abort(); } @@ -6473,11 +6566,12 @@ void new_dynarec_init() #else #ifndef NO_WRITE_EXEC // not all systems allow execute in data segment by default - if (mprotect(translation_cache, 1<translation_cache) + sizeof(ndrc->tramp.ops), + PROT_READ | PROT_WRITE | PROT_EXEC) != 0) SysPrintf("mprotect() failed: %s\n", strerror(errno)); #endif #endif - out = translation_cache; + out = ndrc->translation_cache; cycle_multiplier=200; new_dynarec_clear_full(); #ifdef HOST_IMM8 @@ -6493,15 +6587,15 @@ void new_dynarec_init() SysPrintf("warning: RAM is not directly mapped, performance will suffer\n"); } -void new_dynarec_cleanup() +void new_dynarec_cleanup(void) { int n; -#if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC) +#ifdef BASE_ADDR_DYNAMIC #ifdef VITA sceKernelFreeMemBlock(sceBlock); sceBlock = -1; #else - if (munmap(translation_cache, 1<>12]=0; emit_movimm(start,0); emit_writeword(0,&pcaddr); - emit_jmp(new_dyna_leave); + emit_far_jump(new_dyna_leave); literal_pool(0); end_block(beginning); ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning); @@ -8696,7 +8790,7 @@ int new_recompile_block(u_int addr) emit_cmp(0,1); #ifdef __aarch64__ emit_jeq(out + 4*2); - emit_jmp(new_dyna_leave); + emit_far_jump(new_dyna_leave); #else emit_jne(new_dyna_leave); #endif @@ -8968,8 +9062,8 @@ int new_recompile_block(u_int addr) // If we're within 256K of the end of the buffer, // start over from the beginning. (Is 256K enough?) - if (out > translation_cache+(1< ndrc->translation_cache + sizeof(ndrc->translation_cache) - MAX_OUTPUT_BLOCK_SIZE) + out = ndrc->translation_cache; // Trap writes to any of the pages we compiled for(i=start>>12;i<=(start+slen*4)>>12;i++) { @@ -8986,11 +9080,11 @@ int new_recompile_block(u_int addr) /* Pass 10 - Free memory by expiring oldest blocks */ - int end=(((out-translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535; + int end=(((out-ndrc->translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535; while(expirep!=end) { int shift=TARGET_SIZE_2-3; // Divide into 8 blocks - uintptr_t base=(uintptr_t)translation_cache+((expirep>>13)<translation_cache+((expirep>>13)<>11)&3) { @@ -9028,10 +9122,8 @@ int new_recompile_block(u_int addr) break; case 3: // Clear jump_out - #if defined(__arm__) || defined(__aarch64__) if((expirep&2047)==0) do_clear_cache(); - #endif ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift); ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift); break;