From 398d69247112982aa5ddb91bf0fa2a435c6e008c Mon Sep 17 00:00:00 2001 From: notaz Date: Sat, 19 Feb 2022 21:37:23 +0200 Subject: [PATCH] drc: don't delay block restoration Not sure why it was done the way it was (maybe something N64 related?), but it occasionally caused dyna_linker to repeatedly walk the jump_in list. What's worse, if the dirty block was deemed to expire soon, it would never be restored and the repeated jump_in walking by dyna_linker would never end, causing severe slowdown. --- libpcsxcore/new_dynarec/emu_if.c | 6 - libpcsxcore/new_dynarec/linkage_arm.S | 121 +++++--------- libpcsxcore/new_dynarec/linkage_arm64.S | 20 --- libpcsxcore/new_dynarec/linkage_offsets.h | 3 +- libpcsxcore/new_dynarec/new_dynarec.c | 183 +++++++++------------- 5 files changed, 114 insertions(+), 219 deletions(-) diff --git a/libpcsxcore/new_dynarec/emu_if.c b/libpcsxcore/new_dynarec/emu_if.c index f9ee6416..e9008ae8 100644 --- a/libpcsxcore/new_dynarec/emu_if.c +++ b/libpcsxcore/new_dynarec/emu_if.c @@ -296,7 +296,6 @@ static int ari64_init() static u32 scratch_buf[8*8*2] __attribute__((aligned(64))); extern void (*psxCP2[64])(); extern void psxNULL(); - extern unsigned char *out; size_t i; new_dynarec_init(); @@ -326,10 +325,6 @@ static int ari64_init() zeromem_ptr = zero_mem; scratch_buf_ptr = scratch_buf; - SysPrintf("Mapped (RAM/scrp/ROM/LUTs/TC):\n"); - SysPrintf("%p/%p/%p/%p/%p\n", - psxM, psxH, psxR, mem_rtab, out); - return 0; } @@ -448,7 +443,6 @@ int new_dynarec_hacks; void *psxH_ptr; void *zeromem_ptr; u8 zero_mem[0x1000]; -unsigned char *out; void *mem_rtab; void *scratch_buf_ptr; void new_dynarec_init() {} diff --git a/libpcsxcore/new_dynarec/linkage_arm.S b/libpcsxcore/new_dynarec/linkage_arm.S index 1a16aa04..978280a8 100644 --- a/libpcsxcore/new_dynarec/linkage_arm.S +++ b/libpcsxcore/new_dynarec/linkage_arm.S @@ -28,9 +28,9 @@ #define dynarec_local ESYM(dynarec_local) #define add_jump_out ESYM(add_jump_out) #define new_recompile_block ESYM(new_recompile_block) 
+#define ndrc_try_restore_block ESYM(ndrc_try_restore_block) #define get_addr ESYM(get_addr) #define get_addr_ht ESYM(get_addr_ht) -#define clean_blocks ESYM(clean_blocks) #define gen_interupt ESYM(gen_interupt) #define invalidate_addr ESYM(invalidate_addr) #define gteCheckStallRaw ESYM(gteCheckStallRaw) @@ -88,7 +88,6 @@ DRC_VAR(invc_ptr, 4) DRC_VAR(scratch_buf_ptr, 4) DRC_VAR(ram_offset, 4) DRC_VAR(mini_ht, 256) -DRC_VAR(restore_candidate, 512) #ifdef TEXRELS_FORBIDDEN @@ -96,8 +95,6 @@ DRC_VAR(restore_candidate, 512) .align 2 ptr_jump_in: .word ESYM(jump_in) -ptr_jump_dirty: - .word ESYM(jump_dirty) ptr_hash_table: .word ESYM(hash_table) #endif @@ -159,44 +156,44 @@ ptr_hash_table: #endif .endm -/* r0 = virtual target address */ -/* r1 = instruction to patch */ +/* r4 = virtual target address */ +/* r5 = instruction to patch */ .macro dyna_linker_main #ifndef NO_WRITE_EXEC load_varadr_ext r3, jump_in /* get_page */ - lsr r2, r0, #12 + lsr r2, r4, #12 mov r6, #4096 bic r2, r2, #0xe0000 sub r6, r6, #1 cmp r2, #0x1000 - ldr r7, [r1] + ldr r7, [r5] biclt r2, #0x0e00 and r6, r6, r2 cmp r2, #2048 add r12, r7, #2 orrcs r2, r6, #2048 - ldr r5, [r3, r2, lsl #2] + ldr r1, [r3, r2, lsl #2] lsl r12, r12, #8 - add r6, r1, r12, asr #6 /* old target */ + add r6, r5, r12, asr #6 /* old target */ mov r8, #0 /* jump_in lookup */ 1: - movs r4, r5 + movs r0, r1 beq 2f - ldr r3, [r5] /* ll_entry .vaddr */ - ldrd r4, r5, [r4, #8] /* ll_entry .next, .addr */ - teq r3, r0 + ldr r3, [r1] /* ll_entry .vaddr */ + ldrd r0, r1, [r0, #8] /* ll_entry .addr, .next */ + teq r3, r4 bne 1b - teq r4, r6 - moveq pc, r4 /* Stale i-cache */ - mov r8, r4 + teq r0, r6 + moveq pc, r0 /* Stale i-cache */ + mov r8, r0 b 1b /* jump_in may have dupes, continue search */ 2: tst r8, r8 - beq 3f /* r0 not in jump_in */ + beq 3f /* r4 not in jump_in */ - mov r5, r1 + mov r0, r4 mov r1, r6 bl add_jump_out sub r2, r8, r5 @@ -207,43 +204,13 @@ ptr_hash_table: str r1, [r5] mov pc, r8 3: - /* hash_table lookup */ - 
cmp r2, #2048 - load_varadr_ext r3, jump_dirty - eor r4, r0, r0, lsl #16 - lslcc r2, r0, #9 - load_varadr_ext r6, hash_table - lsr r4, r4, #12 - lsrcc r2, r2, #21 - bic r4, r4, #15 - ldr r5, [r3, r2, lsl #2] - ldr r7, [r6, r4]! - teq r7, r0 - ldreq pc, [r6, #8] - ldr r7, [r6, #4] - teq r7, r0 - ldreq pc, [r6, #12] - /* jump_dirty lookup */ -6: - movs r4, r5 - beq 8f - ldr r3, [r5] - ldr r5, [r4, #12] - teq r3, r0 - bne 6b -7: - ldr r1, [r4, #8] - /* hash_table insert */ - ldr r2, [r6] - ldr r3, [r6, #8] - str r0, [r6] - str r1, [r6, #8] - str r2, [r6, #4] - str r3, [r6, #12] - mov pc, r1 -8: + mov r0, r4 + bl ndrc_try_restore_block + tst r0, r0 + movne pc, r0 #else /* XXX: should be able to do better than this... */ + mov r0, r4 bl get_addr_ht mov pc, r0 #endif @@ -253,16 +220,18 @@ ptr_hash_table: FUNCTION(dyna_linker): /* r0 = virtual target address */ /* r1 = instruction to patch */ - dyna_linker_main - mov r4, r0 mov r5, r1 +10: + dyna_linker_main + + mov r0, r4 bl new_recompile_block tst r0, r0 - mov r0, r4 - mov r1, r5 - beq dyna_linker + beq 10b + /* pagefault */ + mov r0, r4 mov r1, r0 mov r2, #(4<<2) /* Address error (fetch) */ .size dyna_linker, .-dyna_linker @@ -288,18 +257,19 @@ FUNCTION(exec_pagefault): FUNCTION(dyna_linker_ds): /* r0 = virtual target address */ /* r1 = instruction to patch */ - dyna_linker_main - mov r4, r0 - bic r0, r0, #7 mov r5, r1 +10: + dyna_linker_main + + bic r0, r4, #7 orr r0, r0, #1 bl new_recompile_block tst r0, r0 - mov r0, r4 - mov r1, r5 - beq dyna_linker_ds + beq 10b + /* pagefault */ + mov r0, r4 bic r1, r0, #7 mov r2, #0x80000008 /* High bit set indicates pagefault in delay slot */ sub r0, r1, #4 @@ -426,18 +396,12 @@ FUNCTION(verify_code): FUNCTION(cc_interrupt): ldr r0, [fp, #LO_last_count] mov r1, #0 - mov r2, #0x1fc add r10, r0, r10 str r1, [fp, #LO_pending_exception] - and r2, r2, r10, lsr #17 - add r3, fp, #LO_restore_candidate str r10, [fp, #LO_cycle] /* PCSX cycles */ @@ str r10, [fp, #LO_reg_cop0+36] /* Count 
- not on PSX */ - ldr r4, [r2, r3] mov r10, lr - tst r4, r4 - bne .E4 -.E1: + bl gen_interupt mov lr, r10 ldr r10, [fp, #LO_cycle] @@ -450,22 +414,9 @@ FUNCTION(cc_interrupt): ldmfdne sp!, {r4, r5, r6, r7, r8, r9, sl, fp, ip, pc} tst r1, r1 moveq pc, lr -.E2: ldr r0, [fp, #LO_pcaddr] bl get_addr_ht mov pc, r0 -.E4: - /* Move 'dirty' blocks to the 'clean' list */ - lsl r5, r2, #3 - str r1, [r2, r3] -.E5: - lsrs r4, r4, #1 - mov r0, r5 - add r5, r5, #1 - blcs clean_blocks - tst r5, #31 - bne .E5 - b .E1 .size cc_interrupt, .-cc_interrupt .align 2 diff --git a/libpcsxcore/new_dynarec/linkage_arm64.S b/libpcsxcore/new_dynarec/linkage_arm64.S index 39e95a83..5c4d1274 100644 --- a/libpcsxcore/new_dynarec/linkage_arm64.S +++ b/libpcsxcore/new_dynarec/linkage_arm64.S @@ -79,7 +79,6 @@ DRC_VAR(zeromem_ptr, 8) DRC_VAR(scratch_buf_ptr, 8) DRC_VAR(ram_offset, 8) DRC_VAR(mini_ht, 256) -DRC_VAR(restore_candidate, 512) .text @@ -118,16 +117,11 @@ FUNCTION(dyna_linker_ds): .align 2 FUNCTION(cc_interrupt): ldr w0, [rFP, #LO_last_count] - mov w2, #0x1fc add rCC, w0, rCC str wzr, [rFP, #LO_pending_exception] - and w2, w2, rCC, lsr #17 - add x3, rFP, #LO_restore_candidate str rCC, [rFP, #LO_cycle] /* PCSX cycles */ # str rCC, [rFP, #LO_reg_cop0+36] /* Count */ - ldr w19, [x3, w2, uxtw] mov x21, lr - cbnz w19, 4f 1: bl gen_interupt mov lr, x21 @@ -144,20 +138,6 @@ FUNCTION(cc_interrupt): ldr w0, [rFP, #LO_pcaddr] bl get_addr_ht br x0 -4: - /* Move 'dirty' blocks to the 'clean' list */ - lsl w20, w2, #3 - str wzr, [x3, w2, uxtw] -5: - mov w0, w20 - add w20, w20, #1 - tbz w19, #0, 6f - bl clean_blocks -6: - lsr w19, w19, #1 - tst w20, #31 - bne 5b - b 1b .size cc_interrupt, .-cc_interrupt .align 2 diff --git a/libpcsxcore/new_dynarec/linkage_offsets.h b/libpcsxcore/new_dynarec/linkage_offsets.h index 7ac2e611..0c189d78 100644 --- a/libpcsxcore/new_dynarec/linkage_offsets.h +++ b/libpcsxcore/new_dynarec/linkage_offsets.h @@ -39,7 +39,6 @@ #define LO_saved_lr (LO_scratch_buf_ptr + PTRSZ) 
#define LO_ram_offset (LO_saved_lr + PTRSZ) #define LO_mini_ht (LO_ram_offset + PTRSZ) -#define LO_restore_candidate (LO_mini_ht + PTRSZ*32*2) -#define LO_dynarec_local_size (LO_restore_candidate + 512) +#define LO_dynarec_local_size (LO_mini_ht + PTRSZ*32*2) #define LO_cop2_to_scratch_buf (LO_scratch_buf_ptr - LO_reg_cop2d) diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c index 6ade3bb0..93319ec0 100644 --- a/libpcsxcore/new_dynarec/new_dynarec.c +++ b/libpcsxcore/new_dynarec/new_dynarec.c @@ -191,11 +191,11 @@ static struct decoded_insn } dops[MAXBLOCK]; // used by asm: - u_char *out; struct ht_entry hash_table[65536] __attribute__((aligned(16))); struct ll_entry *jump_in[4096] __attribute__((aligned(16))); - struct ll_entry *jump_dirty[4096]; + static u_char *out; + static struct ll_entry *jump_dirty[4096]; static struct ll_entry *jump_out[4096]; static u_int start; static u_int *source; @@ -250,7 +250,6 @@ static struct decoded_insn extern int branch_target; extern uintptr_t ram_offset; extern uintptr_t mini_ht[32][2]; - extern u_char restore_candidate[512]; /* registers that may be allocated */ /* 1-31 gpr */ @@ -336,10 +335,13 @@ void jump_break (u_int u0, u_int u1, u_int pc); void jump_break_ds(u_int u0, u_int u1, u_int pc); void jump_to_new_pc(); void call_gteStall(); -void clean_blocks(u_int page); void add_jump_out(u_int vaddr, void *src); void new_dyna_leave(); +static void *get_clean_addr(void *addr); +static void get_bounds(void *addr, u_char **start, u_char **end); +static void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr); + // Needed by assembler static void wb_register(signed char r, const signed char regmap[], uint64_t dirty); static void wb_dirtys(const signed char i_regmap[], uint64_t i_dirty); @@ -531,6 +533,21 @@ static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr) ht_bin->tcaddr[0] = tcaddr; } +static void mark_valid_code(u_int vaddr, u_int len) 
+{ + u_int i, j; + vaddr &= 0x1fffffff; + for (i = vaddr & ~0xfff; i < vaddr + len; i += 0x1000) { + // ram mirrors, but should not hurt bios + for (j = 0; j < 0x800000; j += 0x200000) { + invalid_code[(i|j) >> 12] = + invalid_code[(i|j|0x80000000u) >> 12] = + invalid_code[(i|j|0xa0000000u) >> 12] = 0; + } + } + inv_code_start = inv_code_end = ~0; +} + // some messy ari64's code, seems to rely on unsigned 32bit overflow static int doesnt_expire_soon(void *tcaddr) { @@ -538,51 +555,69 @@ static int doesnt_expire_soon(void *tcaddr) return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2))); } +void *ndrc_try_restore_block(u_int vaddr) +{ + u_int page = get_page(vaddr); + struct ll_entry *head; + + for (head = jump_dirty[page]; head != NULL; head = head->next) + { + if (head->vaddr != vaddr) + continue; + // don't restore blocks which are about to expire from the cache + if (!doesnt_expire_soon(head->addr)) + continue; + if (!verify_dirty(head->addr)) + continue; + + // restore + u_char *start, *end; + get_bounds(head->addr, &start, &end); + mark_valid_code(vaddr, end - start); + + void *clean_addr = get_clean_addr(head->addr); + ll_add_flags(jump_in + page, vaddr, head->reg_sv_flags, clean_addr); + + struct ht_entry *ht_bin = hash_table_get(vaddr); + int in_ht = 0; + if (ht_bin->vaddr[0] == vaddr) { + ht_bin->tcaddr[0] = clean_addr; // Replace existing entry + in_ht = 1; + } + if (ht_bin->vaddr[1] == vaddr) { + ht_bin->tcaddr[1] = clean_addr; // Replace existing entry + in_ht = 1; + } + if (!in_ht) + hash_table_add(ht_bin, vaddr, clean_addr); + inv_debug("INV: Restored %08x (%p/%p)\n", head->vaddr, head->addr, clean_addr); + return clean_addr; + } + return NULL; +} + // Get address from virtual address // This is called from the recompiled JR/JALR instructions void noinline *get_addr(u_int vaddr) { - u_int page=get_page(vaddr); - u_int vpage=get_vpage(vaddr); + u_int page = get_page(vaddr); struct ll_entry *head; - //printf("TRACE: count=%d 
next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page); - head=jump_in[page]; - while(head!=NULL) { - if(head->vaddr==vaddr) { - //printf("TRACE: count=%d next=%d (get_addr match %x: %p)\n",Count,next_interupt,vaddr,head->addr); + void *code; + + for (head = jump_in[page]; head != NULL; head = head->next) { + if (head->vaddr == vaddr) { hash_table_add(hash_table_get(vaddr), vaddr, head->addr); return head->addr; } - head=head->next; } - head=jump_dirty[vpage]; - while(head!=NULL) { - if(head->vaddr==vaddr) { - //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %p)\n",Count,next_interupt,vaddr,head->addr); - // Don't restore blocks which are about to expire from the cache - if (doesnt_expire_soon(head->addr)) - if (verify_dirty(head->addr)) { - //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]); - invalid_code[vaddr>>12]=0; - inv_code_start=inv_code_end=~0; - if(vpage<2048) { - restore_candidate[vpage>>3]|=1<<(vpage&7); - } - else restore_candidate[page>>3]|=1<<(page&7); - struct ht_entry *ht_bin = hash_table_get(vaddr); - if (ht_bin->vaddr[0] == vaddr) - ht_bin->tcaddr[0] = head->addr; // Replace existing entry - else - hash_table_add(ht_bin, vaddr, head->addr); + code = ndrc_try_restore_block(vaddr); + if (code) + return code; + + int r = new_recompile_block(vaddr); + if (r == 0) + return get_addr(vaddr); - return head->addr; - } - } - head=head->next; - } - //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr); - int r=new_recompile_block(vaddr); - if(r==0) return get_addr(vaddr); // generate an address error Status|=2; Cause=(vaddr<<31)|(4<<2); @@ -991,7 +1026,6 @@ static const struct { FUNCNAME(jump_syscall), FUNCNAME(jump_syscall_ds), FUNCNAME(call_gteStall), - FUNCNAME(clean_blocks), FUNCNAME(new_dyna_leave), FUNCNAME(pcsx_mtc0), FUNCNAME(pcsx_mtc0_ds), @@ -1352,11 +1386,6 @@ void invalidate_all_pages(void) u_int page; for(page=0;page<4096;page++) invalidate_page(page); - 
for(page=0;page<1048576;page++) - if(!invalid_code[page]) { - restore_candidate[(page&2047)>>3]|=1<<(page&7); - restore_candidate[((page&2047)>>3)+256]|=1<<(page&7); - } #ifdef USE_MINI_HT memset(mini_ht,-1,sizeof(mini_ht)); #endif @@ -1386,55 +1415,6 @@ void add_jump_out(u_int vaddr,void *src) //inv_debug("add_jump_out: to %p\n",get_pointer(src)); } -// If a code block was found to be unmodified (bit was set in -// restore_candidate) and it remains unmodified (bit is clear -// in invalid_code) then move the entries for that 4K page from -// the dirty list to the clean list. -void clean_blocks(u_int page) -{ - struct ll_entry *head; - inv_debug("INV: clean_blocks page=%d\n",page); - head=jump_dirty[page]; - while(head!=NULL) { - if(!invalid_code[head->vaddr>>12]) { - // Don't restore blocks which are about to expire from the cache - if (doesnt_expire_soon(head->addr)) { - if(verify_dirty(head->addr)) { - u_char *start, *end; - //printf("Possibly Restore %x (%p)\n",head->vaddr, head->addr); - u_int i; - u_int inv=0; - get_bounds(head->addr, &start, &end); - if (start - rdram < RAM_SIZE) { - for (i = (start-rdram+0x80000000)>>12; i <= (end-1-rdram+0x80000000)>>12; i++) { - inv|=invalid_code[i]; - } - } - else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) { - inv=1; - } - if(!inv) { - void *clean_addr = get_clean_addr(head->addr); - if (doesnt_expire_soon(clean_addr)) { - u_int ppage=page; - inv_debug("INV: Restored %x (%p/%p)\n",head->vaddr, head->addr, clean_addr); - //printf("page=%x, addr=%x\n",page,head->vaddr); - //assert(head->vaddr>>12==(page|0x80000)); - ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr); - struct ht_entry *ht_bin = hash_table_get(head->vaddr); - if (ht_bin->vaddr[0] == head->vaddr) - ht_bin->tcaddr[0] = clean_addr; // Replace existing entry - if (ht_bin->vaddr[1] == head->vaddr) - ht_bin->tcaddr[1] = clean_addr; // Replace existing entry - } - } - } - } - } - head=head->next; - } -} - /* Register allocation 
*/ // Note: registers are allocated clean (unmodified state) @@ -6347,7 +6327,6 @@ void new_dynarec_clear_full(void) memset(invalid_code,1,sizeof(invalid_code)); memset(hash_table,0xff,sizeof(hash_table)); memset(mini_ht,-1,sizeof(mini_ht)); - memset(restore_candidate,0,sizeof(restore_candidate)); memset(shadow,0,sizeof(shadow)); copy=shadow; expirep=16384; // Expiry pointer, +2 blocks @@ -6421,6 +6400,8 @@ void new_dynarec_init(void) ram_offset=(uintptr_t)rdram-0x80000000; if (ram_offset!=0) SysPrintf("warning: RAM is not directly mapped, performance will suffer\n"); + SysPrintf("Mapped (RAM/scrp/ROM/LUTs/TC):\n"); + SysPrintf("%p/%p/%p/%p/%p\n", psxM, psxH, psxR, mem_rtab, out); } void new_dynarec_cleanup(void) @@ -9432,17 +9413,7 @@ int new_recompile_block(u_int addr) out = ndrc->translation_cache; // Trap writes to any of the pages we compiled - for(i=start>>12;i<=(start+slen*4)>>12;i++) { - invalid_code[i]=0; - } - inv_code_start=inv_code_end=~0; - - // for PCSX we need to mark all mirrors too - if(get_page(start)<(RAM_SIZE>>12)) - for(i=start>>12;i<=(start+slen*4)>>12;i++) - invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]= - invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]= - invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0; + mark_valid_code(start, slen*4); /* Pass 10 - Free memory by expiring oldest blocks */ -- 2.39.5