X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?p=pcsx_rearmed.git;a=blobdiff_plain;f=libpcsxcore%2Fnew_dynarec%2Fnew_dynarec.c;h=4e09592d0741dc04e46fdb0fdedb0295d4637a04;hp=f2dbb86a1ad0a06438e09e32d94352929e7b25ac;hb=a3203cf4f2f21b0f2f74c5e494e3f5ba58225eae;hpb=afec9d44d1170fd6391528f4985211ffb00e8bea diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c index f2dbb86a..4e09592d 100644 --- a/libpcsxcore/new_dynarec/new_dynarec.c +++ b/libpcsxcore/new_dynarec/new_dynarec.c @@ -35,7 +35,8 @@ static int sceBlock; #endif #include "new_dynarec_config.h" -#include "../psxhle.h" //emulator interface +#include "../psxhle.h" +#include "../psxinterpreter.h" #include "emu_if.h" //emulator interface #define noinline __attribute__((noinline,noclone)) @@ -65,6 +66,23 @@ static int sceBlock; #define MAXBLOCK 4096 #define MAX_OUTPUT_BLOCK_SIZE 262144 +struct ndrc_mem +{ + u_char translation_cache[1 << TARGET_SIZE_2]; + struct + { + struct tramp_insns ops[2048 / sizeof(struct tramp_insns)]; + const void *f[2048 / sizeof(void *)]; + } tramp; +}; + +#ifdef BASE_ADDR_DYNAMIC +static struct ndrc_mem *ndrc; +#else +static struct ndrc_mem ndrc_ __attribute__((aligned(4096))); +static struct ndrc_mem *ndrc = &ndrc_; +#endif + // stubs enum stub_type { CC_STUB = 1, @@ -167,8 +185,10 @@ struct link_entry static uint64_t unneeded_reg[MAXBLOCK]; static uint64_t branch_unneeded_reg[MAXBLOCK]; static signed char regmap_pre[MAXBLOCK][HOST_REGS]; // pre-instruction i? - static uint64_t current_constmap[HOST_REGS]; - static uint64_t constmap[MAXBLOCK][HOST_REGS]; + // contains 'real' consts at [i] insn, but may differ from what's actually + // loaded in host reg as 'final' value is always loaded, see get_final_value() + static uint32_t current_constmap[HOST_REGS]; + static uint32_t constmap[MAXBLOCK][HOST_REGS]; static struct regstat regs[MAXBLOCK]; static struct regstat branch_regs[MAXBLOCK]; static signed char minimum_free_regs[MAXBLOCK]; @@ -272,7 +292,7 @@ struct link_entry #define DJT_2 (void *)2l // asm linkage -int new_recompile_block(int addr); +int new_recompile_block(u_int addr); void *get_addr_ht(u_int vaddr); void invalidate_block(u_int block); void invalidate_addr(u_int addr); @@ -284,9 +304,7 @@ void verify_code_ds(); void cc_interrupt(); void fp_exception(); void fp_exception_ds(); -void jump_syscall_hle(); -void jump_hlecall(); -void jump_intcall(); +void jump_to_new_pc(); void new_dyna_leave(); // Needed by assembler @@ -298,7 +316,7 @@ static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]); static void load_regs_entry(int t); static void load_all_consts(signed char regmap[],u_int dirty,int i); -static int verify_dirty(u_int *ptr); +static int verify_dirty(const u_int *ptr); static int get_final_value(int hr, int i, int *value); static void add_stub(enum stub_type type, void *addr, void *retaddr, u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e); @@ -309,6 +327,8 @@ static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override); static void *get_direct_memhandler(void *table, u_int addr, enum stub_type type, uintptr_t *addr_host); static void pass_args(int a0, int a1); +static void emit_far_jump(const void *f); +static void emit_far_call(const void *f); static void mprotect_w_x(void *start, void *end, int is_x) { @@ -337,7 +357,7 @@ static void start_tcache_write(void *start, void *end) static void end_tcache_write(void *start, void *end) { -#ifdef __arm__ +#if defined(__arm__) || defined(__aarch64__) size_t len = (char *)end 
- (char *)start; #if defined(__BLACKBERRY_QNX__) msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE); @@ -347,12 +367,14 @@ static void end_tcache_write(void *start, void *end) sceKernelSyncVMDomain(sceBlock, start, len); #elif defined(_3DS) ctr_flush_invalidate_cache(); + #elif defined(__aarch64__) + // as of 2021, __clear_cache() is still broken on arm64 + // so here is a custom one :( + clear_cache_arm64(start, end); #else __clear_cache(start, end); #endif (void)len; -#else - __clear_cache(start, end); #endif mprotect_w_x(start, end, 1); @@ -361,8 +383,8 @@ static void end_tcache_write(void *start, void *end) static void *start_block(void) { u_char *end = out + MAX_OUTPUT_BLOCK_SIZE; - if (end > translation_cache + (1< ndrc->translation_cache + sizeof(ndrc->translation_cache)) + end = ndrc->translation_cache + sizeof(ndrc->translation_cache); start_tcache_write(out, end); return out; } @@ -372,16 +394,62 @@ static void end_block(void *start) end_tcache_write(start, out); } +// also takes care of w^x mappings when patching code +static u_int needs_clear_cache[1<<(TARGET_SIZE_2-17)]; + +static void mark_clear_cache(void *target) +{ + uintptr_t offset = (u_char *)target - ndrc->translation_cache; + u_int mask = 1u << ((offset >> 12) & 31); + if (!(needs_clear_cache[offset >> 17] & mask)) { + char *start = (char *)((uintptr_t)target & ~4095l); + start_tcache_write(start, start + 4095); + needs_clear_cache[offset >> 17] |= mask; + } +} + +// Clearing the cache is rather slow on ARM Linux, so mark the areas +// that need to be cleared, and then only clear these areas once. +static void do_clear_cache(void) +{ + int i, j; + for (i = 0; i < (1<<(TARGET_SIZE_2-17)); i++) + { + u_int bitmap = needs_clear_cache[i]; + if (!bitmap) + continue; + for (j = 0; j < 32; j++) + { + u_char *start, *end; + if (!(bitmap & (1<translation_cache + i*131072 + j*4096; + end = start + 4095; + for (j++; j < 32; j++) { + if (!(bitmap & (1<>31)|1; - return (x * cycle_multiplier + s * 50) / 100; + return (x * m + s * 50) / 100; } static u_int get_page(u_int vaddr) @@ -529,7 +597,7 @@ void dirty_reg(struct regstat *cur,signed char reg) } } -void set_const(struct regstat *cur,signed char reg,uint64_t value) +static void set_const(struct regstat *cur, signed char reg, uint32_t value) { int hr; if(!reg) return; @@ -541,7 +609,7 @@ void set_const(struct regstat *cur,signed char reg,uint64_t value) } } -void clear_const(struct regstat *cur,signed char reg) +static void clear_const(struct regstat *cur, signed char reg) { int hr; if(!reg) return; @@ -552,7 +620,7 @@ void clear_const(struct regstat *cur,signed char reg) } } -int is_const(struct regstat *cur,signed char reg) +static int is_const(struct regstat *cur, signed char reg) { int hr; if(reg<0) return 0; @@ -564,7 +632,8 @@ int is_const(struct regstat *cur,signed char reg) } return 0; } -uint64_t get_const(struct regstat *cur,signed char reg) + +static uint32_t get_const(struct regstat *cur, signed char reg) { int hr; if(!reg) return 0; @@ -832,13 +901,14 @@ static const struct { FUNCNAME(jump_handler_write16), FUNCNAME(jump_handler_write32), FUNCNAME(invalidate_addr), - FUNCNAME(verify_code), - FUNCNAME(jump_hlecall), - FUNCNAME(jump_syscall_hle), + FUNCNAME(jump_to_new_pc), FUNCNAME(new_dyna_leave), FUNCNAME(pcsx_mtc0), FUNCNAME(pcsx_mtc0_ds), FUNCNAME(do_insn_cmp), +#ifdef __arm__ + FUNCNAME(verify_code), +#endif }; static const char *func_name(const void *a) @@ -866,6 +936,48 @@ static const char *func_name(const void *a) #include "assem_arm64.c" 
#endif +static void *get_trampoline(const void *f) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(ndrc->tramp.f); i++) { + if (ndrc->tramp.f[i] == f || ndrc->tramp.f[i] == NULL) + break; + } + if (i == ARRAY_SIZE(ndrc->tramp.f)) { + SysPrintf("trampoline table is full, last func %p\n", f); + abort(); + } + if (ndrc->tramp.f[i] == NULL) { + start_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]); + ndrc->tramp.f[i] = f; + end_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]); + } + return &ndrc->tramp.ops[i]; +} + +static void emit_far_jump(const void *f) +{ + if (can_jump_or_call(f)) { + emit_jmp(f); + return; + } + + f = get_trampoline(f); + emit_jmp(f); +} + +static void emit_far_call(const void *f) +{ + if (can_jump_or_call(f)) { + emit_call(f); + return; + } + + f = get_trampoline(f); + emit_call(f); +} + // Add virtual address mapping to linked list void ll_add(struct ll_entry **head,int vaddr,void *addr) { @@ -993,9 +1105,7 @@ static void ll_kill_pointers(struct ll_entry *head,uintptr_t addr,int shift) { inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr); void *host_addr=find_extjump_insn(head->addr); - #if defined(__arm__) || defined(__aarch64__) - mark_clear_cache(host_addr); - #endif + mark_clear_cache(host_addr); set_jump_target(host_addr, head->addr); } head=head->next; @@ -1021,9 +1131,7 @@ static void invalidate_page(u_int page) while(head!=NULL) { inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr); void *host_addr=find_extjump_insn(head->addr); - #if defined(__arm__) || defined(__aarch64__) - mark_clear_cache(host_addr); - #endif + mark_clear_cache(host_addr); set_jump_target(host_addr, head->addr); next=head->next; free(head); @@ -1046,9 +1154,7 @@ static void invalidate_block_range(u_int block, u_int first, u_int last) for(first=page+1;firstregmap,rs2[i]); if(rs2[i]==0) // rx=0); - if(opcode2[i]==0x2a) // SLT + if(opcode2[i]==0x2a&&rs1[i]!=0) { // SLT + assert(s1l>=0); emit_shrimm(s1l,31,t); - else // SLTU (unsigned can not be less than zero) + } + else // SLTU (unsigned can not be less than zero, 0<0) emit_zeroreg(t); } else if(rs1[i]==0) // r0regmap, rt1[i]); + s = get_reg(i_regs->regmap, rs1[i]); + shift = get_reg(i_regs->regmap, rs2[i]); + if (t < 0) + return; + + if(rs1[i]==0) + emit_zeroreg(t); + else if(rs2[i]==0) { + assert(s>=0); + if(s!=t) emit_mov(s,t); + } + else { + host_tempreg_acquire(); + emit_andimm(shift,31,HOST_TEMPREG); + switch(opcode2[i]) { + case 4: // SLLV + emit_shl(s,HOST_TEMPREG,t); + break; + case 6: // SRLV + emit_shr(s,HOST_TEMPREG,t); + break; + case 7: // SRAV + emit_sar(s,HOST_TEMPREG,t); + break; + default: + assert(0); + } + host_tempreg_release(); + } } + #endif enum { @@ -2430,7 +2571,7 @@ static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override) else if(type==MTYPE_1F80) { // scratchpad if (psxH == (void *)0x1f800000) { host_tempreg_acquire(); - emit_addimm(addr,-0x1f800000,HOST_TEMPREG); + emit_xorimm(addr,0x1f800000,HOST_TEMPREG); emit_cmpimm(HOST_TEMPREG,0x1000); host_tempreg_release(); jaddr=out; @@ -2631,10 +2772,86 @@ static void load_assemble(int i,struct regstat *i_regs) } #ifndef loadlr_assemble -void loadlr_assemble(int i,struct regstat *i_regs) +static void loadlr_assemble(int i,struct regstat *i_regs) { - printf("Need loadlr_assemble for this architecture.\n"); - abort(); + int s,tl,temp,temp2,addr; + int offset; + void *jaddr=0; + int memtarget=0,c=0; + int fastio_reg_override=-1; + u_int hr,reglist=0; + tl=get_reg(i_regs->regmap,rt1[i]); + 
s=get_reg(i_regs->regmap,rs1[i]); + temp=get_reg(i_regs->regmap,-1); + temp2=get_reg(i_regs->regmap,FTEMP); + addr=get_reg(i_regs->regmap,AGEN1+(i&1)); + assert(addr<0); + offset=imm[i]; + for(hr=0;hrregmap[hr]>=0) reglist|=1<=0) { + c=(i_regs->wasconst>>s)&1; + if(c) { + memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE; + } + } + if(!c) { + emit_shlimm(addr,3,temp); + if (opcode[i]==0x22||opcode[i]==0x26) { + emit_andimm(addr,0xFFFFFFFC,temp2); // LWL/LWR + }else{ + emit_andimm(addr,0xFFFFFFF8,temp2); // LDL/LDR + } + jaddr=emit_fastpath_cmp_jump(i,temp2,&fastio_reg_override); + } + else { + if(ram_offset&&memtarget) { + host_tempreg_acquire(); + emit_addimm(temp2,ram_offset,HOST_TEMPREG); + fastio_reg_override=HOST_TEMPREG; + } + if (opcode[i]==0x22||opcode[i]==0x26) { + emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR + }else{ + emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR + } + } + if (opcode[i]==0x22||opcode[i]==0x26) { // LWL/LWR + if(!c||memtarget) { + int a=temp2; + if(fastio_reg_override>=0) a=fastio_reg_override; + emit_readword_indexed(0,a,temp2); + if(fastio_reg_override==HOST_TEMPREG) host_tempreg_release(); + if(jaddr) add_stub_r(LOADW_STUB,jaddr,out,i,temp2,i_regs,ccadj[i],reglist); + } + else + inline_readstub(LOADW_STUB,i,(constmap[i][s]+offset)&0xFFFFFFFC,i_regs->regmap,FTEMP,ccadj[i],reglist); + if(rt1[i]) { + assert(tl>=0); + emit_andimm(temp,24,temp); + if (opcode[i]==0x22) // LWL + emit_xorimm(temp,24,temp); + host_tempreg_acquire(); + emit_movimm(-1,HOST_TEMPREG); + if (opcode[i]==0x26) { + emit_shr(temp2,temp,temp2); + emit_bic_lsr(tl,HOST_TEMPREG,temp,tl); + }else{ + emit_shl(temp2,temp,temp2); + emit_bic_lsl(tl,HOST_TEMPREG,temp,tl); + } + host_tempreg_release(); + emit_or(temp2,tl,tl); + } + //emit_storereg(rt1[i],tl); // DEBUG + } + if (opcode[i]==0x1A||opcode[i]==0x1B) { // LDL/LDR + assert(0); + } } #endif @@ -2746,7 +2963,7 @@ void store_assemble(int i,struct regstat *i_regs) } // basic current block modification detection.. 
// not looking back as that should be in mips cache already - // (note: doesn't seem to trigger, migh be broken) + // (see Spyro2 title->attract mode) if(c&&start+i*4regmap==regs[i].regmap); // not delay slot @@ -2756,13 +2973,13 @@ void store_assemble(int i,struct regstat *i_regs) emit_movimm(start+i*4+4,0); emit_writeword(0,&pcaddr); emit_addimm(HOST_CCREG,2,HOST_CCREG); - emit_call(get_addr_ht); + emit_far_call(get_addr_ht); emit_jmpreg(0); } } } -void storelr_assemble(int i,struct regstat *i_regs) +static void storelr_assemble(int i,struct regstat *i_regs) { int s,tl; int temp; @@ -2802,7 +3019,8 @@ void storelr_assemble(int i,struct regstat *i_regs) emit_jmp(0); } } - emit_addimm_no_flags(ram_offset,temp); + if(ram_offset) + emit_addimm_no_flags(ram_offset,temp); if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR assert(0); @@ -2819,15 +3037,11 @@ void storelr_assemble(int i,struct regstat *i_regs) if (opcode[i]==0x2A) { // SWL emit_writeword_indexed(tl,0,temp); } - if (opcode[i]==0x2E) { // SWR + else if (opcode[i]==0x2E) { // SWR emit_writebyte_indexed(tl,3,temp); } - if (opcode[i]==0x2C) { // SDL - assert(0); - } - if (opcode[i]==0x2D) { // SDR + else assert(0); - } done0=out; emit_jmp(0); // 1 @@ -2840,16 +3054,10 @@ void storelr_assemble(int i,struct regstat *i_regs) emit_writebyte_indexed(tl,1,temp); if(rs2[i]) emit_rorimm(tl,8,tl); } - if (opcode[i]==0x2E) { // SWR + else if (opcode[i]==0x2E) { // SWR // Write two lsb into two most significant bytes emit_writehword_indexed(tl,1,temp); } - if (opcode[i]==0x2C) { // SDL - assert(0); - } - if (opcode[i]==0x2D) { // SDR - assert(0); - } done1=out; emit_jmp(0); // 2 @@ -2863,19 +3071,13 @@ void storelr_assemble(int i,struct regstat *i_regs) emit_writehword_indexed(tl,-2,temp); if(rs2[i]) emit_rorimm(tl,16,tl); } - if (opcode[i]==0x2E) { // SWR + else if (opcode[i]==0x2E) { // SWR // Write 3 lsb into three most significant bytes emit_writebyte_indexed(tl,-1,temp); if(rs2[i]) emit_rorimm(tl,8,tl); emit_writehword_indexed(tl,0,temp); if(rs2[i]) emit_rorimm(tl,24,tl); } - if (opcode[i]==0x2C) { // SDL - assert(0); - } - if (opcode[i]==0x2D) { // SDR - assert(0); - } done2=out; emit_jmp(0); // 3 @@ -2886,25 +3088,13 @@ void storelr_assemble(int i,struct regstat *i_regs) emit_writebyte_indexed(tl,-3,temp); if(rs2[i]) emit_rorimm(tl,8,tl); } - if (opcode[i]==0x2E) { // SWR + else if (opcode[i]==0x2E) { // SWR // Write entire word emit_writeword_indexed(tl,-3,temp); } - if (opcode[i]==0x2C) { // SDL - assert(0); - } - if (opcode[i]==0x2D) { // SDR - assert(0); - } set_jump_target(done0, out); set_jump_target(done1, out); set_jump_target(done2, out); - if (opcode[i]==0x2C) { // SDL - assert(0); - } - if (opcode[i]==0x2D) { // SDR - assert(0); - } if(!c||!memtarget) add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist); if(!(i_regs->waswritten&(1<=0); if (opcode[i]==0x3a) { // SWC2 - cop2_get_dreg(copr,tl,HOST_TEMPREG); + cop2_get_dreg(copr,tl,-1); type=STOREW_STUB; } else @@ -3291,14 +3463,7 @@ static void cop2_assemble(int i,struct regstat *i_regs) emit_signextend16(sl,temp); break; case 31: - //value = value & 0x7ffff000; - //if (value & 0x7f87e000) value |= 0x80000000; - emit_shrimm(sl,12,temp); - emit_shlimm(temp,12,temp); - emit_testimm(temp,0x7f000000); - emit_testeqimm(temp,0x00870000); - emit_testeqimm(temp,0x0000e000); - emit_orrne_imm(temp,0x80000000,temp); + c2op_ctc2_31_assemble(sl,temp); break; default: temp=sl; @@ -3309,6 +3474,90 @@ static void cop2_assemble(int i,struct regstat *i_regs) } } +static void 
do_unalignedwritestub(int n) +{ + assem_debug("do_unalignedwritestub %x\n",start+stubs[n].a*4); + literal_pool(256); + set_jump_target(stubs[n].addr, out); + + int i=stubs[n].a; + struct regstat *i_regs=(struct regstat *)stubs[n].c; + int addr=stubs[n].b; + u_int reglist=stubs[n].e; + signed char *i_regmap=i_regs->regmap; + int temp2=get_reg(i_regmap,FTEMP); + int rt; + rt=get_reg(i_regmap,rs2[i]); + assert(rt>=0); + assert(addr>=0); + assert(opcode[i]==0x2a||opcode[i]==0x2e); // SWL/SWR only implemented + reglist|=(1<regmap,CCREG); assert(ccreg==HOST_CCREG); assert(!is_delayslot); (void)ccreg; - emit_movimm(start+i*4,EAX); // Get PC - emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right? There should probably be an extra cycle... - emit_jmp(jump_syscall_hle); // XXX + + emit_movimm(pc,3); // Get PC + emit_readword(&last_count,2); + emit_writeword(3,&psxRegs.pc); + emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX + emit_add(2,HOST_CCREG,2); + emit_writeword(2,&psxRegs.cycle); + emit_far_call(func); + emit_far_jump(jump_to_new_pc); +} + +static void syscall_assemble(int i,struct regstat *i_regs) +{ + emit_movimm(0x20,0); // cause code + emit_movimm(0,1); // not in delay slot + call_c_cpu_handler(i,i_regs,start+i*4,psxException); } static void hlecall_assemble(int i,struct regstat *i_regs) { - extern void psxNULL(); - signed char ccreg=get_reg(i_regs->regmap,CCREG); - assert(ccreg==HOST_CCREG); - assert(!is_delayslot); - (void)ccreg; - emit_movimm(start+i*4+4,0); // Get PC + void *hlefunc = psxNULL; uint32_t hleCode = source[i] & 0x03ffffff; - if (hleCode >= ARRAY_SIZE(psxHLEt)) - emit_movimm((uintptr_t)psxNULL,1); - else - emit_movimm((uintptr_t)psxHLEt[hleCode],1); - emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX - emit_jmp(jump_hlecall); + if (hleCode < ARRAY_SIZE(psxHLEt)) + hlefunc = psxHLEt[hleCode]; + + call_c_cpu_handler(i,i_regs,start+i*4+4,hlefunc); } static void intcall_assemble(int i,struct regstat *i_regs) { - signed char ccreg=get_reg(i_regs->regmap,CCREG); - assert(ccreg==HOST_CCREG); - assert(!is_delayslot); - (void)ccreg; - emit_movimm(start+i*4,0); // Get PC - emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); - emit_jmp(jump_intcall); + call_c_cpu_handler(i,i_regs,start+i*4,execI); } static void speculate_mov(int rs,int rt) @@ -4075,19 +4325,36 @@ static int match_bt(signed char i_regmap[],uint64_t i_dirty,int addr) static void drc_dbg_emit_do_cmp(int i) { extern void do_insn_cmp(); - extern int cycle; + //extern int cycle; u_int hr,reglist=0; - for(hr=0;hr=0) reglist|=1< 0 && !bt[i]) { + for (hr = 0; hr < HOST_REGS; hr++) { + int reg = regs[i-1].regmap[hr]; + if (hr == EXCLUDE_REG || reg < 0) + continue; + if (!((regs[i-1].isconst >> hr) & 1)) + continue; + if (i > 1 && reg == regs[i-2].regmap[hr] && constmap[i-1][hr] == constmap[i-2][hr]) + continue; + emit_movimm(constmap[i-1][hr],0); + emit_storereg(reg, 0); + } + } emit_movimm(start+i*4,0); emit_writeword(0,&pcaddr); - emit_call(do_insn_cmp); + emit_far_call(do_insn_cmp); //emit_readword(&cycle,0); //emit_addimm(0,2,0); //emit_writeword(0,&cycle); + (void)get_reg2; restore_regs(reglist); + assem_debug("\\\\do_insn_cmp\n"); } #else #define drc_dbg_emit_do_cmp(x) @@ -4214,11 +4481,13 @@ void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert) else if(*adj==0||invert) { int cycles=CLOCK_ADJUST(count+2); // faster loop HACK +#if 0 if (t&&*adj) { int rel=t-i; if(-NO_CYCLE_PENALTY_THR(ba[i]-start)>>2) invert=1; #endif + #ifdef 
__aarch64__ + invert=1; // because of near cond. branches + #endif if(ooo[i]) { s1l=get_reg(branch_regs[i].regmap,rs1[i]); @@ -4956,6 +5228,9 @@ static void sjump_assemble(int i,struct regstat *i_regs) #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK if(i>(ba[i]-start)>>2) invert=1; #endif + #ifdef __aarch64__ + invert=1; // because of near cond. branches + #endif //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL) //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL) @@ -6247,7 +6522,7 @@ static void new_dynarec_test(void) ((volatile u_int *)out)[0]++; // make cache dirty for (i = 0; i < ARRAY_SIZE(ret); i++) { - out = translation_cache; + out = ndrc->translation_cache; beginning = start_block(); emit_movimm(DRC_TEST_VAL + i, 0); // test emit_ret(); @@ -6261,15 +6536,15 @@ static void new_dynarec_test(void) SysPrintf("test passed.\n"); else SysPrintf("test failed, will likely crash soon (r=%08x %08x)\n", ret[0], ret[1]); - out = translation_cache; + out = ndrc->translation_cache; } // clear the state completely, instead of just marking // things invalid like invalidate_all_pages() does -void new_dynarec_clear_full() +void new_dynarec_clear_full(void) { int n; - out = translation_cache; + out = ndrc->translation_cache; memset(invalid_code,1,sizeof(invalid_code)); memset(hash_table,0xff,sizeof(hash_table)); memset(mini_ht,-1,sizeof(mini_ht)); @@ -6287,34 +6562,28 @@ void new_dynarec_clear_full() for(n=0;n<4096;n++) ll_clear(jump_dirty+n); } -void new_dynarec_init() +void new_dynarec_init(void) { SysPrintf("Init new dynarec\n"); - // allocate/prepare a buffer for translation cache - // see assem_arm.h for some explanation -#if defined(BASE_ADDR_FIXED) - if (mmap(translation_cache, 1 << TARGET_SIZE_2, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_PRIVATE | MAP_ANONYMOUS, - -1, 0) != translation_cache) { - SysPrintf("mmap() failed: %s\n", strerror(errno)); - SysPrintf("disable BASE_ADDR_FIXED and recompile\n"); - abort(); - } -#elif defined(BASE_ADDR_DYNAMIC) +#ifdef BASE_ADDR_DYNAMIC #ifdef VITA sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2); if (sceBlock < 0) SysPrintf("sceKernelAllocMemBlockForVM failed\n"); - int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache); + int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&ndrc); if (ret < 0) SysPrintf("sceKernelGetMemBlockBase failed\n"); #else - translation_cache = mmap (NULL, 1 << TARGET_SIZE_2, + uintptr_t desired_addr = 0; + #ifdef __ELF__ + extern char _end; + desired_addr = ((uintptr_t)&_end + 0xffffff) & ~0xffffffl; + #endif + ndrc = mmap((void *)desired_addr, sizeof(*ndrc), PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (translation_cache == MAP_FAILED) { + if (ndrc == MAP_FAILED) { SysPrintf("mmap() failed: %s\n", strerror(errno)); abort(); } @@ -6322,11 +6591,12 @@ void new_dynarec_init() #else #ifndef NO_WRITE_EXEC // not all systems allow execute in data segment by default - if (mprotect(translation_cache, 1<translation_cache) + sizeof(ndrc->tramp.ops), + PROT_READ | PROT_WRITE | PROT_EXEC) != 0) SysPrintf("mprotect() failed: %s\n", strerror(errno)); #endif #endif - out = translation_cache; + out = ndrc->translation_cache; cycle_multiplier=200; new_dynarec_clear_full(); #ifdef HOST_IMM8 @@ -6342,15 +6612,15 @@ void new_dynarec_init() SysPrintf("warning: RAM is not directly mapped, performance will suffer\n"); } -void new_dynarec_cleanup() +void new_dynarec_cleanup(void) { int n; -#if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC) +#ifdef BASE_ADDR_DYNAMIC #ifdef 
VITA sceKernelFreeMemBlock(sceBlock); sceBlock = -1; #else - if (munmap(translation_cache, 1<>12]=0; emit_movimm(start,0); emit_writeword(0,&pcaddr); - emit_jmp(new_dyna_leave); + emit_far_jump(new_dyna_leave); literal_pool(0); end_block(beginning); ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning); @@ -7451,7 +7730,7 @@ int new_recompile_block(int addr) dirty_reg(&branch_regs[i-1],31); } memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(constmap[i])); break; case RJUMP: memcpy(&branch_regs[i-1],¤t,sizeof(current)); @@ -7472,7 +7751,7 @@ int new_recompile_block(int addr) } #endif memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(constmap[i])); break; case CJUMP: if((opcode[i-1]&0x3E)==4) // BEQ/BNE @@ -7499,7 +7778,7 @@ int new_recompile_block(int addr) branch_regs[i-1].isconst=0; branch_regs[i-1].wasconst=0; memcpy(&branch_regs[i-1].regmap_entry,¤t.regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(constmap[i])); } else if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ @@ -7524,7 +7803,7 @@ int new_recompile_block(int addr) branch_regs[i-1].isconst=0; branch_regs[i-1].wasconst=0; memcpy(&branch_regs[i-1].regmap_entry,¤t.regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(constmap[i])); } else // Alloc the delay slot in case the branch is taken @@ -7578,7 +7857,7 @@ int new_recompile_block(int addr) branch_regs[i-1].isconst=0; branch_regs[i-1].wasconst=0; memcpy(&branch_regs[i-1].regmap_entry,¤t.regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(constmap[i])); } else // Alloc the delay slot in case the branch is taken @@ -7675,7 +7954,7 @@ int new_recompile_block(int addr) if(!is_ds[i]) { regs[i].dirty=current.dirty; regs[i].isconst=current.isconst; - memcpy(constmap[i],current_constmap,sizeof(current_constmap)); + memcpy(constmap[i],current_constmap,sizeof(constmap[i])); } for(hr=0;hr=0) { @@ -7906,8 +8185,8 @@ int new_recompile_block(int addr) } } } - } - } + } // if needed + } // for hr } /* Pass 5 - Pre-allocate registers */ @@ -8533,7 +8812,7 @@ int new_recompile_block(int addr) void *instr_addr0_override = NULL; if (start == 0x80030000) { - // nasty hack for fastbios thing + // nasty hack for the fastbios thing // override block entry to this code instr_addr0_override = out; emit_movimm(start,0); @@ -8543,7 +8822,12 @@ int new_recompile_block(int addr) emit_writeword(0,&pcaddr); emit_writeword(0,&address); emit_cmp(0,1); + #ifdef __aarch64__ + emit_jeq(out + 4*2); + emit_far_jump(new_dyna_leave); + #else emit_jne(new_dyna_leave); + #endif } for(i=0;i translation_cache+(1< ndrc->translation_cache + sizeof(ndrc->translation_cache) - MAX_OUTPUT_BLOCK_SIZE) + out = ndrc->translation_cache; // Trap writes to any of the pages we compiled for(i=start>>12;i<=(start+slen*4)>>12;i++) { @@ -8830,11 +9114,11 @@ int new_recompile_block(int addr) /* Pass 10 - Free memory by expiring oldest blocks */ - int end=(((out-translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535; + int end=(((out-ndrc->translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535; while(expirep!=end) { 
int shift=TARGET_SIZE_2-3; // Divide into 8 blocks - uintptr_t base=(uintptr_t)translation_cache+((expirep>>13)<translation_cache+((expirep>>13)<>11)&3) { @@ -8872,10 +9156,8 @@ int new_recompile_block(int addr) break; case 3: // Clear jump_out - #if defined(__arm__) || defined(__aarch64__) if((expirep&2047)==0) do_clear_cache(); - #endif ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift); ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift); break;
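
For reference, the batched instruction-cache flush introduced above (mark_clear_cache / do_clear_cache) works as follows: instead of issuing a cache-maintenance call for every patched jump, the recompiler sets one bit per 4 KB page of the translation cache (mark_clear_cache, which per the diff also reopens the page for writing under W^X) and later flushes contiguous runs of dirty pages in one pass, since per-call flushes are slow on ARM Linux. The snippet below is only a minimal standalone sketch of that idea under assumed sizes; code_cache, flush_icache_range() and the constants are illustrative names, not the ones used by pcsx_rearmed.

#include <stdint.h>
#include <stddef.h>

/* Illustrative constants (placeholders, not pcsx_rearmed's values):
 * a 32 MB code cache tracked in 4 KB pages, 32 pages per bitmap word. */
#define CODE_CACHE_SIZE  (32u << 20)
#define PAGE_SHIFT       12
#define PAGES_PER_WORD   32
#define BITMAP_WORDS     (CODE_CACHE_SIZE >> (PAGE_SHIFT + 5))

static uint8_t  code_cache[CODE_CACHE_SIZE];
static uint32_t dirty_pages[BITMAP_WORDS];

/* Stand-in for the platform flush; on ARM this per-range call is the
 * expensive operation the bitmap is amortizing. */
static void flush_icache_range(void *start, void *end)
{
    __builtin___clear_cache((char *)start, (char *)end);
}

/* Record that the 4 KB page containing 'target' was patched. */
static void mark_dirty(void *target)
{
    size_t offset = (uint8_t *)target - code_cache;
    dirty_pages[offset >> (PAGE_SHIFT + 5)] |= 1u << ((offset >> PAGE_SHIFT) & 31);
}

/* Flush every dirty page, merging adjacent dirty pages into one call. */
static void flush_dirty(void)
{
    size_t w, b;
    for (w = 0; w < BITMAP_WORDS; w++) {
        uint32_t bitmap = dirty_pages[w];
        if (!bitmap)
            continue;
        for (b = 0; b < PAGES_PER_WORD; b++) {
            uint8_t *start, *end;
            if (!(bitmap & (1u << b)))
                continue;
            start = code_cache + ((w * PAGES_PER_WORD + b) << PAGE_SHIFT);
            end = start + (1 << PAGE_SHIFT);
            /* extend the run while the following pages are dirty too */
            for (b++; b < PAGES_PER_WORD && (bitmap & (1u << b)); b++)
                end += 1 << PAGE_SHIFT;
            flush_icache_range(start, end);
        }
        dirty_pages[w] = 0;
    }
}

In the diff this matters during invalidation (invalidate_page / ll_kill_pointers), where many external jumps are repatched back to their stubs at once: mark_clear_cache records the touched pages, and do_clear_cache, invoked periodically from the block-expiry pass, flushes them as merged runs rather than one cache-maintenance call per patched jump.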