spu: rework synchronization
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / assem_arm.c
index 6524d1f..9ee832e 100644 (file)
  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
 #ifdef PCSX
+#include "../gte.h"
+#define FLAGLESS
+#include "../gte.h"
+#undef FLAGLESS
 #include "../gte_arm.h"
 #include "../gte_neon.h"
 #include "pcnt.h"
 #endif
+#include "arm_features.h"
+
+#if !BASE_ADDR_FIXED
+char translation_cache[1 << TARGET_SIZE_2] __attribute__((aligned(4096)));
+#endif
+
+#ifndef __MACH__
+#define CALLER_SAVE_REGS 0x100f // host register bitmask (bit n = rn): r0-r3 and r12
+#else
+#define CALLER_SAVE_REGS 0x120f // same mask plus bit 9 (r9) when __MACH__ is defined
+#endif
 
 extern int cycle_count;
 extern int last_count;
@@ -215,7 +230,7 @@ int get_pointer(void *stub)
 u_int get_clean_addr(int addr)
 {
   int *ptr=(int *)addr;
-  #ifdef ARMv5_ONLY
+  #ifndef HAVE_ARMV7
   ptr+=4;
   #else
   ptr+=6;
@@ -232,7 +247,7 @@ u_int get_clean_addr(int addr)
 int verify_dirty(int addr)
 {
   u_int *ptr=(u_int *)addr;
-  #ifdef ARMv5_ONLY
+  #ifndef HAVE_ARMV7
   // get from literal pool
   assert((*ptr&0xFFFF0000)==0xe59f0000);
   u_int offset=*ptr&0xfff;
@@ -271,7 +286,7 @@ int verify_dirty(int addr)
 // guarantees that it's not dirty
 int isclean(int addr)
 {
-  #ifdef ARMv5_ONLY
+  #ifndef HAVE_ARMV7
   int *ptr=((u_int *)addr)+4;
   #else
   int *ptr=((u_int *)addr)+6;
@@ -284,10 +299,11 @@ int isclean(int addr)
   return 1;
 }
 
+// get source that block at addr was compiled from (host pointers)
 void get_bounds(int addr,u_int *start,u_int *end)
 {
   u_int *ptr=(u_int *)addr;
-  #ifdef ARMv5_ONLY
+  #ifndef HAVE_ARMV7
   // get from literal pool
   assert((*ptr&0xFFFF0000)==0xe59f0000);
   u_int offset=*ptr&0xfff;
@@ -483,7 +499,7 @@ void alloc_reg(struct regstat *cur,int i,signed char reg)
       }
     }
   }
-  printf("This shouldn't happen (alloc_reg)");exit(1);
+  SysPrintf("This shouldn't happen (alloc_reg)");exit(1);
 }
 
 void alloc_reg64(struct regstat *cur,int i,signed char reg)
@@ -649,7 +665,7 @@ void alloc_reg64(struct regstat *cur,int i,signed char reg)
       }
     }
   }
-  printf("This shouldn't happen");exit(1);
+  SysPrintf("This shouldn't happen");exit(1);
 }
 
 // Allocate a temporary register.  This is done without regard to
@@ -772,7 +788,7 @@ void alloc_reg_temp(struct regstat *cur,int i,signed char reg)
       }
     }
   }
-  printf("This shouldn't happen");exit(1);
+  SysPrintf("This shouldn't happen");exit(1);
 }
 // Allocate a specific ARM register.
 void alloc_arm_reg(struct regstat *cur,int i,signed char reg,char hr)
@@ -889,7 +905,7 @@ u_int genjmp(u_int addr)
   int offset=addr-(int)out-8;
   if(offset<-33554432||offset>=33554432) {
     if (addr>2) {
-      printf("genjmp: out of range: %08x\n", offset);
+      SysPrintf("genjmp: out of range: %08x\n", offset);
       exit(1);
     }
     return 0;
@@ -996,7 +1012,7 @@ void emit_movimm(u_int imm,u_int rt)
     assem_debug("mvn %s,#%d\n",regname[rt],imm);
     output_w32(0xe3e00000|rd_rn_rm(rt,0,0)|armval);
   }else if(imm<65536) {
-    #ifdef ARMv5_ONLY
+    #ifndef HAVE_ARMV7
     assem_debug("mov %s,#%d\n",regname[rt],imm&0xFF00);
     output_w32(0xe3a00000|rd_rn_imm_shift(rt,0,imm>>8,8));
     assem_debug("add %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF);
@@ -1005,7 +1021,7 @@ void emit_movimm(u_int imm,u_int rt)
     emit_movw(imm,rt);
     #endif
   }else{
-    #ifdef ARMv5_ONLY
+    #ifndef HAVE_ARMV7
     emit_loadlp(imm,rt);
     #else
     emit_movw(imm&0x0000FFFF,rt);
@@ -1023,7 +1039,7 @@ void emit_loadreg(int r, int hr)
 {
 #ifdef FORCE32
   if(r&64) {
-    printf("64bit load in 32bit mode!\n");
+    SysPrintf("64bit load in 32bit mode!\n");
     assert(0);
     return;
   }
@@ -1048,7 +1064,7 @@ void emit_storereg(int r, int hr)
 {
 #ifdef FORCE32
   if(r&64) {
-    printf("64bit store in 32bit mode!\n");
+    SysPrintf("64bit store in 32bit mode!\n");
     assert(0);
     return;
   }
@@ -1269,7 +1285,7 @@ void emit_andimm(int rs,int imm,int rt)
     assem_debug("bic %s,%s,#%d\n",regname[rt],regname[rs],imm);
     output_w32(0xe3c00000|rd_rn_rm(rt,rs,0)|armval);
   }else if(imm==65535) {
-    #ifdef ARMv5_ONLY
+    #ifndef HAVE_ARMV7
     assem_debug("bic %s,%s,#FF000000\n",regname[rt],regname[rs]);
     output_w32(0xe3c00000|rd_rn_rm(rt,rs,0)|0x4FF);
     assem_debug("bic %s,%s,#00FF0000\n",regname[rt],regname[rt]);
@@ -1280,7 +1296,7 @@ void emit_andimm(int rs,int imm,int rt)
     #endif
   }else{
     assert(imm>0&&imm<65535);
-    #ifdef ARMv5_ONLY
+    #ifndef HAVE_ARMV7
     assem_debug("mov r14,#%d\n",imm&0xFF00);
     output_w32(0xe3a00000|rd_rn_imm_shift(HOST_TEMPREG,0,imm>>8,8));
     assem_debug("add r14,r14,#%d\n",imm&0xFF);
@@ -1344,6 +1360,14 @@ void emit_lsls_imm(int rs,int imm,int rt)
   output_w32(0xe1b00000|rd_rn_rm(rt,0,rs)|(imm<<7));
 }
 
+void emit_lslpls_imm(int rs,int imm,int rt) // emit "lslpls rt,rs,#imm": same opcode bits as emit_lsls_imm, but condition field 0x5 (PL) instead of 0xe (always)
+{
+  assert(imm>0); // only shift amounts 1..31 are accepted
+  assert(imm<32);
+  assem_debug("lslpls %s,%s,#%d\n",regname[rt],regname[rs],imm);
+  output_w32(0x51b00000|rd_rn_rm(rt,0,rs)|(imm<<7)); // shift amount is placed at bit 7 of the instruction word
+}
+
 void emit_shrimm(int rs,u_int imm,int rt)
 {
   assert(imm>0);
@@ -1394,7 +1418,7 @@ void emit_shrdimm(int rs,int rs2,u_int imm,int rt)
 
 void emit_signextend16(int rs,int rt)
 {
-  #ifdef ARMv5_ONLY
+  #ifndef HAVE_ARMV7
   emit_shlimm(rs,16,rt);
   emit_sarimm(rt,16,rt);
   #else
@@ -1405,7 +1429,7 @@ void emit_signextend16(int rs,int rt)
 
 void emit_signextend8(int rs,int rt)
 {
-  #ifdef ARMv5_ONLY
+  #ifndef HAVE_ARMV7
   emit_shlimm(rs,24,rt);
   emit_sarimm(rt,24,rt);
   #else
@@ -1493,20 +1517,12 @@ void emit_cmpimm(int rs,int imm)
     output_w32(0xe3700000|rd_rn_rm(0,rs,0)|armval);
   }else if(imm>0) {
     assert(imm<65536);
-    #ifdef ARMv5_ONLY
     emit_movimm(imm,HOST_TEMPREG);
-    #else
-    emit_movw(imm,HOST_TEMPREG);
-    #endif
     assem_debug("cmp %s,r14\n",regname[rs]);
     output_w32(0xe1500000|rd_rn_rm(0,rs,HOST_TEMPREG));
   }else{
     assert(imm>-65536);
-    #ifdef ARMv5_ONLY
     emit_movimm(-imm,HOST_TEMPREG);
-    #else
-    emit_movw(-imm,HOST_TEMPREG);
-    #endif
     assem_debug("cmn %s,r14\n",regname[rs]);
     output_w32(0xe1700000|rd_rn_rm(0,rs,HOST_TEMPREG));
   }
@@ -2286,7 +2302,7 @@ void emit_cmov2imm_e_ne_compact(int imm1,int imm2,u_int rt)
     output_w32(0x12400000|rd_rn_rm(rt,rt,0)|armval);
   }
   else {
-    #ifdef ARMv5_ONLY
+    #ifndef HAVE_ARMV7
     emit_movimm(imm1,rt);
     add_literal((int)out,imm2);
     assem_debug("ldrne %s,pc+? [=%x]\n",regname[rt],imm2);
@@ -2577,6 +2593,14 @@ void emit_andne_imm(int rs,int imm,int rt)
   output_w32(0x12000000|rd_rn_rm(rt,rs,0)|armval);
 }
 
+void emit_addpl_imm(int rs,int imm,int rt) // emit "addpl rt,rs,#imm": an add-immediate with condition field 0x5 (PL)
+{
+  u_int armval;
+  genimm_checked(imm,&armval); // presumably encodes imm as an ARM immediate operand and fails loudly if it cannot - confirm against genimm_checked
+  assem_debug("addpl %s,%s,#%d\n",regname[rt],regname[rs],imm);
+  output_w32(0x52800000|rd_rn_rm(rt,rs,0)|armval);
+}
+
 void emit_jno_unlikely(int a)
 {
   //emit_jno(a);
@@ -2609,13 +2633,13 @@ static void restore_regs_all(u_int reglist)
 // Save registers before function call
 static void save_regs(u_int reglist)
 {
-  reglist&=0x100f; // only save the caller-save registers, r0-r3, r12
+  reglist&=CALLER_SAVE_REGS; // only save the caller-save registers: r0-r3, r12 (plus r9 when __MACH__ is defined)
   save_regs_all(reglist);
 }
 // Restore registers after function call
 static void restore_regs(u_int reglist)
 {
-  reglist&=0x100f; // only restore the caller-save registers, r0-r3, r12
+  reglist&=CALLER_SAVE_REGS; // only restore the caller-save registers: r0-r3, r12 (plus r9 when __MACH__ is defined)
   restore_regs_all(reglist);
 }
 
@@ -2693,7 +2717,7 @@ void literal_pool_jumpover(int n)
   set_jump_target(jaddr,(int)out);
 }
 
-emit_extjump2(int addr, int target, int linker)
+emit_extjump2(u_int addr, int target, int linker)
 {
   u_char *ptr=(u_char *)addr;
   assert((ptr[3]&0x0e)==0xa);
@@ -2726,14 +2750,45 @@ emit_extjump_ds(int addr, int target)
 // put rt_val into rt, potentially making use of rs with value rs_val
 static void emit_movimm_from(u_int rs_val,int rs,u_int rt_val,int rt)
 {
-  u_int xor=rs_val^rt_val;
+  u_int armval;
+  int diff;
+  if(genimm(rt_val,&armval)) { // rt_val itself was accepted by genimm: a single mov, rs is not needed
+    assem_debug("mov %s,#%d\n",regname[rt],rt_val);
+    output_w32(0xe3a00000|rd_rn_rm(rt,0,0)|armval);
+    return;
+  }
+  if(genimm(~rt_val,&armval)) { // the bitwise complement was accepted: a single mvn
+    assem_debug("mvn %s,#%d\n",regname[rt],rt_val);
+    output_w32(0xe3e00000|rd_rn_rm(rt,0,0)|armval);
+    return;
+  }
+  diff=rt_val-rs_val; // otherwise try to reach rt_val from the value already held in rs
+  if(genimm(diff,&armval)) { // one add of the difference
+    assem_debug("add %s,%s,#%d\n",regname[rt],regname[rs],diff);
+    output_w32(0xe2800000|rd_rn_rm(rt,rs,0)|armval);
+    return;
+  }else if(genimm(-diff,&armval)) { // or one sub of the negated difference
+    assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rs],-diff);
+    output_w32(0xe2400000|rd_rn_rm(rt,rs,0)|armval);
+    return;
+  }
+  emit_movimm(rt_val,rt); // fallback: load the constant without using rs
+}
+
+// return 1 if emit_movimm_from() above can do its job cheaply, i.e. v2 is v1 plus or minus a small shifted constant
+static int is_similar_value(u_int v1,u_int v2)
+{
   u_int xs;
-  for(xs=xor;xs!=0&&(xs&3)==0;xs>>=2)
+  int diff;
+  if(v1==v2) return 1; // identical values: nothing to compute
+  diff=v2-v1;
+  for(xs=diff;xs!=0&&(xs&3)==0;xs>>=2) // drop trailing zero bit-pairs from the difference
     ;
-  if(xs<0x100)
-    emit_xorimm(rs,xor,rt);
-  else
-    emit_movimm(rt_val,rt);
+  if(xs<0x100) return 1; // what remains fits in 8 bits
+  for(xs=-diff;xs!=0&&(xs&3)==0;xs>>=2) // same check for the negated difference; NOTE(review): -diff overflows when diff==INT_MIN
+    ;
+  if(xs<0x100) return 1;
+  return 0;
 }
 
 // trashes r2
@@ -2800,7 +2855,7 @@ do_readstub(int n)
       temp=r; break;
     }
   }
-  if(rt>=0)
+  if(rt>=0&&rt1[i]!=0)
     reglist&=~(1<<rt);
   if(temp==-1) {
     save_regs(reglist);
@@ -2977,7 +3032,7 @@ inline_readstub(int type, int i, u_int addr, signed char regmap[], int target, i
     return;
   handler=get_direct_memhandler(mem_rtab,addr,type,&host_addr);
   if (handler==0) {
-    if(rt<0)
+    if(rt<0||rt1[i]==0)
       return;
     if(addr!=host_addr)
       emit_movimm_from(addr,rs,host_addr,rs);
@@ -3002,7 +3057,7 @@ inline_readstub(int type, int i, u_int addr, signed char regmap[], int target, i
   }
 
   // call a memhandler
-  if(rt>=0)
+  if(rt>=0&&rt1[i]!=0)
     reglist&=~(1<<rt);
   save_regs(reglist);
   if(target==0)
@@ -3033,7 +3088,7 @@ inline_readstub(int type, int i, u_int addr, signed char regmap[], int target, i
   else
     emit_call(handler);
 
-  if(rt>=0) {
+  if(rt>=0&&rt1[i]!=0) {
     switch(type) {
       case LOADB_STUB:  emit_signextend8(0,rt); break;
       case LOADBU_STUB: emit_andimm(0,0xff,rt); break;
@@ -3239,7 +3294,7 @@ do_writestub(int n)
     emit_writeword(rt,(int)&dword);
     emit_writeword(r?rth:rt,(int)&dword+4);
 #else
-    printf("STORED_STUB\n");
+    SysPrintf("STORED_STUB\n");
 #endif
   }
   //emit_pusha();
@@ -3348,7 +3403,7 @@ inline_writestub(int type, int i, u_int addr, signed char regmap[], int target,
     emit_writeword(rt,(int)&dword);
     emit_writeword(target?rth:rt,(int)&dword+4);
 #else
-    printf("STORED_STUB\n");
+    SysPrintf("STORED_STUB\n");
 #endif
   }
   //emit_pusha();
@@ -3525,7 +3580,7 @@ int do_dirty_stub(int i)
   addr=(u_int)source;
   #endif
   // Careful about the code output here, verify_dirty needs to parse it.
-  #ifdef ARMv5_ONLY
+  #ifndef HAVE_ARMV7
   emit_loadlp(addr,1);
   emit_loadlp((int)copy,2);
   emit_loadlp(slen*4,3);
@@ -3548,7 +3603,7 @@ int do_dirty_stub(int i)
 void do_dirty_stub_ds()
 {
   // Careful about the code output here, verify_dirty needs to parse it.
-  #ifdef ARMv5_ONLY
+  #ifndef HAVE_ARMV7
   emit_loadlp((int)start<(int)0xC0000000?(int)source:(int)start,1);
   emit_loadlp((int)copy,2);
   emit_loadlp(slen*4,3);
@@ -3671,12 +3726,12 @@ generate_map_const(u_int addr,int reg) {
 
 #else
 
-static int do_tlb_r() { return 0; }
-static int do_tlb_r_branch() { return 0; }
-static int gen_tlb_addr_r() { return 0; }
-static int do_tlb_w() { return 0; }
-static int do_tlb_w_branch() { return 0; }
-static int gen_tlb_addr_w() { return 0; }
+static int do_tlb_r(int a, ...) { return 0; } // stub: ignores its arguments and returns 0; presumably the build without TLB support - confirm against the matching #if
+static int do_tlb_r_branch(int a, ...) { return 0; } // stub, see do_tlb_r
+static int gen_tlb_addr_r(int a, ...) { return 0; } // stub, see do_tlb_r
+static int do_tlb_w(int a, ...) { return 0; } // stub, see do_tlb_r
+static int do_tlb_w_branch(int a, ...) { return 0; } // stub, see do_tlb_r
+static int gen_tlb_addr_w(int a, ...) { return 0; } // stub, see do_tlb_r
 
 #endif // DISABLE_TLB
 
@@ -3944,10 +3999,16 @@ static int emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override)
     type=0;
   }
   else if(type==MTYPE_1F80) { // scratchpad
-    emit_addimm(addr,-0x1f800000,HOST_TEMPREG);
-    emit_cmpimm(HOST_TEMPREG,0x1000);
-    jaddr=(int)out;
-    emit_jc(0);
+    if (psxH == (void *)0x1f800000) {
+      emit_addimm(addr,-0x1f800000,HOST_TEMPREG);
+      emit_cmpimm(HOST_TEMPREG,0x1000);
+      jaddr=(int)out;
+      emit_jc(0);
+    }
+    else {
+      // do usual RAM check, jump will go to the right handler
+      type=0;
+    }
   }
 #endif
 
@@ -3962,6 +4023,10 @@ static int emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override)
     else
     #endif
       emit_jno(0);
+    if(ram_offset!=0) {
+      emit_addimm(addr,ram_offset,HOST_TEMPREG);
+      addr=*addr_reg_override=HOST_TEMPREG;
+    }
   }
 
   return jaddr;
@@ -4013,6 +4078,10 @@ void loadlr_assemble_arm(int i,struct regstat *i_regs)
       jaddr=emit_fastpath_cmp_jump(i,temp2,&fastload_reg_override);
     }
     else {
+      if(ram_offset&&memtarget) {
+        emit_addimm(temp2,ram_offset,HOST_TEMPREG);
+        fastload_reg_override=HOST_TEMPREG;
+      }
       if (opcode[i]==0x22||opcode[i]==0x26) {
         emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR
       }else{
@@ -4180,10 +4249,10 @@ void cop0_assemble(int i,struct regstat *i_regs)
         emit_writeword(HOST_CCREG,(int)&last_count);
         emit_movimm(0,HOST_CCREG);
         emit_storereg(CCREG,HOST_CCREG);
-        if(s!=1)
-          emit_mov(s,1);
+        emit_loadreg(rs1[i],1);
         emit_movimm(copr,0);
         emit_call((int)pcsx_mtc0_ds);
+        emit_loadreg(rs1[i],s);
         return;
       }
 #endif
@@ -4195,7 +4264,9 @@ void cop0_assemble(int i,struct regstat *i_regs)
     //else if(copr==12&&is_delayslot) emit_call((int)MTC0_R12);
     //else
 #ifdef PCSX
-    if(s!=1)
+    if(s==HOST_CCREG)
+      emit_loadreg(rs1[i],1);
+    else if(s!=1)
       emit_mov(s,1);
     emit_movimm(copr,0);
     emit_call((int)pcsx_mtc0);
@@ -4204,23 +4275,21 @@ void cop0_assemble(int i,struct regstat *i_regs)
 #endif
     if(copr==9||copr==11||copr==12||copr==13) {
       emit_readword((int)&Count,HOST_CCREG);
-      emit_readword((int)&next_interupt,ECX);
+      emit_readword((int)&next_interupt,HOST_TEMPREG);
       emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
-      emit_sub(HOST_CCREG,ECX,HOST_CCREG);
-      emit_writeword(ECX,(int)&last_count);
+      emit_sub(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
+      emit_writeword(HOST_TEMPREG,(int)&last_count);
       emit_storereg(CCREG,HOST_CCREG);
     }
     if(copr==12||copr==13) {
       assert(!is_delayslot);
       emit_readword((int)&pending_exception,14);
+      emit_test(14,14);
+      emit_jne((int)&do_interrupt);
     }
     emit_loadreg(rs1[i],s);
     if(get_reg(i_regs->regmap,rs1[i]|64)>=0)
       emit_loadreg(rs1[i]|64,get_reg(i_regs->regmap,rs1[i]|64));
-    if(copr==12||copr==13) {
-      emit_test(14,14);
-      emit_jne((int)&do_interrupt);
-    }
     cop1_usable=0;
   }
   else
@@ -4343,7 +4412,16 @@ static void cop2_put_dreg(u_int copr,signed char sl,signed char temp)
     case 30:
       emit_movs(sl,temp);
       emit_mvnmi(temp,temp);
+#ifdef HAVE_ARMV5
       emit_clz(temp,temp);
+#else
+      emit_movs(temp,HOST_TEMPREG);
+      emit_movimm(0,temp);
+      emit_jeq((int)out+4*4);
+      emit_addpl_imm(temp,1,temp);
+      emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG);
+      emit_jns((int)out-2*4);
+#endif
       emit_writeword(sl,(int)&reg_cop2d[30]);
       emit_writeword(temp,(int)&reg_cop2d[31]);
       break;
@@ -4425,33 +4503,52 @@ static void c2op_epilogue(u_int op,u_int reglist)
   restore_regs_all(reglist);
 }
 
+static void c2op_call_MACtoIR(int lm,int need_flags) // emit a call to one of the four gteMACtoIR helpers, chosen by lm and need_flags
+{
+  if(need_flags)
+    emit_call((int)(lm?gteMACtoIR_lm1:gteMACtoIR_lm0)); // flag-producing variants
+  else
+    emit_call((int)(lm?gteMACtoIR_lm1_nf:gteMACtoIR_lm0_nf)); // _nf variants, used when flags are not needed
+}
+
+static void c2op_call_rgb_func(void *func,int lm,int need_ir,int need_flags) // emit: call func, optional MACtoIR step, then a gteMACtoRGB call
+{
+  emit_call((int)func);
+  // func is C code and trashes r0
+  emit_addimm(FP,(int)&psxRegs.CP2D.r[0]-(int)&dynarec_local,0); // reload r0 = FP + offset of psxRegs.CP2D.r[0] from dynarec_local
+  if(need_flags||need_ir) // MACtoIR is skipped only when neither flags nor IR results are needed
+    c2op_call_MACtoIR(lm,need_flags);
+  emit_call((int)(need_flags?gteMACtoRGB:gteMACtoRGB_nf));
+}
+
 static void c2op_assemble(int i,struct regstat *i_regs)
 {
   signed char temp=get_reg(i_regs->regmap,-1);
   u_int c2op=source[i]&0x3f;
-  u_int hr,reglist=0;
+  u_int hr,reglist_full=0,reglist;
   int need_flags,need_ir;
   for(hr=0;hr<HOST_REGS;hr++) {
-    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
+    if(i_regs->regmap[hr]>=0) reglist_full|=1<<hr;
   }
+  reglist=reglist_full&CALLER_SAVE_REGS;
 
   if (gte_handlers[c2op]!=NULL) {
     need_flags=!(gte_unneeded[i+1]>>63); // +1 because of how liveness detection works
     need_ir=(gte_unneeded[i+1]&0xe00)!=0xe00;
-    assem_debug("gte unneeded %016llx, need_flags %d, need_ir %d\n",
-      gte_unneeded[i+1],need_flags,need_ir);
-#ifdef ARMv5_ONLY
-    // let's take more risk here
-    need_flags=need_flags&&gte_reads_flags;
-#endif
+    assem_debug("gte op %08x, unneeded %016llx, need_flags %d, need_ir %d\n",
+      source[i],gte_unneeded[i+1],need_flags,need_ir);
+    if(new_dynarec_hacks&NDHACK_GTE_NO_FLAGS)
+      need_flags=0;
+    int shift = (source[i] >> 19) & 1;
+    int lm = (source[i] >> 10) & 1;
     switch(c2op) {
+#ifndef DRC_DBG
       case GTE_MVMVA: {
-        int shift = (source[i] >> 19) & 1;
+#ifdef HAVE_ARMV5
         int v  = (source[i] >> 15) & 3;
         int cv = (source[i] >> 13) & 3;
         int mx = (source[i] >> 17) & 3;
-        int lm = (source[i] >> 10) & 1;
-        reglist&=0x10ff; // +{r4-r7}
+        reglist=reglist_full&(CALLER_SAVE_REGS|0xf0); // +{r4-r7}
         c2op_prologue(c2op,reglist);
         /* r4,r5 = VXYZ(v) packed; r6 = &MX11(mx); r7 = &CV1(cv) */
         if(v<3)
@@ -4484,21 +4581,60 @@ static void c2op_assemble(int i,struct regstat *i_regs)
           emit_movimm(shift,1);
           emit_call((int)(need_flags?gteMVMVA_part_arm:gteMVMVA_part_nf_arm));
         }
-        if(need_flags||need_ir) {
-          if(need_flags)
-            emit_call((int)(lm?gteMACtoIR_lm1:gteMACtoIR_lm0));
-          else
-            emit_call((int)(lm?gteMACtoIR_lm1_nf:gteMACtoIR_lm0_nf)); // lm0 borked
-        }
+        if(need_flags||need_ir)
+          c2op_call_MACtoIR(lm,need_flags);
+#endif
+#else /* if not HAVE_ARMV5 */
+        c2op_prologue(c2op,reglist);
+        emit_movimm(source[i],1); // opcode
+        emit_writeword(1,(int)&psxRegs.code);
+        emit_call((int)(need_flags?gte_handlers[c2op]:gte_handlers_nf[c2op]));
 #endif
         break;
       }
-
+      case GTE_OP:
+        c2op_prologue(c2op,reglist);
+        emit_call((int)(shift?gteOP_part_shift:gteOP_part_noshift));
+        if(need_flags||need_ir) {
+          emit_addimm(FP,(int)&psxRegs.CP2D.r[0]-(int)&dynarec_local,0);
+          c2op_call_MACtoIR(lm,need_flags);
+        }
+        break;
+      case GTE_DPCS:
+        c2op_prologue(c2op,reglist);
+        c2op_call_rgb_func(shift?gteDPCS_part_shift:gteDPCS_part_noshift,lm,need_ir,need_flags);
+        break;
+      case GTE_INTPL:
+        c2op_prologue(c2op,reglist);
+        c2op_call_rgb_func(shift?gteINTPL_part_shift:gteINTPL_part_noshift,lm,need_ir,need_flags);
+        break;
+      case GTE_SQR:
+        c2op_prologue(c2op,reglist);
+        emit_call((int)(shift?gteSQR_part_shift:gteSQR_part_noshift));
+        if(need_flags||need_ir) {
+          emit_addimm(FP,(int)&psxRegs.CP2D.r[0]-(int)&dynarec_local,0);
+          c2op_call_MACtoIR(lm,need_flags);
+        }
+        break;
+      case GTE_DCPL:
+        c2op_prologue(c2op,reglist);
+        c2op_call_rgb_func(gteDCPL_part,lm,need_ir,need_flags);
+        break;
+      case GTE_GPF:
+        c2op_prologue(c2op,reglist);
+        c2op_call_rgb_func(shift?gteGPF_part_shift:gteGPF_part_noshift,lm,need_ir,need_flags);
+        break;
+      case GTE_GPL:
+        c2op_prologue(c2op,reglist);
+        c2op_call_rgb_func(shift?gteGPL_part_shift:gteGPL_part_noshift,lm,need_ir,need_flags);
+        break;
+#endif
       default:
-        reglist&=0x100f;
         c2op_prologue(c2op,reglist);
+#ifdef DRC_DBG
         emit_movimm(source[i],1); // opcode
         emit_writeword(1,(int)&psxRegs.code);
+#endif
         emit_call((int)(need_flags?gte_handlers[c2op]:gte_handlers_nf[c2op]));
         break;
     }
@@ -5193,9 +5329,16 @@ void multdiv_assemble_arm(int i,struct regstat *i_regs)
         emit_negmi(remainder,remainder); // .. remainder for div0 case (will be negated back after jump)
         emit_movs(d2,HOST_TEMPREG);
         emit_jeq((int)out+52); // Division by zero
-        emit_negmi(HOST_TEMPREG,HOST_TEMPREG);
+        emit_negsmi(HOST_TEMPREG,HOST_TEMPREG);
+#ifdef HAVE_ARMV5
         emit_clz(HOST_TEMPREG,quotient);
         emit_shl(HOST_TEMPREG,quotient,HOST_TEMPREG);
+#else
+        emit_movimm(0,quotient);
+        emit_addpl_imm(quotient,1,quotient);
+        emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG);
+        emit_jns((int)out-2*4);
+#endif
         emit_orimm(quotient,1<<31,quotient);
         emit_shr(quotient,quotient,quotient);
         emit_cmp(remainder,HOST_TEMPREG);
@@ -5222,9 +5365,17 @@ void multdiv_assemble_arm(int i,struct regstat *i_regs)
         emit_movimm(0xffffffff,quotient); // div0 case
         emit_test(d2,d2);
         emit_jeq((int)out+40); // Division by zero
+#ifdef HAVE_ARMV5
         emit_clz(d2,HOST_TEMPREG);
         emit_movimm(1<<31,quotient);
         emit_shl(d2,HOST_TEMPREG,d2);
+#else
+        emit_movimm(0,HOST_TEMPREG);
+        emit_addpl_imm(HOST_TEMPREG,1,HOST_TEMPREG);
+        emit_lslpls_imm(d2,1,d2);
+        emit_jns((int)out-2*4);
+        emit_movimm(1<<31,quotient);
+#endif
         emit_shr(quotient,HOST_TEMPREG,quotient);
         emit_cmp(remainder,d2);
         emit_subcs(remainder,d2,remainder);
@@ -5275,7 +5426,7 @@ void multdiv_assemble_arm(int i,struct regstat *i_regs)
         assert(m2h>=0);
         assert(m1l>=0);
         assert(m2l>=0);
-        save_regs(0x100f);
+        save_regs(CALLER_SAVE_REGS);
         if(m1l!=0) emit_mov(m1l,0);
         if(m1h==0) emit_readword((int)&dynarec_local,1);
         else if(m1h>1) emit_mov(m1h,1);
@@ -5284,7 +5435,7 @@ void multdiv_assemble_arm(int i,struct regstat *i_regs)
         if(m2h<3) emit_readword((int)&dynarec_local+m2h*4,3);
         else if(m2h>3) emit_mov(m2h,3);
         emit_call((int)&multu64);
-        restore_regs(0x100f);
+        restore_regs(CALLER_SAVE_REGS);
         signed char hih=get_reg(i_regs->regmap,HIREG|64);
         signed char hil=get_reg(i_regs->regmap,HIREG);
         signed char loh=get_reg(i_regs->regmap,LOREG|64);
@@ -5355,7 +5506,7 @@ void multdiv_assemble_arm(int i,struct regstat *i_regs)
         assert(d2h>=0);
         assert(d1l>=0);
         assert(d2l>=0);
-        save_regs(0x100f);
+        save_regs(CALLER_SAVE_REGS);
         if(d1l!=0) emit_mov(d1l,0);
         if(d1h==0) emit_readword((int)&dynarec_local,1);
         else if(d1h>1) emit_mov(d1h,1);
@@ -5364,7 +5515,7 @@ void multdiv_assemble_arm(int i,struct regstat *i_regs)
         if(d2h<3) emit_readword((int)&dynarec_local+d2h*4,3);
         else if(d2h>3) emit_mov(d2h,3);
         emit_call((int)&div64);
-        restore_regs(0x100f);
+        restore_regs(CALLER_SAVE_REGS);
         signed char hih=get_reg(i_regs->regmap,HIREG|64);
         signed char hil=get_reg(i_regs->regmap,HIREG);
         signed char loh=get_reg(i_regs->regmap,LOREG|64);
@@ -5388,7 +5539,7 @@ void multdiv_assemble_arm(int i,struct regstat *i_regs)
         assert(d2h>=0);
         assert(d1l>=0);
         assert(d2l>=0);
-        save_regs(0x100f);
+        save_regs(CALLER_SAVE_REGS);
         if(d1l!=0) emit_mov(d1l,0);
         if(d1h==0) emit_readword((int)&dynarec_local,1);
         else if(d1h>1) emit_mov(d1h,1);
@@ -5397,7 +5548,7 @@ void multdiv_assemble_arm(int i,struct regstat *i_regs)
         if(d2h<3) emit_readword((int)&dynarec_local+d2h*4,3);
         else if(d2h>3) emit_mov(d2h,3);
         emit_call((int)&divu64);
-        restore_regs(0x100f);
+        restore_regs(CALLER_SAVE_REGS);
         signed char hih=get_reg(i_regs->regmap,HIREG|64);
         signed char hil=get_reg(i_regs->regmap,HIREG);
         signed char loh=get_reg(i_regs->regmap,LOREG|64);
@@ -5455,7 +5606,7 @@ void do_miniht_jump(int rs,int rh,int ht) {
 }
 
 void do_miniht_insert(u_int return_address,int rt,int temp) {
-  #ifdef ARMv5_ONLY
+  #ifndef HAVE_ARMV7
   emit_movimm(return_address,rt); // PC into link register
   add_to_linker((int)out,return_address,1);
   emit_pcreladdr(temp);
@@ -5594,7 +5745,7 @@ void do_clear_cache()
       for(j=0;j<32;j++) 
       {
         if(bitmap&(1<<j)) {
-          start=BASE_ADDR+i*131072+j*4096;
+          start=(u_int)BASE_ADDR+i*131072+j*4096;
           end=start+4095;
           j++;
           while(j<32) {