82b7445ac874468d7ce4aede33d5d9c9560c89e9
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26
27 #include "emu_if.h" //emulator interface
28
29 //#define DISASM
30 //#define assem_debug printf
31 //#define inv_debug printf
32 #define assem_debug(...)
33 #define inv_debug(...)
34
35 #ifdef __i386__
36 #include "assem_x86.h"
37 #endif
38 #ifdef __x86_64__
39 #include "assem_x64.h"
40 #endif
41 #ifdef __arm__
42 #include "assem_arm.h"
43 #endif
44
#ifdef __BLACKBERRY_QNX__
#undef __clear_cache
// QNX has no usable __clear_cache builtin: flush/invalidate the icache range
// via msync with the QNX-specific cache-control flags instead.
#define __clear_cache(start,end) msync(start, (size_t)((void*)end - (void*)start), MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
#elif defined(__MACH__)
#include <libkern/OSCacheControl.h>
#define __clear_cache mach_clear_cache
// Darwin replacement for __clear_cache: flush the data cache so freshly
// emitted code reaches memory, then invalidate the instruction cache so the
// CPU refetches it.
static void __clear_cache(void *start, void *end) {
  size_t len = (char *)end - (char *)start;
  sys_dcache_flush(start, len);
  sys_icache_invalidate(start, len);
}
#endif
57
58 #define MAXBLOCK 4096
59 #define MAX_OUTPUT_BLOCK_SIZE 262144
60
// Per-instruction register-allocation state tracked by the recompiler.
struct regstat
{
  signed char regmap_entry[HOST_REGS]; // guest reg mapped to each host reg at block/insn entry (-1 = none)
  signed char regmap[HOST_REGS];       // current guest->host register mapping
  uint64_t was32;                      // bitmask: guest regs that were 32-bit (sign-extended) on entry
  uint64_t is32;                       // bitmask: guest regs that are 32-bit after this insn
  uint64_t wasdirty;                   // host regs dirty (need writeback) on entry
  uint64_t dirty;                      // host regs dirty after this insn
  uint64_t u;                          // unneeded guest regs (lower halves)
  uint64_t uu;                         // unneeded guest regs (upper halves)
  u_int wasconst;                      // host regs that held known constants on entry
  u_int isconst;                       // host regs that hold known constants now
  u_int loadedconst;             // host regs that have constants loaded
  u_int waswritten;              // MIPS regs that were used as store base before
};
76
// Linked-list node mapping a guest virtual address to compiled host code.
// Used by the jump_in/jump_out/jump_dirty page tables.
// note: asm depends on this layout
struct ll_entry
{
  u_int vaddr;          // guest virtual address of the block entry point
  u_int reg_sv_flags;   // register save/restore flags for this entry (see ll_add_flags)
  void *addr;           // host address of the compiled code
  struct ll_entry *next; // next entry in the same page bucket
};
85
  // --- Per-block compilation state (filled in by new_recompile_block) ---
  u_int start;                   // guest virtual address of the block being compiled
  u_int *source;                 // pointer to the guest instruction words
  u_int pagelimit;               // end of the page(s) the block may span
  char insn[MAXBLOCK][10];       // disassembled mnemonics (for debug output)
  u_char itype[MAXBLOCK];        // instruction type (see NOP/LOAD/... defines below)
  u_char opcode[MAXBLOCK];       // primary opcode field
  u_char opcode2[MAXBLOCK];      // secondary opcode (function) field
  u_char bt[MAXBLOCK];           // flag: insn is a branch target
  u_char rs1[MAXBLOCK];          // first source register
  u_char rs2[MAXBLOCK];          // second source register
  u_char rt1[MAXBLOCK];          // first target register
  u_char rt2[MAXBLOCK];          // second target register
  u_char us1[MAXBLOCK];          // regs whose upper half is a source
  u_char us2[MAXBLOCK];
  u_char dep1[MAXBLOCK];         // upper-half dependencies
  u_char dep2[MAXBLOCK];
  u_char lt1[MAXBLOCK];          // load target (address base) register
  static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
  static uint64_t gte_rt[MAXBLOCK];
  static uint64_t gte_unneeded[MAXBLOCK];
  static u_int smrv[32]; // speculated MIPS register values
  static u_int smrv_strong; // mask or regs that are likely to have correct values
  static u_int smrv_weak; // same, but somewhat less likely
  static u_int smrv_strong_next; // same, but after current insn executes
  static u_int smrv_weak_next;
  int imm[MAXBLOCK];             // sign/zero-extended immediate operand
  u_int ba[MAXBLOCK];            // branch target address (for branch insns)
  char likely[MAXBLOCK];         // flag: branch-likely form
  char is_ds[MAXBLOCK];          // flag: insn is in a delay slot
  char ooo[MAXBLOCK];            // flag: out-of-order delay-slot execution
  uint64_t unneeded_reg[MAXBLOCK];       // regs whose value is dead after this insn
  uint64_t unneeded_reg_upper[MAXBLOCK]; // same, upper 32-bit halves
  uint64_t branch_unneeded_reg[MAXBLOCK];
  uint64_t branch_unneeded_reg_upper[MAXBLOCK];
  uint64_t p32[MAXBLOCK];
  uint64_t pr32[MAXBLOCK];
  signed char regmap_pre[MAXBLOCK][HOST_REGS]; // register map before each insn
  static uint64_t current_constmap[HOST_REGS]; // constants currently tracked per host reg
  static uint64_t constmap[MAXBLOCK][HOST_REGS];
  static struct regstat regs[MAXBLOCK];        // allocation state per insn
  static struct regstat branch_regs[MAXBLOCK]; // allocation state on the branch-taken path
  signed char minimum_free_regs[MAXBLOCK];
  u_int needed_reg[MAXBLOCK];
  uint64_t requires_32bit[MAXBLOCK];
  u_int wont_dirty[MAXBLOCK];
  u_int will_dirty[MAXBLOCK];
  int ccadj[MAXBLOCK];           // cycle-count adjustment per insn
  int slen;                      // number of insns in the current block
  u_int instr_addr[MAXBLOCK];    // host address of each compiled insn
  u_int link_addr[MAXBLOCK][3];  // pending branch links to patch
  int linkcount;
  u_int stubs[MAXBLOCK*3][8];    // pending out-of-line stubs (see *_STUB defines)
  int stubcount;
  u_int literals[1024][2];       // literal pool entries
  int literalcount;
  int is_delayslot;              // currently assembling a delay slot
  int cop1_usable;               // COP1-usable check already emitted
  u_char *out;                   // current emit position in the translation cache
  // --- Global lookup structures ---
  struct ll_entry *jump_in[4096] __attribute__((aligned(16)));  // clean compiled entry points, per page
  struct ll_entry *jump_out[4096];   // exits that jump out of each page (for invalidation)
  struct ll_entry *jump_dirty[4096]; // invalidated blocks that may be revived, per vpage
  u_int hash_table[65536][4]  __attribute__((aligned(16)));     // 2-way hash: {vaddr,haddr,vaddr,haddr}
  char shadow[1048576]  __attribute__((aligned(16)));           // copy of source used to verify dirty blocks
  void *copy;
  int expirep;                   // translation-cache expiry pointer
#ifndef PCSX
  u_int using_tlb;
#else
  static const u_int using_tlb=0; // PSX has no TLB
#endif
  int new_dynarec_did_compile;
  int new_dynarec_hacks;
  u_int stop_after_jal;
#ifndef RAM_FIXED
  static u_int ram_offset;
#else
  static const u_int ram_offset=0;
#endif
  extern u_char restore_candidate[512]; // bitmap of pages with revivable dirty blocks
  extern int cycle_count;
166
  /* registers that may be allocated */
  /* 1-31 gpr */
  /* 32+ are pseudo-registers used internally by the allocator */
#define HIREG 32 // hi
#define LOREG 33 // lo
#define FSREG 34 // FPU status (FCSR)
#define CSREG 35 // Coprocessor status
#define CCREG 36 // Cycle count
#define INVCP 37 // Pointer to invalid_code
#define MMREG 38 // Pointer to memory_map
#define ROREG 39 // ram offset (if rdram!=0x80000000)
#define TEMPREG 40 // generic temporary (alias of FTEMP)
#define FTEMP 40 // FPU temporary register
#define PTEMP 41 // Prefetch temporary register
#define TLREG 42 // TLB mapping offset
#define RHASH 43 // Return address hash
#define RHTBL 44 // Return address hash table address
#define RTEMP 45 // JR/JALR address register
#define MAXREG 45
#define AGEN1 46 // Address generation temporary register
#define AGEN2 47 // Address generation temporary register
#define MGEN1 48 // Maptable address generation temporary register
#define MGEN2 49 // Maptable address generation temporary register
#define BTREG 50 // Branch target temporary register

  /* instruction types (values of itype[]) */
#define NOP 0     // No operation
#define LOAD 1    // Load
#define STORE 2   // Store
#define LOADLR 3  // Unaligned load
#define STORELR 4 // Unaligned store
#define MOV 5     // Move
#define ALU 6     // Arithmetic/logic
#define MULTDIV 7 // Multiply/divide
#define SHIFT 8   // Shift by register
#define SHIFTIMM 9// Shift by immediate
#define IMM16 10  // 16-bit immediate
#define RJUMP 11  // Unconditional jump to register
#define UJUMP 12  // Unconditional jump
#define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
#define SJUMP 14  // Conditional branch (regimm format)
#define COP0 15   // Coprocessor 0
#define COP1 16   // Coprocessor 1
#define C1LS 17   // Coprocessor 1 load/store
#define FJUMP 18  // Conditional branch (floating point)
#define FLOAT 19  // Floating point unit
#define FCONV 20  // Convert integer to float
#define FCOMP 21  // Floating point compare (sets FSREG)
#define SYSCALL 22// SYSCALL
#define OTHER 23  // Other
#define SPAN 24   // Branch/delay slot spans 2 pages
#define NI 25     // Not implemented
#define HLECALL 26// PCSX fake opcodes for HLE
#define COP2 27   // Coprocessor 2 move
#define C2LS 28   // Coprocessor 2 load/store
#define C2OP 29   // Coprocessor 2 operation
#define INTCALL 30// Call interpreter to handle rare corner cases

  /* stubs (out-of-line slow paths recorded in stubs[]) */
#define CC_STUB 1
#define FP_STUB 2
#define LOADB_STUB 3
#define LOADH_STUB 4
#define LOADW_STUB 5
#define LOADD_STUB 6
#define LOADBU_STUB 7
#define LOADHU_STUB 8
#define STOREB_STUB 9
#define STOREH_STUB 10
#define STOREW_STUB 11
#define STORED_STUB 12
#define STORELR_STUB 13
#define INVCODE_STUB 14

  /* branch codes */
#define TAKEN 1
#define NOTTAKEN 2
#define NULLDS 3
244
// asm linkage
// NOTE(review): the empty-parenthesis prototypes are pre-C23 "unspecified
// arguments" declarations; these routines appear to be implemented on the
// assembly side (assem_*.c / linkage asm) — confirm before changing.
int new_recompile_block(int addr);
void *get_addr_ht(u_int vaddr);
void invalidate_block(u_int block);
void invalidate_addr(u_int addr);
void remove_hash(int vaddr);
void jump_vaddr();
void dyna_linker();
void dyna_linker_ds();
void verify_code();
void verify_code_vm();
void verify_code_ds();
void cc_interrupt();
void fp_exception();
void fp_exception_ds();
void jump_syscall();
void jump_syscall_hle();
void jump_eret();
void jump_hlecall();
void jump_intcall();
void new_dyna_leave();

// TLB
void TLBWI_new();
void TLBWR_new();
void read_nomem_new();
void read_nomemb_new();
void read_nomemh_new();
void read_nomemd_new();
void write_nomem_new();
void write_nomemb_new();
void write_nomemh_new();
void write_nomemd_new();
void write_rdram_new();
void write_rdramb_new();
void write_rdramh_new();
void write_rdramd_new();
extern u_int memory_map[1048576];

// Needed by assembler
void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
void load_all_regs(signed char i_regmap[]);
void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
void load_regs_entry(int t);
void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
292
int tracedebug=0; // runtime trace toggle (debugging aid)

//#define DEBUG_CYCLE_COUNT 1

// NOTE(review): threshold used by cycle-accounting code elsewhere in this
// file — presumably short blocks below this length skip a penalty; confirm
// at its use sites.
#define NO_CYCLE_PENALTY_THR 12

int cycle_multiplier; // 100 for 1.0
300
301 static int CLOCK_ADJUST(int x)
302 {
303   int s=(x>>31)|1;
304   return (x * cycle_multiplier + s * 50) / 100;
305 }
306
// Game-specific TLB workarounds (N64 build only; compiled out for PCSX via
// DISABLE_TLB).
static void tlb_hacks()
{
#ifndef DISABLE_TLB
  // Goldeneye hack
  if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
  {
    u_int addr;
    int n;
    // Pick the ROM offset to map based on the region of the cartridge
    switch (ROM_HEADER->Country_code&0xFF)
    {
      case 0x45: // U
        addr=0x34b30;
        break;
      case 0x4A: // J
        addr=0x34b70;
        break;
      case 0x50: // E
        addr=0x329f0;
        break;
      default:
        // Unknown country code
        addr=0;
        break;
    }
    u_int rom_addr=(u_int)rom;
    #ifdef ROM_COPY
    // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
    // in the lower 4G of memory to use this hack.  Copy it if necessary.
    if((void *)rom>(void *)0xffffffff) {
      munmap(ROM_COPY, 67108864);
      if(mmap(ROM_COPY, 12582912,
              PROT_READ | PROT_WRITE,
              MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
              -1, 0) <= 0) {printf("mmap() failed\n");}
      memcpy(ROM_COPY,rom,12582912);
      rom_addr=(u_int)ROM_COPY;
    }
    #endif
    if(addr) {
      // Point pages 0x7F000-0x7FFFF directly at the ROM region, marked
      // read-only/compiled (0x40000000 flag)
      for(n=0x7F000;n<0x80000;n++) {
        memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
      }
    }
  }
#endif
}
353
// Map a guest virtual address to an index into the jump_in/jump_out/
// jump_dirty page tables.  Indices 0-2047 are direct; anything higher is
// hashed into the 2048-4095 range.
static u_int get_page(u_int vaddr)
{
#ifndef PCSX
  u_int page=(vaddr^0x80000000)>>12;
#else
  // Strip the KSEG segment bits, fold the PSX RAM mirrors together, then
  // convert to a 4K page number
  u_int page=vaddr&~0xe0000000;
  if (page < 0x1000000)
    page &= ~0x0e00000; // RAM mirrors
  page>>=12;
#endif
#ifndef DISABLE_TLB
  // TLB-mapped addresses use the physical page instead
  if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
#endif
  if(page>2048) page=2048+(page&2047); // hash out-of-range pages into the upper half
  return page;
}
370
#ifndef PCSX
// Like get_page(), but for jump_dirty: dirty blocks are filed under a hash
// of the *virtual* address so they can be found after TLB remapping.
static u_int get_vpage(u_int vaddr)
{
  u_int vpage=(vaddr^0x80000000)>>12;
#ifndef DISABLE_TLB
  if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
#endif
  if(vpage>2048) vpage=2048+(vpage&2047);
  return vpage;
}
#else
// no virtual mem in PCSX
static u_int get_vpage(u_int vaddr)
{
  return get_page(vaddr);
}
#endif
388
// Get address from virtual address
// This is called from the recompiled JR/JALR instructions
// Returns the host address of compiled code for vaddr, reviving a dirty
// block or compiling a new one as needed.
void *get_addr(u_int vaddr)
{
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  struct ll_entry *head;
  //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
  // 1) Look for a clean compiled block in this page
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
  //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
      // Promote the hit into the 2-way hash table (new entry takes way 0,
      // old way 0 is demoted to way 1)
      int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
      ht_bin[3]=ht_bin[1];
      ht_bin[2]=ht_bin[0];
      ht_bin[1]=(int)head->addr;
      ht_bin[0]=vaddr;
      return head->addr;
    }
    head=head->next;
  }
  // 2) Look for a dirty (invalidated) block whose source is still intact
  head=jump_dirty[vpage];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
      // Don't restore blocks which are about to expire from the cache
      if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
      if(verify_dirty(head->addr)) {
        //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
        // Source unchanged: revive the block
        invalid_code[vaddr>>12]=0;
        inv_code_start=inv_code_end=~0;
#ifndef DISABLE_TLB
        memory_map[vaddr>>12]|=0x40000000;
#endif
        if(vpage<2048) {
#ifndef DISABLE_TLB
          if(tlb_LUT_r[vaddr>>12]) {
            invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
            memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
          }
#endif
          restore_candidate[vpage>>3]|=1<<(vpage&7);
        }
        else restore_candidate[page>>3]|=1<<(page&7);
        // Insert/refresh the hash table entry
        int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
        if(ht_bin[0]==vaddr) {
          ht_bin[1]=(int)head->addr; // Replace existing entry
        }
        else
        {
          ht_bin[3]=ht_bin[1];
          ht_bin[2]=ht_bin[0];
          ht_bin[1]=(int)head->addr;
          ht_bin[0]=vaddr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
  // 3) Not compiled (or stale): compile it now and retry
  int r=new_recompile_block(vaddr);
  if(r==0) return get_addr(vaddr);
  // Execute in unmapped page, generate pagefault execption
  Status|=2;
  Cause=(vaddr<<31)|0x8;
  EPC=(vaddr&1)?vaddr-5:vaddr;
  BadVAddr=(vaddr&~1);
  Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
  EntryHi=BadVAddr&0xFFFFE000;
  return get_addr_ht(0x80000000);
}
461 // Look up address in hash table first
462 void *get_addr_ht(u_int vaddr)
463 {
464   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
465   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
466   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
467   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
468   return get_addr(vaddr);
469 }
470
471 void clear_all_regs(signed char regmap[])
472 {
473   int hr;
474   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
475 }
476
477 signed char get_reg(signed char regmap[],int r)
478 {
479   int hr;
480   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
481   return -1;
482 }
483
484 // Find a register that is available for two consecutive cycles
485 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
486 {
487   int hr;
488   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
489   return -1;
490 }
491
492 int count_free_regs(signed char regmap[])
493 {
494   int count=0;
495   int hr;
496   for(hr=0;hr<HOST_REGS;hr++)
497   {
498     if(hr!=EXCLUDE_REG) {
499       if(regmap[hr]<0) count++;
500     }
501   }
502   return count;
503 }
504
505 void dirty_reg(struct regstat *cur,signed char reg)
506 {
507   int hr;
508   if(!reg) return;
509   for (hr=0;hr<HOST_REGS;hr++) {
510     if((cur->regmap[hr]&63)==reg) {
511       cur->dirty|=1<<hr;
512     }
513   }
514 }
515
516 // If we dirty the lower half of a 64 bit register which is now being
517 // sign-extended, we need to dump the upper half.
518 // Note: Do this only after completion of the instruction, because
519 // some instructions may need to read the full 64-bit value even if
520 // overwriting it (eg SLTI, DSRA32).
521 static void flush_dirty_uppers(struct regstat *cur)
522 {
523   int hr,reg;
524   for (hr=0;hr<HOST_REGS;hr++) {
525     if((cur->dirty>>hr)&1) {
526       reg=cur->regmap[hr];
527       if(reg>=64) 
528         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
529     }
530   }
531 }
532
533 void set_const(struct regstat *cur,signed char reg,uint64_t value)
534 {
535   int hr;
536   if(!reg) return;
537   for (hr=0;hr<HOST_REGS;hr++) {
538     if(cur->regmap[hr]==reg) {
539       cur->isconst|=1<<hr;
540       current_constmap[hr]=value;
541     }
542     else if((cur->regmap[hr]^64)==reg) {
543       cur->isconst|=1<<hr;
544       current_constmap[hr]=value>>32;
545     }
546   }
547 }
548
549 void clear_const(struct regstat *cur,signed char reg)
550 {
551   int hr;
552   if(!reg) return;
553   for (hr=0;hr<HOST_REGS;hr++) {
554     if((cur->regmap[hr]&63)==reg) {
555       cur->isconst&=~(1<<hr);
556     }
557   }
558 }
559
560 int is_const(struct regstat *cur,signed char reg)
561 {
562   int hr;
563   if(reg<0) return 0;
564   if(!reg) return 1;
565   for (hr=0;hr<HOST_REGS;hr++) {
566     if((cur->regmap[hr]&63)==reg) {
567       return (cur->isconst>>hr)&1;
568     }
569   }
570   return 0;
571 }
572 uint64_t get_const(struct regstat *cur,signed char reg)
573 {
574   int hr;
575   if(!reg) return 0;
576   for (hr=0;hr<HOST_REGS;hr++) {
577     if(cur->regmap[hr]==reg) {
578       return current_constmap[hr];
579     }
580   }
581   SysPrintf("Unknown constant in r%d\n",reg);
582   exit(1);
583 }
584
// Least soon needed registers
// Look at the next ten instructions and see which registers
// will be used.  Try not to reallocate these.
// On return, hsn[r] holds the distance (in insns) to the next use of r;
// smaller means needed sooner.
void lsn(u_char hsn[], int i, int *preferred_reg)
{
  int j;
  int b=-1;
  // Determine the scan window: stop at block end or an unconditional jump
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
  }
  // Walk backwards so the nearest (smallest j) use wins
  for(;j>=0;j--)
  {
    if(rs1[i+j]) hsn[rs1[i+j]]=j;
    if(rs2[i+j]) hsn[rs2[i+j]]=j;
    if(rt1[i+j]) hsn[rt1[i+j]]=j;
    if(rt2[i+j]) hsn[rt2[i+j]]=j;
    if(itype[i+j]==STORE || itype[i+j]==STORELR) {
      // Stores can allocate zero
      hsn[rs1[i+j]]=j;
      hsn[rs2[i+j]]=j;
    }
    // On some architectures stores need invc_ptr
    #if defined(HOST_IMM8)
    if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
      hsn[INVCP]=j;
    }
    #endif
    // Branches need the cycle counter; remember the first branch position
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
    {
      hsn[CCREG]=j;
      b=j;
    }
  }
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        // Uses on the branch-taken path count, at distance j+b+2
        if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
        if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
        //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
        //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
      }
    }
    // TODO: preferred register based on backward branch
  }
  // Delay slot should preferably not overwrite branch conditions or cycle count
  if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
    if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
    if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
    hsn[CCREG]=1;
    // ...or hash tables
    hsn[RHASH]=1;
    hsn[RHTBL]=1;
  }
  // Coprocessor load/store needs FTEMP, even if not declared
  if(itype[i]==C1LS||itype[i]==C2LS) {
    hsn[FTEMP]=0;
  }
  // Load L/R also uses FTEMP as a temporary register
  if(itype[i]==LOADLR) {
    hsn[FTEMP]=0;
  }
  // Also SWL/SWR/SDL/SDR
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
    hsn[FTEMP]=0;
  }
  // Don't remove the TLB registers either
  if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS || itype[i]==C2LS) {
    hsn[TLREG]=0;
  }
  // Don't remove the miniht registers
  if(itype[i]==UJUMP||itype[i]==RJUMP)
  {
    hsn[RHASH]=0;
    hsn[RHTBL]=0;
  }
}
677
// We only want to allocate registers if we're going to use them again soon
// Returns 1 if guest register r is read within the next few instructions
// after insn i (and not marked unneeded first), else 0.
int needed_again(int r, int i)
{
  int j;
  int b=-1;
  int rn=10; // 10 == "not needed within the window"
  
  if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
  {
    if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
      return 0; // Don't need any registers if exiting the block
  }
  // Determine the scan window: stop at block end, unconditional jump,
  // or anything that calls out (syscall/HLE/interpreter)
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
    if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
    {
      break;
    }
  }
  // Walk backwards: nearest read wins; an "unneeded" mark cancels it
  for(;j>=1;j--)
  {
    if(rs1[i+j]==r) rn=j;
    if(rs2[i+j]==r) rn=j;
    if((unneeded_reg[i+j]>>r)&1) rn=10;
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
    {
      b=j;
    }
  }
  /*
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int o=rn;
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        if(!((unneeded_reg[t+j]>>r)&1)) {
          if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
          if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
        }
        else rn=o;
      }
    }
  }*/
  if(rn<10) return 1;
  return 0;
}
739
740 // Try to match register allocations at the end of a loop with those
741 // at the beginning
742 int loop_reg(int i, int r, int hr)
743 {
744   int j,k;
745   for(j=0;j<9;j++)
746   {
747     if(i+j>=slen) {
748       j=slen-i-1;
749       break;
750     }
751     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
752     {
753       // Don't go past an unconditonal jump
754       j++;
755       break;
756     }
757   }
758   k=0;
759   if(i>0){
760     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
761       k--;
762   }
763   for(;k<j;k++)
764   {
765     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
766     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
767     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
768     {
769       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
770       {
771         int t=(ba[i+k]-start)>>2;
772         int reg=get_reg(regs[t].regmap_entry,r);
773         if(reg>=0) return reg;
774         //reg=get_reg(regs[t+1].regmap_entry,r);
775         //if(reg>=0) return reg;
776       }
777     }
778   }
779   return hr;
780 }
781
782
783 // Allocate every register, preserving source/target regs
784 void alloc_all(struct regstat *cur,int i)
785 {
786   int hr;
787   
788   for(hr=0;hr<HOST_REGS;hr++) {
789     if(hr!=EXCLUDE_REG) {
790       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
791          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
792       {
793         cur->regmap[hr]=-1;
794         cur->dirty&=~(1<<hr);
795       }
796       // Don't need zeros
797       if((cur->regmap[hr]&63)==0)
798       {
799         cur->regmap[hr]=-1;
800         cur->dirty&=~(1<<hr);
801       }
802     }
803   }
804 }
805
806 #ifndef FORCE32
// 64-bit signed divide (DDIV): quotient to lo, remainder to hi.
// NOTE(review): no guard against divisor==0 or INT64_MIN/-1 — presumably
// the callers in the generated code never produce those; confirm before
// relying on this in new paths.
void div64(int64_t dividend,int64_t divisor)
{
  lo=dividend/divisor;
  hi=dividend%divisor;
  //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
  //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
}
// 64-bit unsigned divide (DDIVU): quotient to lo, remainder to hi.
// NOTE(review): divisor==0 is not guarded here either — see div64.
void divu64(uint64_t dividend,uint64_t divisor)
{
  lo=dividend/divisor;
  hi=dividend%divisor;
  //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
  //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
}
821
822 void mult64(uint64_t m1,uint64_t m2)
823 {
824    unsigned long long int op1, op2, op3, op4;
825    unsigned long long int result1, result2, result3, result4;
826    unsigned long long int temp1, temp2, temp3, temp4;
827    int sign = 0;
828    
829    if (m1 < 0)
830      {
831     op2 = -m1;
832     sign = 1 - sign;
833      }
834    else op2 = m1;
835    if (m2 < 0)
836      {
837     op4 = -m2;
838     sign = 1 - sign;
839      }
840    else op4 = m2;
841    
842    op1 = op2 & 0xFFFFFFFF;
843    op2 = (op2 >> 32) & 0xFFFFFFFF;
844    op3 = op4 & 0xFFFFFFFF;
845    op4 = (op4 >> 32) & 0xFFFFFFFF;
846    
847    temp1 = op1 * op3;
848    temp2 = (temp1 >> 32) + op1 * op4;
849    temp3 = op2 * op3;
850    temp4 = (temp3 >> 32) + op2 * op4;
851    
852    result1 = temp1 & 0xFFFFFFFF;
853    result2 = temp2 + (temp3 & 0xFFFFFFFF);
854    result3 = (result2 >> 32) + temp4;
855    result4 = (result3 >> 32);
856    
857    lo = result1 | (result2 << 32);
858    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
859    if (sign)
860      {
861     hi = ~hi;
862     if (!lo) hi++;
863     else lo = ~lo + 1;
864      }
865 }
866
867 void multu64(uint64_t m1,uint64_t m2)
868 {
869    unsigned long long int op1, op2, op3, op4;
870    unsigned long long int result1, result2, result3, result4;
871    unsigned long long int temp1, temp2, temp3, temp4;
872    
873    op1 = m1 & 0xFFFFFFFF;
874    op2 = (m1 >> 32) & 0xFFFFFFFF;
875    op3 = m2 & 0xFFFFFFFF;
876    op4 = (m2 >> 32) & 0xFFFFFFFF;
877    
878    temp1 = op1 * op3;
879    temp2 = (temp1 >> 32) + op1 * op4;
880    temp3 = op2 * op3;
881    temp4 = (temp3 >> 32) + op2 * op4;
882    
883    result1 = temp1 & 0xFFFFFFFF;
884    result2 = temp2 + (temp3 & 0xFFFFFFFF);
885    result3 = (result2 >> 32) + temp4;
886    result4 = (result3 >> 32);
887    
888    lo = result1 | (result2 << 32);
889    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
890    
891   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
892   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
893 }
894
895 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
896 {
897   if(bits) {
898     original<<=64-bits;
899     original>>=64-bits;
900     loaded<<=bits;
901     original|=loaded;
902   }
903   else original=loaded;
904   return original;
905 }
906 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
907 {
908   if(bits^56) {
909     original>>=64-(bits^56);
910     original<<=64-(bits^56);
911     loaded>>=bits^56;
912     original|=loaded;
913   }
914   else original=loaded;
915   return original;
916 }
917 #endif
918
919 #ifdef __i386__
920 #include "assem_x86.c"
921 #endif
922 #ifdef __x86_64__
923 #include "assem_x64.c"
924 #endif
925 #ifdef __arm__
926 #include "assem_arm.c"
927 #endif
928
929 // Add virtual address mapping to linked list
930 void ll_add(struct ll_entry **head,int vaddr,void *addr)
931 {
932   struct ll_entry *new_entry;
933   new_entry=malloc(sizeof(struct ll_entry));
934   assert(new_entry!=NULL);
935   new_entry->vaddr=vaddr;
936   new_entry->reg_sv_flags=0;
937   new_entry->addr=addr;
938   new_entry->next=*head;
939   *head=new_entry;
940 }
941
942 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
943 {
944   ll_add(head,vaddr,addr);
945   (*head)->reg_sv_flags=reg_sv_flags;
946 }
947
// Check if an address is already compiled
// but don't return addresses which are about to expire from the cache
void *check_addr(u_int vaddr)
{
  u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
  // Fast path: hash table hit, still clean and far from the expiry point
  if(ht_bin[0]==vaddr) {
    if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
      if(isclean(ht_bin[1])) return (void *)ht_bin[1];
  }
  if(ht_bin[2]==vaddr) {
    if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
      if(isclean(ht_bin[3])) return (void *)ht_bin[3];
  }
  // Slow path: search the per-page linked list
  u_int page=get_page(vaddr);
  struct ll_entry *head;
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
        // Update existing entry with current address
        if(ht_bin[0]==vaddr) {
          ht_bin[1]=(int)head->addr;
          return head->addr;
        }
        if(ht_bin[2]==vaddr) {
          ht_bin[3]=(int)head->addr;
          return head->addr;
        }
        // Insert into hash table with low priority.
        // Don't evict existing entries, as they are probably
        // addresses that are being accessed frequently.
        if(ht_bin[0]==-1) {
          ht_bin[1]=(int)head->addr;
          ht_bin[0]=vaddr;
        }else if(ht_bin[2]==-1) {
          ht_bin[3]=(int)head->addr;
          ht_bin[2]=vaddr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  return 0;
}
993
994 void remove_hash(int vaddr)
995 {
996   //printf("remove hash: %x\n",vaddr);
997   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
998   if(ht_bin[2]==vaddr) {
999     ht_bin[2]=ht_bin[3]=-1;
1000   }
1001   if(ht_bin[0]==vaddr) {
1002     ht_bin[0]=ht_bin[2];
1003     ht_bin[1]=ht_bin[3];
1004     ht_bin[2]=ht_bin[3]=-1;
1005   }
1006 }
1007
// Unlink and free every list entry whose compiled code lies in the
// (addr>>shift) region of the translation cache (used when that region is
// about to be reused/expired).  Also drops their hash table entries.
void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
{
  struct ll_entry *next;
  while(*head) {
    // An entry matches if its start, or its start minus the maximum block
    // size (a block may straddle the region boundary), falls in the region
    if(((u_int)((*head)->addr)>>shift)==(addr>>shift) ||
       ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
    {
      inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
      remove_hash((*head)->vaddr);
      next=(*head)->next;
      free(*head);
      *head=next; // splice the node out in place (pointer-to-pointer walk)
    }
    else
    {
      head=&((*head)->next);
    }
  }
}
1027
1028 // Remove all entries from linked list
1029 void ll_clear(struct ll_entry **head)
1030 {
1031   struct ll_entry *cur;
1032   struct ll_entry *next;
1033   if(cur=*head) {
1034     *head=0;
1035     while(cur) {
1036       next=cur->next;
1037       free(cur);
1038       cur=next;
1039     }
1040   }
1041 }
1042
1043 // Dereference the pointers and remove if it matches
1044 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1045 {
1046   while(head) {
1047     int ptr=get_pointer(head->addr);
1048     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1049     if(((ptr>>shift)==(addr>>shift)) ||
1050        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1051     {
1052       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1053       u_int host_addr=(u_int)kill_pointer(head->addr);
1054       #ifdef __arm__
1055         needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1056       #endif
1057     }
1058     head=head->next;
1059   }
1060 }
1061
1062 // This is called when we write to a compiled block (see do_invstub)
1063 void invalidate_page(u_int page)
1064 {
1065   struct ll_entry *head;
1066   struct ll_entry *next;
1067   head=jump_in[page];
1068   jump_in[page]=0;
1069   while(head!=NULL) {
1070     inv_debug("INVALIDATE: %x\n",head->vaddr);
1071     remove_hash(head->vaddr);
1072     next=head->next;
1073     free(head);
1074     head=next;
1075   }
1076   head=jump_out[page];
1077   jump_out[page]=0;
1078   while(head!=NULL) {
1079     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1080     u_int host_addr=(u_int)kill_pointer(head->addr);
1081     #ifdef __arm__
1082       needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1083     #endif
1084     next=head->next;
1085     free(head);
1086     head=next;
1087   }
1088 }
1089
// Invalidate the page containing 'block', plus the neighbouring pages
// (first..last, precomputed by the caller) that compiled code spans.
static void invalidate_block_range(u_int block, u_int first, u_int last)
{
  u_int page=get_page(block<<12);
  //printf("first=%d last=%d\n",first,last);
  invalidate_page(page);
  assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
  assert(last<page+5);
  // Invalidate the adjacent pages if a block crosses a 4K boundary
  while(first<page) {
    invalidate_page(first);
    first++;
  }
  // NOTE(review): the 'first<last' bound leaves page 'last' itself
  // untouched when last>page+1 -- confirm whether callers pass 'last' as
  // exclusive or inclusive before changing anything here.
  for(first=page+1;first<last;first++) {
    invalidate_page(first);
  }
  #ifdef __arm__
    do_clear_cache();
  #endif
  
  // Don't trap writes
  invalid_code[block]=1;
#ifndef DISABLE_TLB
  // If there is a valid TLB entry for this page, remove write protect
  if(tlb_LUT_w[block]) {
    assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
    // CHECK: Is this right?
    memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
    u_int real_block=tlb_LUT_w[block]>>12;
    invalid_code[real_block]=1;
    if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
  }
  else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
#endif

  // Linked-jump assumptions no longer hold, so flush the mini hash table
  #ifdef USE_MINI_HT
  memset(mini_ht,-1,sizeof(mini_ht));
  #endif
}
1128
// Invalidate the compiled code covering 4K page 'block'.
// Scans jump_dirty for blocks that overlap this page to determine how far
// the invalidation must extend into neighbouring pages (first/last), then
// delegates to invalidate_block_range().
void invalidate_block(u_int block)
{
  u_int page=get_page(block<<12);
  u_int vpage=get_vpage(block<<12);
  inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
  //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
  u_int first,last;
  first=last=page;
  struct ll_entry *head;
  head=jump_dirty[vpage];
  //printf("page=%d vpage=%d\n",page,vpage);
  while(head!=NULL) {
    u_int start,end;
    if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
      // get_bounds returns the source-address span the block was compiled from
      get_bounds((int)head->addr,&start,&end);
      //printf("start: %x end: %x\n",start,end);
      // Block compiled from RAM: widen first/last if its span crosses pages
      if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE) {
        if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
          if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
          if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
        }
      }
#ifndef DISABLE_TLB
      // Block compiled from TLB-mapped memory: translate through memory_map
      // (word offsets, hence no <<2 here since start already absorbs it)
      if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
        if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
          if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
          if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
        }
      }
#endif
    }
    head=head->next;
  }
  invalidate_block_range(block,first,last);
}
1164
// Invalidate compiled code after a write to 'addr'.
// On PCSX, RAM writes get a fast path: scan jump_dirty for blocks covering
// the address and invalidate only the affected range; on a miss, grow the
// inv_code_start..inv_code_end "known clean" window so the caller can skip
// future lookups for nearby writes.
void invalidate_addr(u_int addr)
{
#ifdef PCSX
  //static int rhits;
  // this check is done by the caller
  //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
  u_int page=get_vpage(addr);
  if(page<2048) { // RAM
    struct ll_entry *head;
    u_int addr_min=~0, addr_max=0;
    u_int mask=RAM_SIZE-1;
    u_int addr_main=0x80000000|(addr&mask);  // canonical KSEG0 alias of addr
    int pg1;
    inv_code_start=addr_main&~0xfff;
    inv_code_end=addr_main|0xfff;
    pg1=page;
    if (pg1>0) {
      // must check previous page too because of spans..
      pg1--;
      inv_code_start-=0x1000;
    }
    for(;pg1<=page;pg1++) {
      for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
        u_int start,end;
        get_bounds((int)head->addr,&start,&end);
        if(ram_offset) {
          start-=ram_offset;
          end-=ram_offset;
        }
        if(start<=addr_main&&addr_main<end) {
          // Hit: track the union of all overlapping block spans
          if(start<addr_min) addr_min=start;
          if(end>addr_max) addr_max=end;
        }
        else if(addr_main<start) {
          // Block lies above the write: shrink the clean window from above
          if(start<inv_code_end)
            inv_code_end=start-1;
        }
        else {
          // Block lies below the write: shrink the clean window from below
          if(end>inv_code_start)
            inv_code_start=end;
        }
      }
    }
    if (addr_min!=~0) {
      inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
      inv_code_start=inv_code_end=~0;
      invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
      return;
    }
    else {
      // Miss: publish the clean window in the caller's address segment
      inv_code_start=(addr&~mask)|(inv_code_start&mask);
      inv_code_end=(addr&~mask)|(inv_code_end&mask);
      inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
      return;
    }
  }
#endif
  invalidate_block(addr>>12);
}
1224
1225 // This is called when loading a save state.
1226 // Anything could have changed, so invalidate everything.
1227 void invalidate_all_pages()
1228 {
1229   u_int page,n;
1230   for(page=0;page<4096;page++)
1231     invalidate_page(page);
1232   for(page=0;page<1048576;page++)
1233     if(!invalid_code[page]) {
1234       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1235       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1236     }
1237   #ifdef __arm__
1238   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1239   #endif
1240   #ifdef USE_MINI_HT
1241   memset(mini_ht,-1,sizeof(mini_ht));
1242   #endif
1243   #ifndef DISABLE_TLB
1244   // TLB
1245   for(page=0;page<0x100000;page++) {
1246     if(tlb_LUT_r[page]) {
1247       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1248       if(!tlb_LUT_w[page]||!invalid_code[page])
1249         memory_map[page]|=0x40000000; // Write protect
1250     }
1251     else memory_map[page]=-1;
1252     if(page==0x80000) page=0xC0000;
1253   }
1254   tlb_hacks();
1255   #endif
1256 }
1257
1258 // Add an entry to jump_out after making a link
1259 void add_link(u_int vaddr,void *src)
1260 {
1261   u_int page=get_page(vaddr);
1262   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1263   int *ptr=(int *)(src+4);
1264   assert((*ptr&0x0fff0000)==0x059f0000);
1265   ll_add(jump_out+page,vaddr,src);
1266   //int ptr=get_pointer(src);
1267   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1268 }
1269
1270 // If a code block was found to be unmodified (bit was set in
1271 // restore_candidate) and it remains unmodified (bit is clear
1272 // in invalid_code) then move the entries for that 4K page from
1273 // the dirty list to the clean list.
// If a code block was found to be unmodified (bit was set in
// restore_candidate) and it remains unmodified (bit is clear
// in invalid_code) then move the entries for that 4K page from
// the dirty list to the clean list.
void clean_blocks(u_int page)
{
  struct ll_entry *head;
  inv_debug("INV: clean_blocks page=%d\n",page);
  head=jump_dirty[page];
  while(head!=NULL) {
    if(!invalid_code[head->vaddr>>12]) {
      // Don't restore blocks which are about to expire from the cache
      if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
        u_int start,end;
        // verify_dirty compares the source code against the copy taken at
        // compile time; only byte-identical blocks may be restored
        if(verify_dirty((int)head->addr)) {
          //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
          u_int i;
          u_int inv=0;
          get_bounds((int)head->addr,&start,&end);
          if(start-(u_int)rdram<RAM_SIZE) {
            // Any invalid page inside the block's span vetoes restoration
            for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
              inv|=invalid_code[i];
            }
          }
#ifndef DISABLE_TLB
          if((signed int)head->vaddr>=(signed int)0xC0000000) {
            u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
            //printf("addr=%x start=%x end=%x\n",addr,start,end);
            if(addr<start||addr>=end) inv=1;
          }
#endif
          // NOTE(review): with DISABLE_TLB defined this 'else' binds to the
          // RAM-range 'if' above instead of the TLB check -- the condition
          // chain differs between builds; confirm this is intended.
          else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
            inv=1;
          }
          if(!inv) {
            void * clean_addr=(void *)get_clean_addr((int)head->addr);
            // Again, skip blocks about to expire from the translation cache
            if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
              u_int ppage=page;
#ifndef DISABLE_TLB
              if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
#endif
              inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
              //printf("page=%x, addr=%x\n",page,head->vaddr);
              //assert(head->vaddr>>12==(page|0x80000));
              ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
              // Point existing hash entries at the clean version
              int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
              if(ht_bin[0]==head->vaddr) {
                ht_bin[1]=(int)clean_addr; // Replace existing entry
              }
              if(ht_bin[2]==head->vaddr) {
                ht_bin[3]=(int)clean_addr; // Replace existing entry
              }
            }
          }
        }
      }
    }
    head=head->next;
  }
}
1330
1331
1332 void mov_alloc(struct regstat *current,int i)
1333 {
1334   // Note: Don't need to actually alloc the source registers
1335   if((~current->is32>>rs1[i])&1) {
1336     //alloc_reg64(current,i,rs1[i]);
1337     alloc_reg64(current,i,rt1[i]);
1338     current->is32&=~(1LL<<rt1[i]);
1339   } else {
1340     //alloc_reg(current,i,rs1[i]);
1341     alloc_reg(current,i,rt1[i]);
1342     current->is32|=(1LL<<rt1[i]);
1343   }
1344   clear_const(current,rs1[i]);
1345   clear_const(current,rt1[i]);
1346   dirty_reg(current,rt1[i]);
1347 }
1348
// Register allocation for shift-immediate instructions.
// 32-bit shifts (SLL/SRL/SRA) propagate constants; the 64-bit variants
// (DSLL/DSRL/DSRA and the *32 forms) manage the is32 width bitmap.
void shiftimm_alloc(struct regstat *current,int i)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
      else lt1[i]=rs1[i];  // source dies here; remember it for this op only
      alloc_reg(current,i,rt1[i]);
      current->is32|=1LL<<rt1[i];
      dirty_reg(current,rt1[i]);
      // Constant propagation: fold the shift at compile time if possible
      if(is_const(current,rs1[i])) {
        int v=get_const(current,rs1[i]);
        if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
        if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);  // logical
        if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);         // arithmetic
      }
      else clear_const(current,rt1[i]);
    }
  }
  else
  {
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }

  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    if(rt1[i]) {
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
      alloc_reg64(current,i,rt1[i]);
      current->is32&=~(1LL<<rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    if(rt1[i]) {
      // Low 32 bits of the source become the high word of the result
      if(rs1[i]) alloc_reg(current,i,rs1[i]);
      alloc_reg64(current,i,rt1[i]);
      current->is32&=~(1LL<<rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    if(rt1[i]) {
      alloc_reg64(current,i,rs1[i]);
      if(imm[i]==32) {
        // Shift by exactly 32: result keeps the 64-bit upper half pattern
        alloc_reg64(current,i,rt1[i]);
        current->is32&=~(1LL<<rt1[i]);
      } else {
        // Logical shift >32 zeroes the upper bits: result is 32-bit
        alloc_reg(current,i,rt1[i]);
        current->is32|=1LL<<rt1[i];
      }
      dirty_reg(current,rt1[i]);
    }
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    if(rt1[i]) {
      // Arithmetic shift >=32: result is the sign-extended upper word
      alloc_reg64(current,i,rs1[i]);
      alloc_reg(current,i,rt1[i]);
      current->is32|=1LL<<rt1[i];
      dirty_reg(current,rt1[i]);
    }
  }
}
1416
// Register allocation for variable shifts (shift amount in rs2).
void shift_alloc(struct regstat *current,int i)
{
  if(rt1[i]) {
    if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
    {
      if(rs1[i]) alloc_reg(current,i,rs1[i]);
      if(rs2[i]) alloc_reg(current,i,rs2[i]);
      alloc_reg(current,i,rt1[i]);
      if(rt1[i]==rs2[i]) {
        // Destination aliases the shift amount: need a scratch register
        alloc_reg_temp(current,i,-1);
        minimum_free_regs[i]=1;
      }
      current->is32|=1LL<<rt1[i];
    } else { // DSLLV/DSRLV/DSRAV
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
      if(rs2[i]) alloc_reg(current,i,rs2[i]);
      alloc_reg64(current,i,rt1[i]);
      current->is32&=~(1LL<<rt1[i]);
      if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
      {
        alloc_reg_temp(current,i,-1);
        minimum_free_regs[i]=1;
      }
    }
    clear_const(current,rs1[i]);
    clear_const(current,rs2[i]);
    clear_const(current,rt1[i]);
    dirty_reg(current,rt1[i]);
  }
}
1447
// Register allocation for three-register ALU instructions.
// Tracks the is32 width bitmap: 32-bit ops set it, 64-bit ops clear it,
// and AND/OR/XOR/NOR derive the result width from the source widths.
void alu_alloc(struct regstat *current,int i)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else {
        // One operand is r0: only load sources still needed later
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
    }
    current->is32|=1LL<<rt1[i];
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      // Comparison needs full 64-bit operands unless both are known 32-bit
      if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
      {
        alloc_reg64(current,i,rs1[i]);
        alloc_reg64(current,i,rs2[i]);
        alloc_reg(current,i,rt1[i]);
      } else {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
        alloc_reg(current,i,rt1[i]);
      }
    }
    current->is32|=1LL<<rt1[i];
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else
      {
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
      if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
      {
        // Result may be 64-bit; allocate the upper half if it's live
        if(!((current->uu>>rt1[i])&1)) {
          alloc_reg64(current,i,rt1[i]);
        }
        if(get_reg(current->regmap,rt1[i]|64)>=0) {
          if(rs1[i]&&rs2[i]) {
            alloc_reg64(current,i,rs1[i]);
            alloc_reg64(current,i,rs2[i]);
          }
          else
          {
            // Is it really worth it to keep 64-bit values in registers?
            #ifdef NATIVE_64BIT
            if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
            if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
            #endif
          }
        }
        current->is32&=~(1LL<<rt1[i]);
      } else {
        current->is32|=1LL<<rt1[i];
      }
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        // Only produce 64-bit result if the upper half is actually needed
        if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
          alloc_reg64(current,i,rs1[i]);
          alloc_reg64(current,i,rs2[i]);
          alloc_reg64(current,i,rt1[i]);
        } else {
          alloc_reg(current,i,rs1[i]);
          alloc_reg(current,i,rs2[i]);
          alloc_reg(current,i,rt1[i]);
        }
      }
      else {
        alloc_reg(current,i,rt1[i]);
        if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
          // DADD used as move, or zeroing
          // If we have a 64-bit source, then make the target 64 bits too
          if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
            if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
            alloc_reg64(current,i,rt1[i]);
          } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
            if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
            alloc_reg64(current,i,rt1[i]);
          }
          if(opcode2[i]>=0x2e&&rs2[i]) {
            // DSUB used as negation - 64-bit result
            // If we have a 32-bit register, extend it to 64 bits
            if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
            alloc_reg64(current,i,rt1[i]);
          }
        }
      }
      // Width bookkeeping: result is 32-bit only if the surviving source was
      if(rs1[i]&&rs2[i]) {
        current->is32&=~(1LL<<rt1[i]);
      } else if(rs1[i]) {
        current->is32&=~(1LL<<rt1[i]);
        if((current->is32>>rs1[i])&1)
          current->is32|=1LL<<rt1[i];
      } else if(rs2[i]) {
        current->is32&=~(1LL<<rt1[i]);
        if((current->is32>>rs2[i])&1)
          current->is32|=1LL<<rt1[i];
      } else {
        current->is32|=1LL<<rt1[i];
      }
    }
  }
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  clear_const(current,rt1[i]);
  dirty_reg(current,rt1[i]);
}
1569
// Register allocation for immediate-operand instructions
// (DADDI/DADDIU, SLTI/SLTIU, ANDI/ORI/XORI, ADDI/ADDIU, LUI).
// Performs constant propagation for the logical and add immediates.
void imm16_alloc(struct regstat *current,int i)
{
  if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  else lt1[i]=rs1[i];  // source not needed again; note it for this op only
  if(rt1[i]) alloc_reg(current,i,rt1[i]);
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    current->is32&=~(1LL<<rt1[i]);
    if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
      // TODO: Could preserve the 32-bit flag if the immediate is zero
      alloc_reg64(current,i,rt1[i]);
      alloc_reg64(current,i,rs1[i]);
    }
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    // Comparison against a 64-bit source needs the full width
    if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
    current->is32|=1LL<<rt1[i];
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    // ORI/XORI on a 64-bit source keep the upper bits; ANDI always zeroes them
    if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
      if(rs1[i]!=rt1[i]) {
        if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
        alloc_reg64(current,i,rt1[i]);
        current->is32&=~(1LL<<rt1[i]);
      }
    }
    else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
    if(is_const(current,rs1[i])) {
      int v=get_const(current,rs1[i]);
      if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
      if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
      if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
    }
    else clear_const(current,rt1[i]);
  }
  else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(is_const(current,rs1[i])) {
      int v=get_const(current,rs1[i]);
      set_const(current,rt1[i],v+imm[i]);
    }
    else clear_const(current,rt1[i]);
    current->is32|=1LL<<rt1[i];
  }
  else {
    // LUI always produces a known constant
    set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
    current->is32|=1LL<<rt1[i];
  }
  dirty_reg(current,rt1[i]);
}
1622
// Register allocation for load instructions.
// Loads to r0 or to a dead register still allocate an address temporary,
// since the access itself must be performed (it may fault/invalidate).
void load_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
  if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  if(rt1[i]&&!((current->u>>rt1[i])&1)) {
    alloc_reg(current,i,rt1[i]);
    assert(get_reg(current->regmap,rt1[i])>=0);
    if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
    {
      current->is32&=~(1LL<<rt1[i]);
      alloc_reg64(current,i,rt1[i]);
    }
    else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      // Unaligned 64-bit loads merge with the old value: need everything
      current->is32&=~(1LL<<rt1[i]);
      alloc_reg64(current,i,rt1[i]);
      alloc_all(current,i);
      alloc_reg64(current,i,FTEMP);
      minimum_free_regs[i]=HOST_REGS;
    }
    else current->is32|=1LL<<rt1[i];
    dirty_reg(current,rt1[i]);
    // If using TLB, need a register for pointer to the mapping table
    if(using_tlb) alloc_reg(current,i,TLREG);
    // LWL/LWR need a temporary register for the old value
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP);
      alloc_reg_temp(current,i,-1);
      minimum_free_regs[i]=1;
    }
  }
  else
  {
    // Load to r0 or unneeded register (dummy load)
    // but we still need a register to calculate the address
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
    }
    // If using TLB, need a register for pointer to the mapping table
    if(using_tlb) alloc_reg(current,i,TLREG);
    alloc_reg_temp(current,i,-1);
    minimum_free_regs[i]=1;
    if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      alloc_all(current,i);
      alloc_reg64(current,i,FTEMP);
      minimum_free_regs[i]=HOST_REGS;
    }
  }
}
1677
// Register allocation for store instructions.
void store_alloc(struct regstat *current,int i)
{
  clear_const(current,rs2[i]);
  if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,rs2[i]);
  if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
    alloc_reg64(current,i,rs2[i]);
    if(rs2[i]) alloc_reg(current,i,FTEMP);
  }
  // If using TLB, need a register for pointer to the mapping table
  if(using_tlb) alloc_reg(current,i,TLREG);
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  // (note: this 'else' pairs with the using_tlb 'if' above)
  else alloc_reg(current,i,INVCP);
  #endif
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
    alloc_reg(current,i,FTEMP);
  }
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1701
// Register allocation for FPU (COP1) load/store.
void c1ls_alloc(struct regstat *current,int i)
{
  //clear_const(current,rs1[i]); // FIXME
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,CSREG); // Status
  alloc_reg(current,i,FTEMP);
  if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
    alloc_reg64(current,i,FTEMP);
  }
  // If using TLB, need a register for pointer to the mapping table
  if(using_tlb) alloc_reg(current,i,TLREG);
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  // NOTE(review): unlike c2ls_alloc/store_alloc this does not set
  // minimum_free_regs[i]=1 -- confirm whether that is intentional.
}
1722
// Register allocation for GTE (COP2) load/store: LWC2/SWC2.
void c2ls_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,FTEMP);  // holds the transferred value
  // If using TLB, need a register for pointer to the mapping table
  if(using_tlb) alloc_reg(current,i,TLREG);
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  // (this 'else if' pairs with the using_tlb 'if' above)
  else if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1739
#ifndef multdiv_alloc
// Register allocation for multiply/divide (results go to HI/LO).
// Overridable per target via the multdiv_alloc macro.
void multdiv_alloc(struct regstat *current,int i)
{
  //  case 0x18: MULT
  //  case 0x19: MULTU
  //  case 0x1A: DIV
  //  case 0x1B: DIVU
  //  case 0x1C: DMULT
  //  case 0x1D: DMULTU
  //  case 0x1E: DDIV
  //  case 0x1F: DDIVU
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  if(rs1[i]&&rs2[i])
  {
    if((opcode2[i]&4)==0) // 32-bit
    {
      // Mark HI/LO as needed so they get allocated
      current->u&=~(1LL<<HIREG);
      current->u&=~(1LL<<LOREG);
      alloc_reg(current,i,HIREG);
      alloc_reg(current,i,LOREG);
      alloc_reg(current,i,rs1[i]);
      alloc_reg(current,i,rs2[i]);
      current->is32|=1LL<<HIREG;
      current->is32|=1LL<<LOREG;
      dirty_reg(current,HIREG);
      dirty_reg(current,LOREG);
    }
    else // 64-bit
    {
      // 64-bit mul/div is done out of line and clobbers everything
      current->u&=~(1LL<<HIREG);
      current->u&=~(1LL<<LOREG);
      current->uu&=~(1LL<<HIREG);
      current->uu&=~(1LL<<LOREG);
      alloc_reg64(current,i,HIREG);
      //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
      alloc_reg64(current,i,rs1[i]);
      alloc_reg64(current,i,rs2[i]);
      alloc_all(current,i);
      current->is32&=~(1LL<<HIREG);
      current->is32&=~(1LL<<LOREG);
      dirty_reg(current,HIREG);
      dirty_reg(current,LOREG);
      minimum_free_regs[i]=HOST_REGS;
    }
  }
  else
  {
    // Multiply by zero is zero.
    // MIPS does not have a divide by zero exception.
    // The result is undefined, we return zero.
    alloc_reg(current,i,HIREG);
    alloc_reg(current,i,LOREG);
    current->is32|=1LL<<HIREG;
    current->is32|=1LL<<LOREG;
    dirty_reg(current,HIREG);
    dirty_reg(current,LOREG);
  }
}
#endif
1800
// Register allocation for COP0 (system control) instructions.
// All variants may trigger interrupts/exceptions, so they reserve every
// host register (alloc_all + minimum_free_regs=HOST_REGS).
void cop0_alloc(struct regstat *current,int i)
{
  if(opcode2[i]==0) // MFC0
  {
    if(rt1[i]) {
      clear_const(current,rt1[i]);
      alloc_all(current,i);
      alloc_reg(current,i,rt1[i]);
      current->is32|=1LL<<rt1[i];
      dirty_reg(current,rt1[i]);
    }
  }
  else if(opcode2[i]==4) // MTC0
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
      alloc_all(current,i);
    }
    else {
      alloc_all(current,i); // FIXME: Keep r0
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
  }
  else
  {
    // TLBR/TLBWI/TLBWR/TLBP/ERET
    assert(opcode2[i]==0x10);
    alloc_all(current,i);
  }
  minimum_free_regs[i]=HOST_REGS;
}
1834
// Register allocation for COP1 register moves (MFC1/DMFC1/CFC1 and
// MTC1/DMTC1/CTC1).  Always loads the coprocessor status register.
void cop1_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  if(opcode2[i]<3) // MFC1/DMFC1/CFC1
  {
    if(rt1[i]){
      clear_const(current,rt1[i]);
      if(opcode2[i]==1) {
        alloc_reg64(current,i,rt1[i]); // DMFC1
        current->is32&=~(1LL<<rt1[i]);
      }else{
        alloc_reg(current,i,rt1[i]); // MFC1/CFC1
        current->is32|=1LL<<rt1[i];
      }
      dirty_reg(current,rt1[i]);
    }
    alloc_reg_temp(current,i,-1);
  }
  else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      if(opcode2[i]==5)
        alloc_reg64(current,i,rs1[i]); // DMTC1
      else
        alloc_reg(current,i,rs1[i]); // MTC1/CTC1
      alloc_reg_temp(current,i,-1);
    }
    else {
      // Moving r0: allow allocating it explicitly
      current->u&=~1LL;
      alloc_reg(current,i,0);
      alloc_reg_temp(current,i,-1);
    }
  }
  minimum_free_regs[i]=1;
}
// Register allocation for FP conversion ops: status register plus one scratch.
void fconv_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
// Register allocation for FP arithmetic ops: status register plus one scratch.
void float_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
// Register allocation for GTE (COP2) operations: only a scratch register,
// the GTE state itself lives in memory.
void c2op_alloc(struct regstat *current,int i)
{
  alloc_reg_temp(current,i,-1);
}
// Register allocation for FP compares: status + condition-flag registers.
void fcomp_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  alloc_reg(current,i,FSREG); // Load flags
  dirty_reg(current,FSREG); // Flag will be modified
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1895
// Register allocation for SYSCALL/BREAK: takes an exception, so the cycle
// counter must be up to date and every host register is clobbered.
void syscall_alloc(struct regstat *current,int i)
{
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  alloc_all(current,i);
  minimum_free_regs[i]=HOST_REGS;
  current->isconst=0;
}
1904
// Dispatch register allocation for the instruction in a branch delay slot.
// A branch in a delay slot is invalid; in that case speculative
// precompilation is disabled for the rest of the block.
void delayslot_alloc(struct regstat *current,int i)
{
  switch(itype[i]) {
    case UJUMP:
    case CJUMP:
    case SJUMP:
    case RJUMP:
    case FJUMP:
    case SYSCALL:
    case HLECALL:
    case SPAN:
      assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
      SysPrintf("Disabled speculative precompilation\n");
      stop_after_jal=1;
      break;
    case IMM16:
      imm16_alloc(current,i);
      break;
    case LOAD:
    case LOADLR:
      load_alloc(current,i);
      break;
    case STORE:
    case STORELR:
      store_alloc(current,i);
      break;
    case ALU:
      alu_alloc(current,i);
      break;
    case SHIFT:
      shift_alloc(current,i);
      break;
    case MULTDIV:
      multdiv_alloc(current,i);
      break;
    case SHIFTIMM:
      shiftimm_alloc(current,i);
      break;
    case MOV:
      mov_alloc(current,i);
      break;
    case COP0:
      cop0_alloc(current,i);
      break;
    case COP1:
    case COP2:
      cop1_alloc(current,i);
      break;
    case C1LS:
      c1ls_alloc(current,i);
      break;
    case C2LS:
      c2ls_alloc(current,i);
      break;
    case FCONV:
      fconv_alloc(current,i);
      break;
    case FLOAT:
      float_alloc(current,i);
      break;
    case FCOMP:
      fcomp_alloc(current,i);
      break;
    case C2OP:
      c2op_alloc(current,i);
      break;
    // other itypes (e.g. NOP-like) need no allocation and fall through
  }
}
1973
// Special case where a branch and delay slot span two pages in virtual memory
// All constant tracking is discarded and every host register is claimed,
// then the source/target registers of the specific branch form are
// (re)allocated so the branch can be assembled across the page boundary.
static void pagespan_alloc(struct regstat *current,int i)
{
  current->isconst=0;
  current->wasconst=0;
  regs[i].wasconst=0;
  minimum_free_regs[i]=HOST_REGS;
  alloc_all(current,i);
  alloc_cc(current,i);  // cycle count is always needed for a branch
  dirty_reg(current,CCREG);
  if(opcode[i]==3) // JAL
  {
    alloc_reg(current,i,31);  // link register
    dirty_reg(current,31);
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    alloc_reg(current,i,rs1[i]);  // jump target
    if (rt1[i]!=0) {
      alloc_reg(current,i,rt1[i]);  // JALR link register
      dirty_reg(current,rt1[i]);
    }
  }
  if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
    if(rs2[i]) alloc_reg(current,i,rs2[i]);
    // if either comparison operand is 64-bit, compare full width
    if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
    {
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
      if(rs2[i]) alloc_reg64(current,i,rs2[i]);
    }
  }
  else
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
    if(!((current->is32>>rs1[i])&1))
    {
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
    }
  }
  else
  if(opcode[i]==0x11) // BC1
  {
    alloc_reg(current,i,FSREG);  // FPU condition flag
    alloc_reg(current,i,CSREG);  // coprocessor status
  }
  //else ...
}
2024
2025 add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
2026 {
2027   stubs[stubcount][0]=type;
2028   stubs[stubcount][1]=addr;
2029   stubs[stubcount][2]=retaddr;
2030   stubs[stubcount][3]=a;
2031   stubs[stubcount][4]=b;
2032   stubs[stubcount][5]=c;
2033   stubs[stubcount][6]=d;
2034   stubs[stubcount][7]=e;
2035   stubcount++;
2036 }
2037
// Write out a single register
// Scans the host register map for any host register currently holding
// guest register r (low half: regmap entry r, high half: r|64 via the
// &63 mask) and, if it is dirty, stores it back to the guest register
// file.  On 64-bit-capable builds (not FORCE32) a 32-bit value whose
// is32 bit is set also has its sign extension stored as the high half.
void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
{
  int hr;
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if((regmap[hr]&63)==r) {
        if((dirty>>hr)&1) {
          if(regmap[hr]<64) {
            emit_storereg(r,hr);
#ifndef FORCE32
            if((is32>>regmap[hr])&1) {
              // value is 32-bit: materialize the sign extension and
              // store it as the upper half (clobbers hr, which is fine
              // since the register is being written back)
              emit_sarimm(hr,31,hr);
              emit_storereg(r|64,hr);
            }
#endif
          }else{
            // regmap entry >= 64 holds the upper half of the register
            emit_storereg(r|64,hr);
          }
        }
      }
    }
  }
}
2062
2063 int mchecksum()
2064 {
2065   //if(!tracedebug) return 0;
2066   int i;
2067   int sum=0;
2068   for(i=0;i<2097152;i++) {
2069     unsigned int temp=sum;
2070     sum<<=1;
2071     sum|=(~temp)>>31;
2072     sum^=((u_int *)rdram)[i];
2073   }
2074   return sum;
2075 }
2076 int rchecksum()
2077 {
2078   int i;
2079   int sum=0;
2080   for(i=0;i<64;i++)
2081     sum^=((u_int *)reg)[i];
2082   return sum;
2083 }
// Trace-debugging dump of the guest register file to stdout.
// Each 64-bit guest register is printed high word first by indexing the
// two 32-bit halves directly (assumes little-endian host layout --
// TODO confirm, matches the rest of this file's x86/ARM targets).
void rlist()
{
  int i;
  printf("TRACE: ");
  for(i=0;i<32;i++)
    printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
  printf("\n");
#ifndef DISABLE_COP1
  // FPU registers, dumped through the simple-precision pointer table
  printf("TRACE: ");
  for(i=0;i<32;i++)
    printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
  printf("\n");
#endif
}
2098
// Turn on the trace-debugging output produced by memdebug()/rlist().
void enabletrace()
{
  tracedebug=1;
}
2103
// Trace-debugging hook: within a hard-coded cycle-count window, print a
// memory checksum, the register file, and a peek at the host stack.
// The (&i)[-1] / (&j)[N] accesses read past the local variable to show
// caller stack contents -- intentional (and non-portable) debug hack.
void memdebug(int i)
{
  //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
  //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
  //rlist();
  //if(tracedebug) {
  //if(Count>=-2084597794) {
  if((signed int)Count>=-2084597794&&(signed int)Count<0) {
  //if(0) {
    printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
    //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
    //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
    rlist();
    #ifdef __i386__
    printf("TRACE: %x\n",(&i)[-1]);
    #endif
    #ifdef __arm__
    int j;
    printf("TRACE: %x \n",(&j)[10]);
    printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
    #endif
    //fflush(stdout);
  }
  //printf("TRACE: %x\n",(&i)[-1]);
}
2129
2130 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2131 {
2132   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2133 }
2134
// Emit host code for the register-register ALU group (SPECIAL opcodes):
// ADD/ADDU/SUB/SUBU, DADD/DADDU/DSUB/DSUBU, SLT/SLTU, AND/OR/XOR/NOR.
// i is the instruction index, i_regs the register mapping in effect.
// Throughout: a guest register number of 0 means the constant zero, a
// host register number < 0 means "not mapped"; nothing is emitted when
// the destination is unmapped or rt1[i]==0.
void alu_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      signed char s1,s2,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      if(t>=0) {
        s1=get_reg(i_regs->regmap,rs1[i]);
        s2=get_reg(i_regs->regmap,rs2[i]);
        if(rs1[i]&&rs2[i]) {
          // both sources live: bit 1 of the opcode selects SUB vs ADD
          assert(s1>=0);
          assert(s2>=0);
          if(opcode2[i]&2) emit_sub(s1,s2,t);
          else emit_add(s1,s2,t);
        }
        else if(rs1[i]) {
          // rs2 is r0: result is just rs1 (rs1 +/- 0)
          if(s1>=0) emit_mov(s1,t);
          else emit_loadreg(rs1[i],t);
        }
        else if(rs2[i]) {
          // rs1 is r0: result is rs2 or its negation
          if(s2>=0) {
            if(opcode2[i]&2) emit_neg(s2,t);
            else emit_mov(s2,t);
          }
          else {
            emit_loadreg(rs2[i],t);
            if(opcode2[i]&2) emit_neg(t,t);
          }
        }
        else emit_zeroreg(t); // both sources are r0
      }
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    if(rt1[i]) {
      // 64-bit add/sub: low halves with flag-setting ops, high halves
      // consuming the carry/borrow (th<0 means upper half not needed)
      signed char s1l,s2l,s1h,s2h,tl,th;
      tl=get_reg(i_regs->regmap,rt1[i]);
      th=get_reg(i_regs->regmap,rt1[i]|64);
      if(tl>=0) {
        s1l=get_reg(i_regs->regmap,rs1[i]);
        s2l=get_reg(i_regs->regmap,rs2[i]);
        s1h=get_reg(i_regs->regmap,rs1[i]|64);
        s2h=get_reg(i_regs->regmap,rs2[i]|64);
        if(rs1[i]&&rs2[i]) {
          assert(s1l>=0);
          assert(s2l>=0);
          if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
          else emit_adds(s1l,s2l,tl);
          if(th>=0) {
            #ifdef INVERTED_CARRY
            if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
            #else
            if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
            #endif
            // NOTE(review): the high word of the add uses emit_add,
            // which ignores the carry from emit_adds -- looks wrong for
            // a true 64-bit DADD (expected an add-with-carry); likely
            // harmless for 32-bit PSX code where DADD never occurs, but
            // confirm against the upstream (mupen64plus) source.
            else emit_add(s1h,s2h,th);
          }
        }
        else if(rs1[i]) {
          // rs2 is r0: copy rs1 through (both halves)
          if(s1l>=0) emit_mov(s1l,tl);
          else emit_loadreg(rs1[i],tl);
          if(th>=0) {
            if(s1h>=0) emit_mov(s1h,th);
            else emit_loadreg(rs1[i]|64,th);
          }
        }
        else if(rs2[i]) {
          // rs1 is r0: copy or negate rs2 across 64 bits
          if(s2l>=0) {
            if(opcode2[i]&2) emit_negs(s2l,tl);
            else emit_mov(s2l,tl);
          }
          else {
            emit_loadreg(rs2[i],tl);
            if(opcode2[i]&2) emit_negs(tl,tl);
          }
          if(th>=0) {
            #ifdef INVERTED_CARRY
            if(s2h>=0) emit_mov(s2h,th);
            else emit_loadreg(rs2[i]|64,th);
            if(opcode2[i]&2) {
              emit_adcimm(-1,th); // x86 has inverted carry flag
              emit_not(th,th);
            }
            #else
            if(opcode2[i]&2) {
              // high half of 0 - rs2: reverse-subtract with carry
              if(s2h>=0) emit_rscimm(s2h,0,th);
              else {
                emit_loadreg(rs2[i]|64,th);
                emit_rscimm(th,0,th);
              }
            }else{
              if(s2h>=0) emit_mov(s2h,th);
              else emit_loadreg(rs2[i]|64,th);
            }
            #endif
          }
        }
        else {
          emit_zeroreg(tl);
          if(th>=0) emit_zeroreg(th);
        }
      }
    }
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      signed char s1l,s1h,s2l,s2h,t;
      // 64-bit comparison needed unless both operands were 32-bit
      if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
      {
        t=get_reg(i_regs->regmap,rt1[i]);
        //assert(t>=0);
        if(t>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s1h=get_reg(i_regs->regmap,rs1[i]|64);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          s2h=get_reg(i_regs->regmap,rs2[i]|64);
          if(rs2[i]==0) // rx<r0
          {
            assert(s1h>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_shrimm(s1h,31,t); // sign bit of the high word
            else // SLTU (unsigned can not be less than zero)
              emit_zeroreg(t);
          }
          else if(rs1[i]==0) // r0<rx
          {
            assert(s2h>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_gz64_32(s2h,s2l,t);
            else // SLTU (set if not zero)
              emit_set_nz64_32(s2h,s2l,t);
          }
          else {
            assert(s1l>=0);assert(s1h>=0);
            assert(s2l>=0);assert(s2h>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
            else // SLTU
              emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
          }
        }
      } else {
        // 32-bit comparison
        t=get_reg(i_regs->regmap,rt1[i]);
        //assert(t>=0);
        if(t>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs2[i]==0) // rx<r0
          {
            assert(s1l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_shrimm(s1l,31,t); // sign bit == "negative"
            else // SLTU (unsigned can not be less than zero)
              emit_zeroreg(t);
          }
          else if(rs1[i]==0) // r0<rx
          {
            assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_gz32(s2l,t);
            else // SLTU (set if not zero)
              emit_set_nz32(s2l,t);
          }
          else{
            assert(s1l>=0);assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_if_less32(s1l,s2l,t);
            else // SLTU
              emit_set_if_carry32(s1l,s2l,t);
          }
        }
      }
    }
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      signed char s1l,s1h,s2l,s2h,th,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      th=get_reg(i_regs->regmap,rt1[i]|64);
      // 64-bit form only when an operand was 64-bit AND the upper half
      // of the destination is mapped
      if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
      {
        assert(tl>=0);
        if(tl>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s1h=get_reg(i_regs->regmap,rs1[i]|64);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          s2h=get_reg(i_regs->regmap,rs2[i]|64);
          if(rs1[i]&&rs2[i]) {
            assert(s1l>=0);assert(s1h>=0);
            assert(s2l>=0);assert(s2h>=0);
            if(opcode2[i]==0x24) { // AND
              emit_and(s1l,s2l,tl);
              emit_and(s1h,s2h,th);
            } else
            if(opcode2[i]==0x25) { // OR
              emit_or(s1l,s2l,tl);
              emit_or(s1h,s2h,th);
            } else
            if(opcode2[i]==0x26) { // XOR
              emit_xor(s1l,s2l,tl);
              emit_xor(s1h,s2h,th);
            } else
            if(opcode2[i]==0x27) { // NOR
              emit_or(s1l,s2l,tl);
              emit_or(s1h,s2h,th);
              emit_not(tl,tl);
              emit_not(th,th);
            }
          }
          else
          {
            // one (or both) operands is r0: results collapse to a
            // copy, zero, or all-ones depending on the operation
            if(opcode2[i]==0x24) { // AND
              emit_zeroreg(tl);
              emit_zeroreg(th);
            } else
            if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
              if(rs1[i]){
                if(s1l>=0) emit_mov(s1l,tl);
                else emit_loadreg(rs1[i],tl);
                if(s1h>=0) emit_mov(s1h,th);
                else emit_loadreg(rs1[i]|64,th);
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_mov(s2l,tl);
                else emit_loadreg(rs2[i],tl);
                if(s2h>=0) emit_mov(s2h,th);
                else emit_loadreg(rs2[i]|64,th);
              }
              else{
                emit_zeroreg(tl);
                emit_zeroreg(th);
              }
            } else
            if(opcode2[i]==0x27) { // NOR
              if(rs1[i]){
                if(s1l>=0) emit_not(s1l,tl);
                else{
                  emit_loadreg(rs1[i],tl);
                  emit_not(tl,tl);
                }
                if(s1h>=0) emit_not(s1h,th);
                else{
                  emit_loadreg(rs1[i]|64,th);
                  emit_not(th,th);
                }
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_not(s2l,tl);
                else{
                  emit_loadreg(rs2[i],tl);
                  emit_not(tl,tl);
                }
                if(s2h>=0) emit_not(s2h,th);
                else{
                  emit_loadreg(rs2[i]|64,th);
                  emit_not(th,th);
                }
              }
              else {
                emit_movimm(-1,tl); // NOR of r0,r0 is all ones
                emit_movimm(-1,th);
              }
            }
          }
        }
      }
      else
      {
        // 32 bit
        if(tl>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs1[i]&&rs2[i]) {
            assert(s1l>=0);
            assert(s2l>=0);
            if(opcode2[i]==0x24) { // AND
              emit_and(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x25) { // OR
              emit_or(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x26) { // XOR
              emit_xor(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              emit_or(s1l,s2l,tl);
              emit_not(tl,tl);
            }
          }
          else
          {
            if(opcode2[i]==0x24) { // AND
              emit_zeroreg(tl);
            } else
            if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
              if(rs1[i]){
                if(s1l>=0) emit_mov(s1l,tl);
                else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_mov(s2l,tl);
                else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
              }
              else emit_zeroreg(tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              if(rs1[i]){
                if(s1l>=0) emit_not(s1l,tl);
                else {
                  emit_loadreg(rs1[i],tl);
                  emit_not(tl,tl);
                }
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_not(s2l,tl);
                else {
                  emit_loadreg(rs2[i],tl);
                  emit_not(tl,tl);
                }
              }
              else emit_movimm(-1,tl);
            }
          }
        }
      }
    }
  }
}
2466
// Emit host code for immediate-operand instructions: LUI, ADDI(U),
// DADDI(U), SLTI(U), ANDI/ORI/XORI.  Constant propagation is respected
// throughout: a destination already marked constant (isconst) needs no
// code, and a source whose value is known (wasconst, constmap) lets the
// result be materialized with a single move-immediate.
void imm16_assemble(int i,struct regstat *i_regs)
{
  if (opcode[i]==0x0f) { // LUI
    if(rt1[i]) {
      signed char t;
      t=get_reg(i_regs->regmap,rt1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(!((i_regs->isconst>>t)&1))
          emit_movimm(imm[i]<<16,t);
      }
    }
  }
  if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      if(rs1[i]) {
        //assert(t>=0);
        //assert(s>=0);
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1)) {
            if(s<0) {
              // source not in a register: load it into the
              // destination first unless it is already there
              if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
              emit_addimm(t,imm[i],t);
            }else{
              if(!((i_regs->wasconst>>s)&1))
                emit_addimm(s,imm[i],t);
              else
                emit_movimm(constmap[i][s]+imm[i],t); // fold constant
            }
          }
        }
      } else {
        // rs1 is r0: result is just the immediate
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1))
            emit_movimm(imm[i],t);
        }
      }
    }
  }
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    if(rt1[i]) {
      signed char sh,sl,th,tl;
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0) {
        if(rs1[i]) {
          assert(sh>=0);
          assert(sl>=0);
          if(th>=0) {
            emit_addimm64_32(sh,sl,imm[i],th,tl);
          }
          else {
            emit_addimm(sl,imm[i],tl);
          }
        } else {
          // r0 + imm: low half is the immediate, high half its sign
          emit_movimm(imm[i],tl);
          if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
        }
      }
    }
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    if(rt1[i]) {
      //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
      signed char sh,sl,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(rs1[i]>0) {
          if(sh<0) assert((i_regs->was32>>rs1[i])&1);
          if(sh<0||((i_regs->was32>>rs1[i])&1)) {
            // 32-bit compare against the immediate
            if(opcode[i]==0x0a) { // SLTI
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_slti32(t,imm[i],t);
              }else{
                emit_slti32(sl,imm[i],t);
              }
            }
            else { // SLTIU
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_sltiu32(t,imm[i],t);
              }else{
                emit_sltiu32(sl,imm[i],t);
              }
            }
          }else{ // 64-bit
            assert(sl>=0);
            if(opcode[i]==0x0a) // SLTI
              emit_slti64_32(sh,sl,imm[i],t);
            else // SLTIU
              emit_sltiu64_32(sh,sl,imm[i],t);
          }
        }else{
          // SLTI(U) with r0 is just stupid,
          // nonetheless examples can be found
          if(opcode[i]==0x0a) // SLTI
            if(0<imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          else // SLTIU
          {
            if(imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          }
        }
      }
    }
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(rt1[i]) {
      signed char sh,sl,th,tl;
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
        if(opcode[i]==0x0c) //ANDI
        {
          // immediate is zero-extended, so AND always clears the
          // upper half of the result
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
              emit_andimm(tl,imm[i],tl);
            }else{
              if(!((i_regs->wasconst>>sl)&1))
                emit_andimm(sl,imm[i],tl);
              else
                emit_movimm(constmap[i][sl]&imm[i],tl);
            }
          }
          else
            emit_zeroreg(tl);
          if(th>=0) emit_zeroreg(th);
        }
        else
        {
          // ORI/XORI: upper half passes through unchanged
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
            }
            if(th>=0) {
              if(sh<0) {
                emit_loadreg(rs1[i]|64,th);
              }else{
                emit_mov(sh,th);
              }
            }
            if(opcode[i]==0x0d) //ORI
            if(sl<0) {
              emit_orimm(tl,imm[i],tl);
            }else{
              if(!((i_regs->wasconst>>sl)&1))
                emit_orimm(sl,imm[i],tl);
              else
                emit_movimm(constmap[i][sl]|imm[i],tl);
            }
            if(opcode[i]==0x0e) //XORI
            if(sl<0) {
              emit_xorimm(tl,imm[i],tl);
            }else{
              if(!((i_regs->wasconst>>sl)&1))
                emit_xorimm(sl,imm[i],tl);
              else
                emit_movimm(constmap[i][sl]^imm[i],tl);
            }
          }
          else {
            // r0 OR/XOR imm == imm, with a zero upper half
            emit_movimm(imm[i],tl);
            if(th>=0) emit_zeroreg(th);
          }
        }
      }
    }
  }
}
2649
// Emit host code for shift-by-immediate instructions:
// SLL/SRL/SRA (32-bit), DSLL/DSRL/DSRA (64-bit) and the
// DSLL32/DSRL32/DSRA32 forms whose shift amount is imm+32.
void shiftimm_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0&&!((i_regs->isconst>>t)&1)){
        if(rs1[i]==0)
        {
          emit_zeroreg(t);
        }
        else
        {
          // if the source is not in a register, load it into the
          // destination and shift in place (s<0 ? t : s below)
          if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
          if(imm[i]) {
            if(opcode2[i]==0) // SLL
            {
              emit_shlimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==2) // SRL
            {
              emit_shrimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==3) // SRA
            {
              emit_sarimm(s<0?t:s,imm[i],t);
            }
          }else{
            // Shift by zero
            if(s>=0 && s!=t) emit_mov(s,t);
          }
        }
      }
      //emit_storereg(rt1[i],t); //DEBUG
    }
  }
  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    if(rt1[i]) {
      signed char sh,sl,th,tl;
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0) {
        if(rs1[i]==0)
        {
          emit_zeroreg(tl);
          if(th>=0) emit_zeroreg(th);
        }
        else
        {
          assert(sl>=0);
          assert(sh>=0);
          // 64-bit shifts use double-width shift emitters so bits
          // move between the two 32-bit halves
          if(imm[i]) {
            if(opcode2[i]==0x38) // DSLL
            {
              if(th>=0) emit_shldimm(sh,sl,imm[i],th);
              emit_shlimm(sl,imm[i],tl);
            }
            if(opcode2[i]==0x3a) // DSRL
            {
              emit_shrdimm(sl,sh,imm[i],tl);
              if(th>=0) emit_shrimm(sh,imm[i],th);
            }
            if(opcode2[i]==0x3b) // DSRA
            {
              emit_shrdimm(sl,sh,imm[i],tl);
              if(th>=0) emit_sarimm(sh,imm[i],th);
            }
          }else{
            // Shift by zero
            if(sl!=tl) emit_mov(sl,tl);
            if(th>=0&&sh!=th) emit_mov(sh,th);
          }
        }
      }
    }
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    // shift left by imm+32: the low source word becomes the high
    // result word (shifted by imm&31) and the low result word is zero
    if(rt1[i]) {
      signed char sl,tl,th;
      tl=get_reg(i_regs->regmap,rt1[i]);
      th=get_reg(i_regs->regmap,rt1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(th>=0||tl>=0){
        assert(tl>=0);
        assert(th>=0);
        assert(sl>=0);
        emit_mov(sl,th);
        emit_zeroreg(tl);
        if(imm[i]>32)
        {
          emit_shlimm(th,imm[i]&31,th);
        }
      }
    }
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    // shift right logical by imm+32: high source word becomes the low
    // result word, upper result word is zero
    if(rt1[i]) {
      signed char sh,tl,th;
      tl=get_reg(i_regs->regmap,rt1[i]);
      th=get_reg(i_regs->regmap,rt1[i]|64);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      if(tl>=0){
        assert(sh>=0);
        emit_mov(sh,tl);
        if(th>=0) emit_zeroreg(th);
        if(imm[i]>32)
        {
          emit_shrimm(tl,imm[i]&31,tl);
        }
      }
    }
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    // arithmetic variant of DSRL32; upper half is not written here
    if(rt1[i]) {
      signed char sh,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      if(tl>=0){
        assert(sh>=0);
        emit_mov(sh,tl);
        if(imm[i]>32)
        {
          emit_sarimm(tl,imm[i]&31,tl);
        }
      }
    }
  }
}
2787
#ifndef shift_assemble
// Fallback for ports that do not provide an architecture-specific
// shift_assemble (variable-amount shifts must be emitted by arch code).
// Consistency fix: report through the project's SysPrintf logger (used
// elsewhere in this file, e.g. delayslot_alloc) instead of bare printf,
// so the message reaches the front end before exiting.
void shift_assemble(int i,struct regstat *i_regs)
{
  SysPrintf("Need shift_assemble for this architecture.\n");
  exit(1);
}
#endif
2795
2796 void load_assemble(int i,struct regstat *i_regs)
2797 {
2798   int s,th,tl,addr,map=-1;
2799   int offset;
2800   int jaddr=0;
2801   int memtarget=0,c=0;
2802   int fastload_reg_override=0;
2803   u_int hr,reglist=0;
2804   th=get_reg(i_regs->regmap,rt1[i]|64);
2805   tl=get_reg(i_regs->regmap,rt1[i]);
2806   s=get_reg(i_regs->regmap,rs1[i]);
2807   offset=imm[i];
2808   for(hr=0;hr<HOST_REGS;hr++) {
2809     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2810   }
2811   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2812   if(s>=0) {
2813     c=(i_regs->wasconst>>s)&1;
2814     if (c) {
2815       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2816       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2817     }
2818   }
2819   //printf("load_assemble: c=%d\n",c);
2820   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2821   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2822 #ifdef PCSX
2823   if(tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80)
2824     ||rt1[i]==0) {
2825       // could be FIFO, must perform the read
2826       // ||dummy read
2827       assem_debug("(forced read)\n");
2828       tl=get_reg(i_regs->regmap,-1);
2829       assert(tl>=0);
2830   }
2831 #endif
2832   if(offset||s<0||c) addr=tl;
2833   else addr=s;
2834   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2835  if(tl>=0) {
2836   //printf("load_assemble: c=%d\n",c);
2837   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2838   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2839   reglist&=~(1<<tl);
2840   if(th>=0) reglist&=~(1<<th);
2841   if(!using_tlb) {
2842     if(!c) {
2843       #ifdef RAM_OFFSET
2844       map=get_reg(i_regs->regmap,ROREG);
2845       if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2846       #endif
2847 //#define R29_HACK 1
2848       #ifdef R29_HACK
2849       // Strmnnrmn's speed hack
2850       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2851       #endif
2852       {
2853         jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2854       }
2855     }
2856     else if(ram_offset&&memtarget) {
2857       emit_addimm(addr,ram_offset,HOST_TEMPREG);
2858       fastload_reg_override=HOST_TEMPREG;
2859     }
2860   }else{ // using tlb
2861     int x=0;
2862     if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2863     if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2864     map=get_reg(i_regs->regmap,TLREG);
2865     assert(map>=0);
2866     reglist&=~(1<<map);
2867     map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2868     do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2869   }
2870   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2871   if (opcode[i]==0x20) { // LB
2872     if(!c||memtarget) {
2873       if(!dummy) {
2874         #ifdef HOST_IMM_ADDR32
2875         if(c)
2876           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2877         else
2878         #endif
2879         {
2880           //emit_xorimm(addr,3,tl);
2881           //gen_tlb_addr_r(tl,map);
2882           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2883           int x=0,a=tl;
2884 #ifdef BIG_ENDIAN_MIPS
2885           if(!c) emit_xorimm(addr,3,tl);
2886           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2887 #else
2888           if(!c) a=addr;
2889 #endif
2890           if(fastload_reg_override) a=fastload_reg_override;
2891
2892           emit_movsbl_indexed_tlb(x,a,map,tl);
2893         }
2894       }
2895       if(jaddr)
2896         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2897     }
2898     else
2899       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2900   }
2901   if (opcode[i]==0x21) { // LH
2902     if(!c||memtarget) {
2903       if(!dummy) {
2904         #ifdef HOST_IMM_ADDR32
2905         if(c)
2906           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2907         else
2908         #endif
2909         {
2910           int x=0,a=tl;
2911 #ifdef BIG_ENDIAN_MIPS
2912           if(!c) emit_xorimm(addr,2,tl);
2913           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2914 #else
2915           if(!c) a=addr;
2916 #endif
2917           if(fastload_reg_override) a=fastload_reg_override;
2918           //#ifdef
2919           //emit_movswl_indexed_tlb(x,tl,map,tl);
2920           //else
2921           if(map>=0) {
2922             gen_tlb_addr_r(a,map);
2923             emit_movswl_indexed(x,a,tl);
2924           }else{
2925             #if 1 //def RAM_OFFSET
2926             emit_movswl_indexed(x,a,tl);
2927             #else
2928             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2929             #endif
2930           }
2931         }
2932       }
2933       if(jaddr)
2934         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2935     }
2936     else
2937       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2938   }
2939   if (opcode[i]==0x23) { // LW
2940     if(!c||memtarget) {
2941       if(!dummy) {
2942         int a=addr;
2943         if(fastload_reg_override) a=fastload_reg_override;
2944         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2945         #ifdef HOST_IMM_ADDR32
2946         if(c)
2947           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2948         else
2949         #endif
2950         emit_readword_indexed_tlb(0,a,map,tl);
2951       }
2952       if(jaddr)
2953         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2954     }
2955     else
2956       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2957   }
2958   if (opcode[i]==0x24) { // LBU
2959     if(!c||memtarget) {
2960       if(!dummy) {
2961         #ifdef HOST_IMM_ADDR32
2962         if(c)
2963           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2964         else
2965         #endif
2966         {
2967           //emit_xorimm(addr,3,tl);
2968           //gen_tlb_addr_r(tl,map);
2969           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2970           int x=0,a=tl;
2971 #ifdef BIG_ENDIAN_MIPS
2972           if(!c) emit_xorimm(addr,3,tl);
2973           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2974 #else
2975           if(!c) a=addr;
2976 #endif
2977           if(fastload_reg_override) a=fastload_reg_override;
2978
2979           emit_movzbl_indexed_tlb(x,a,map,tl);
2980         }
2981       }
2982       if(jaddr)
2983         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2984     }
2985     else
2986       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2987   }
2988   if (opcode[i]==0x25) { // LHU
2989     if(!c||memtarget) {
2990       if(!dummy) {
2991         #ifdef HOST_IMM_ADDR32
2992         if(c)
2993           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2994         else
2995         #endif
2996         {
2997           int x=0,a=tl;
2998 #ifdef BIG_ENDIAN_MIPS
2999           if(!c) emit_xorimm(addr,2,tl);
3000           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3001 #else
3002           if(!c) a=addr;
3003 #endif
3004           if(fastload_reg_override) a=fastload_reg_override;
3005           //#ifdef
3006           //emit_movzwl_indexed_tlb(x,tl,map,tl);
3007           //#else
3008           if(map>=0) {
3009             gen_tlb_addr_r(a,map);
3010             emit_movzwl_indexed(x,a,tl);
3011           }else{
3012             #if 1 //def RAM_OFFSET
3013             emit_movzwl_indexed(x,a,tl);
3014             #else
3015             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
3016             #endif
3017           }
3018         }
3019       }
3020       if(jaddr)
3021         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3022     }
3023     else
3024       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3025   }
3026   if (opcode[i]==0x27) { // LWU
3027     assert(th>=0);
3028     if(!c||memtarget) {
3029       if(!dummy) {
3030         int a=addr;
3031         if(fastload_reg_override) a=fastload_reg_override;
3032         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
3033         #ifdef HOST_IMM_ADDR32
3034         if(c)
3035           emit_readword_tlb(constmap[i][s]+offset,map,tl);
3036         else
3037         #endif
3038         emit_readword_indexed_tlb(0,a,map,tl);
3039       }
3040       if(jaddr)
3041         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3042     }
3043     else {
3044       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3045     }
3046     emit_zeroreg(th);
3047   }
3048   if (opcode[i]==0x37) { // LD
3049     if(!c||memtarget) {
3050       if(!dummy) {
3051         int a=addr;
3052         if(fastload_reg_override) a=fastload_reg_override;
3053         //gen_tlb_addr_r(tl,map);
3054         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
3055         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
3056         #ifdef HOST_IMM_ADDR32
3057         if(c)
3058           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3059         else
3060         #endif
3061         emit_readdword_indexed_tlb(0,a,map,th,tl);
3062       }
3063       if(jaddr)
3064         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3065     }
3066     else
3067       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3068   }
3069  }
3070   //emit_storereg(rt1[i],tl); // DEBUG
3071   //if(opcode[i]==0x23)
3072   //if(opcode[i]==0x24)
3073   //if(opcode[i]==0x23||opcode[i]==0x24)
3074   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
3075   {
3076     //emit_pusha();
3077     save_regs(0x100f);
3078         emit_readword((int)&last_count,ECX);
3079         #ifdef __i386__
3080         if(get_reg(i_regs->regmap,CCREG)<0)
3081           emit_loadreg(CCREG,HOST_CCREG);
3082         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3083         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3084         emit_writeword(HOST_CCREG,(int)&Count);
3085         #endif
3086         #ifdef __arm__
3087         if(get_reg(i_regs->regmap,CCREG)<0)
3088           emit_loadreg(CCREG,0);
3089         else
3090           emit_mov(HOST_CCREG,0);
3091         emit_add(0,ECX,0);
3092         emit_addimm(0,2*ccadj[i],0);
3093         emit_writeword(0,(int)&Count);
3094         #endif
3095     emit_call((int)memdebug);
3096     //emit_popa();
3097     restore_regs(0x100f);
3098   }/**/
3099 }
3100
#ifndef loadlr_assemble
// Fallback stub: each architecture port is expected to provide its own
// loadlr_assemble (unaligned LWL/LWR/LDL/LDR emission).  If no port-specific
// implementation was defined, report the omission and abort.
void loadlr_assemble(int i,struct regstat *i_regs)
{
  (void)i;       // unused by the fallback
  (void)i_regs;  // unused by the fallback
  printf("Need loadlr_assemble for this architecture.\n");
  exit(1);
}
#endif
3108
// Emit host code for a MIPS store instruction (SB/SH/SW/SD: opcodes
// 0x28/0x29/0x2B/0x3F).
//   i      - index of the instruction within the block being compiled
//   i_regs - register allocation state at this instruction
// Generates a fast path for stores that hit RAM, a stub (add_stub) or an
// inline call (inline_writestub) for the slow path, an invalid_code check
// so self-modifying code invalidates affected translated blocks, and a
// check for writes that land inside the block currently being compiled.
void store_assemble(int i,struct regstat *i_regs)
{
  int s,th,tl,map=-1;
  int addr,temp;
  int offset;
  int jaddr=0,jaddr2,type;
  int memtarget=0,c=0;
  int agr=AGEN1+(i&1);  // address-generation temp, alternates per slot
  int faststore_reg_override=0;
  u_int hr,reglist=0;
  // Host registers: th:tl hold the store data (th only used for 64-bit SD),
  // s holds the base register rs1 (or <0 if not register-allocated).
  th=get_reg(i_regs->regmap,rs2[i]|64);
  tl=get_reg(i_regs->regmap,rs2[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    // Base register is a known constant: decide statically whether the
    // target address is RAM (memtarget), letting us skip the range check.
    c=(i_regs->wasconst>>s)&1;
    if(c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
      if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
    }
  }
  assert(tl>=0);
  assert(temp>=0);
  // Live host registers that a stub must preserve.
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  // Use the precomputed address in 'temp' unless rs1 itself can serve as
  // the effective address (no offset, allocated, not constant-folded).
  if(offset||s<0||c) addr=temp;
  else addr=s;
  if(!using_tlb) {
    if(!c) {
      // Dynamic address: emit the RAM range check and a branch to the
      // slow-path stub (jaddr records the patch location).
      #ifndef PCSX
      #ifdef R29_HACK
      // Strmnnrmn's speed hack
      if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
      #endif
      emit_cmpimm(addr,RAM_SIZE);
      #ifdef DESTRUCTIVE_SHIFT
      if(s==addr) emit_mov(s,temp);
      #endif
      #ifdef R29_HACK
      memtarget=1;
      if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
      #endif
      {
        jaddr=(int)out;
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        // Hint to branch predictor that the branch is unlikely to be taken
        if(rs1[i]>=28)
          emit_jno_unlikely(0);
        else
        #endif
        emit_jno(0);
      }
      #else
        jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
      #endif
    }
    else if(ram_offset&&memtarget) {
      // Constant RAM address with a relocated RAM base: pre-add the offset.
      emit_addimm(addr,ram_offset,HOST_TEMPREG);
      faststore_reg_override=HOST_TEMPREG;
    }
  }else{ // using tlb
    int x=0;
    if (opcode[i]==0x28) x=3; // SB
    if (opcode[i]==0x29) x=2; // SH
    map=get_reg(i_regs->regmap,TLREG);
    assert(map>=0);
    reglist&=~(1<<map);
    map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
    do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
  }

  // Per-opcode fast-path emission.  'x' is the byte-swap displacement used
  // on big-endian MIPS hosts; 'a' is the effective address register after
  // any fast-path override.
  if (opcode[i]==0x28) { // SB
    if(!c||memtarget) {
      int x=0,a=temp;
#ifdef BIG_ENDIAN_MIPS
      if(!c) emit_xorimm(addr,3,temp);
      else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
#else
      if(!c) a=addr;
#endif
      if(faststore_reg_override) a=faststore_reg_override;
      //gen_tlb_addr_w(temp,map);
      //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
      emit_writebyte_indexed_tlb(tl,x,a,map,a);
    }
    type=STOREB_STUB;
  }
  if (opcode[i]==0x29) { // SH
    if(!c||memtarget) {
      int x=0,a=temp;
#ifdef BIG_ENDIAN_MIPS
      if(!c) emit_xorimm(addr,2,temp);
      else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
#else
      if(!c) a=addr;
#endif
      if(faststore_reg_override) a=faststore_reg_override;
      //#ifdef
      //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
      //#else
      if(map>=0) {
        gen_tlb_addr_w(a,map);
        emit_writehword_indexed(tl,x,a);
      }else
        //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
        emit_writehword_indexed(tl,x,a);
    }
    type=STOREH_STUB;
  }
  if (opcode[i]==0x2B) { // SW
    if(!c||memtarget) {
      int a=addr;
      if(faststore_reg_override) a=faststore_reg_override;
      //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
      emit_writeword_indexed_tlb(tl,0,a,map,temp);
    }
    type=STOREW_STUB;
  }
  if (opcode[i]==0x3F) { // SD
    if(!c||memtarget) {
      int a=addr;
      if(faststore_reg_override) a=faststore_reg_override;
      if(rs2[i]) {
        assert(th>=0);
        //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
        //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
        emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
      }else{
        // Store zero
        //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
        //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
        emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
      }
    }
    type=STORED_STUB;
  }
#ifdef PCSX
  if(jaddr) {
    // PCSX store handlers don't check invcode again
    reglist|=1<<addr;
    add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
    jaddr=0;
  }
#endif
  // Self-modifying-code detection: compare against the invalid_code bitmap
  // and call the invalidation path when the written page holds compiled code.
  if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
    if(!c||memtarget) {
      #ifdef DESTRUCTIVE_SHIFT
      // The x86 shift operation is 'destructive'; it overwrites the
      // source register, so we need to make a copy first and use that.
      addr=temp;
      #endif
      #if defined(HOST_IMM8)
      int ir=get_reg(i_regs->regmap,INVCP);
      assert(ir>=0);
      emit_cmpmem_indexedsr12_reg(ir,addr,1);
      #else
      emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
      #endif
      #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
      emit_callne(invalidate_addr_reg[addr]);
      #else
      jaddr2=(int)out;
      emit_jne(0);
      add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
      #endif
    }
  }
  // NOTE(review): constmap[i][s] is read here even when s<0 or !c; addr_val
  // is only consumed below under 'c' where it is meaningful — confirm that
  // s>=0 always holds whenever c is set.
  u_int addr_val=constmap[i][s]+offset;
  if(jaddr) {
    add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
  } else if(c&&!memtarget) {
    // Constant non-RAM target: call the write handler inline, no stub.
    inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
  }
  // basic current block modification detection..
  // not looking back as that should be in mips cache already
  if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
    SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
    assert(i_regs->regmap==regs[i].regmap); // not delay slot
    if(i_regs->regmap==regs[i].regmap) {
      // Flush state and bail out to the interpreter path after this store,
      // since the rest of this block may now be stale.
      load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
      wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
      emit_movimm(start+i*4+4,0);
      emit_writeword(0,(int)&pcaddr);
      emit_jmp((int)do_interrupt);
    }
  }
  //if(opcode[i]==0x2B || opcode[i]==0x3F)
  //if(opcode[i]==0x2B || opcode[i]==0x28)
  //if(opcode[i]==0x2B || opcode[i]==0x29)
  //if(opcode[i]==0x2B)
  /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
  {
    #ifdef __i386__
    emit_pusha();
    #endif
    #ifdef __arm__
    save_regs(0x100f);
    #endif
        emit_readword((int)&last_count,ECX);
        #ifdef __i386__
        if(get_reg(i_regs->regmap,CCREG)<0)
          emit_loadreg(CCREG,HOST_CCREG);
        emit_add(HOST_CCREG,ECX,HOST_CCREG);
        emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
        emit_writeword(HOST_CCREG,(int)&Count);
        #endif
        #ifdef __arm__
        if(get_reg(i_regs->regmap,CCREG)<0)
          emit_loadreg(CCREG,0);
        else
          emit_mov(HOST_CCREG,0);
        emit_add(0,ECX,0);
        emit_addimm(0,2*ccadj[i],0);
        emit_writeword(0,(int)&Count);
        #endif
    emit_call((int)memdebug);
    #ifdef __i386__
    emit_popa();
    #endif
    #ifdef __arm__
    restore_regs(0x100f);
    #endif
  }/**/
}
3337
// Emit host code for an unaligned MIPS store (SWL/SWR/SDL/SDR: opcodes
// 0x2A/0x2E/0x2C/0x2D).  The generated code tests the low two bits of the
// effective address at runtime and branches to one of four cases (0..3),
// each writing the appropriate partial bytes/halfwords; SDL/SDR also write
// the second word of the doubleword afterwards.  Ends with the usual
// slow-path stub and self-modifying-code (invalid_code) check.
//   i      - instruction index within the block
//   i_regs - register allocation state at this instruction
void storelr_assemble(int i,struct regstat *i_regs)
{
  int s,th,tl;
  int temp;       // effective-address work register (mutated freely below)
  int temp2;      // holds the carry-over word for SDL/SDR second write
  int offset;
  int jaddr=0,jaddr2;
  int case1,case2,case3;  // patch locations of the alignment-case branches
  int done0,done1,done2;  // patch locations of the per-case exit jumps
  int memtarget=0,c=0;
  int agr=AGEN1+(i&1);
  u_int hr,reglist=0;
  // th:tl hold the store data (th only meaningful for the 64-bit SDL/SDR),
  // s holds the base register rs1 if allocated.
  th=get_reg(i_regs->regmap,rs2[i]|64);
  tl=get_reg(i_regs->regmap,rs2[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    // Constant base: statically resolve whether the target is RAM.
    c=(i_regs->isconst>>s)&1;
    if(c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
      if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
    }
  }
  assert(tl>=0);
  // Live host registers that a stub must preserve.
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  assert(temp>=0);
  if(!using_tlb) {
    if(!c) {
      // Dynamic address: range-check, copy into temp, branch to stub on miss.
      emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
      if(!offset&&s!=temp) emit_mov(s,temp);
      jaddr=(int)out;
      emit_jno(0);
    }
    else
    {
      // Constant non-RAM target (or store through r0's address): always
      // take the slow path.
      if(!memtarget||!rs1[i]) {
        jaddr=(int)out;
        emit_jmp(0);
      }
    }
    // Translate the MIPS address in temp to a host pointer into rdram.
    #ifdef RAM_OFFSET
    int map=get_reg(i_regs->regmap,ROREG);
    if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
    gen_tlb_addr_w(temp,map);
    #else
    if((u_int)rdram!=0x80000000) 
      emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
    #endif
  }else{ // using tlb
    int map=get_reg(i_regs->regmap,TLREG);
    assert(map>=0);
    reglist&=~(1<<map);
    map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
    if(!c&&!offset&&s>=0) emit_mov(s,temp);
    do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
    if(!jaddr&&!memtarget) {
      jaddr=(int)out;
      emit_jmp(0);
    }
    gen_tlb_addr_w(temp,map);
  }

  if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
    temp2=get_reg(i_regs->regmap,FTEMP);
    // Storing r0: reuse tl as both data halves and the carry register.
    if(!rs2[i]) temp2=th=tl;
  }

#ifndef BIG_ENDIAN_MIPS
    // Little-endian host: flip the byte lane so the big-endian-oriented
    // case logic below works unchanged.
    emit_xorimm(temp,3,temp);
#endif
  // Dispatch on the low two address bits: bit1 selects case 2/3, bit0
  // selects case 1/3.
  emit_testimm(temp,2);
  case2=(int)out;
  emit_jne(0);
  emit_testimm(temp,1);
  case1=(int)out;
  emit_jne(0);
  // 0
  if (opcode[i]==0x2A) { // SWL
    emit_writeword_indexed(tl,0,temp);
  }
  if (opcode[i]==0x2E) { // SWR
    emit_writebyte_indexed(tl,3,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    emit_writeword_indexed(th,0,temp);
    if(rs2[i]) emit_mov(tl,temp2);
  }
  if (opcode[i]==0x2D) { // SDR
    emit_writebyte_indexed(tl,3,temp);
    if(rs2[i]) emit_shldimm(th,tl,24,temp2);
  }
  done0=(int)out;
  emit_jmp(0);
  // 1
  set_jump_target(case1,(int)out);
  if (opcode[i]==0x2A) { // SWL
    // Write 3 msb into three least significant bytes
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writebyte_indexed(tl,1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
  }
  if (opcode[i]==0x2E) { // SWR
    // Write two lsb into two most significant bytes
    emit_writehword_indexed(tl,1,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
    // Write 3 msb into three least significant bytes
    if(rs2[i]) emit_rorimm(th,8,th);
    emit_writehword_indexed(th,-1,temp);
    if(rs2[i]) emit_rorimm(th,16,th);
    emit_writebyte_indexed(th,1,temp);
    if(rs2[i]) emit_rorimm(th,8,th);
  }
  if (opcode[i]==0x2D) { // SDR
    if(rs2[i]) emit_shldimm(th,tl,16,temp2);
    // Write two lsb into two most significant bytes
    emit_writehword_indexed(tl,1,temp);
  }
  done1=(int)out;
  emit_jmp(0);
  // 2
  set_jump_target(case2,(int)out);
  emit_testimm(temp,1);
  case3=(int)out;
  emit_jne(0);
  if (opcode[i]==0x2A) { // SWL
    // Write two msb into two least significant bytes
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writehword_indexed(tl,-2,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
  }
  if (opcode[i]==0x2E) { // SWR
    // Write 3 lsb into three most significant bytes
    emit_writebyte_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,0,temp);
    if(rs2[i]) emit_rorimm(tl,24,tl);
  }
  if (opcode[i]==0x2C) { // SDL
    if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
    // Write two msb into two least significant bytes
    if(rs2[i]) emit_rorimm(th,16,th);
    emit_writehword_indexed(th,-2,temp);
    if(rs2[i]) emit_rorimm(th,16,th);
  }
  if (opcode[i]==0x2D) { // SDR
    if(rs2[i]) emit_shldimm(th,tl,8,temp2);
    // Write 3 lsb into three most significant bytes
    emit_writebyte_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,0,temp);
    if(rs2[i]) emit_rorimm(tl,24,tl);
  }
  done2=(int)out;
  emit_jmp(0);
  // 3
  set_jump_target(case3,(int)out);
  if (opcode[i]==0x2A) { // SWL
    // Write msb into least significant byte
    if(rs2[i]) emit_rorimm(tl,24,tl);
    emit_writebyte_indexed(tl,-3,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
  }
  if (opcode[i]==0x2E) { // SWR
    // Write entire word
    emit_writeword_indexed(tl,-3,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
    // Write msb into least significant byte
    if(rs2[i]) emit_rorimm(th,24,th);
    emit_writebyte_indexed(th,-3,temp);
    if(rs2[i]) emit_rorimm(th,8,th);
  }
  if (opcode[i]==0x2D) { // SDR
    if(rs2[i]) emit_mov(th,temp2);
    // Write entire word
    emit_writeword_indexed(tl,-3,temp);
  }
  set_jump_target(done0,(int)out);
  set_jump_target(done1,(int)out);
  set_jump_target(done2,(int)out);
  // SDL/SDR: write the carried-over word (temp2) into the other half of
  // the aligned doubleword, selected by address bit 2.
  if (opcode[i]==0x2C) { // SDL
    emit_testimm(temp,4);
    done0=(int)out;
    emit_jne(0);
    emit_andimm(temp,~3,temp);
    emit_writeword_indexed(temp2,4,temp);
    set_jump_target(done0,(int)out);
  }
  if (opcode[i]==0x2D) { // SDR
    emit_testimm(temp,4);
    done0=(int)out;
    emit_jeq(0);
    emit_andimm(temp,~3,temp);
    emit_writeword_indexed(temp2,-4,temp);
    set_jump_target(done0,(int)out);
  }
  if(!c||!memtarget)
    add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
  // Self-modifying-code check: convert the host pointer back to a MIPS
  // address, then test the invalid_code bitmap for the written page.
  if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
    #ifdef RAM_OFFSET
    int map=get_reg(i_regs->regmap,ROREG);
    if(map<0) map=HOST_TEMPREG;
    gen_orig_addr_w(temp,map);
    #else
    emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
    #endif
    #if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,temp,1);
    #else
    emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
    #endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[temp]);
    #else
    jaddr2=(int)out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
    #endif
  }
  /*
    emit_pusha();
    //save_regs(0x100f);
        emit_readword((int)&last_count,ECX);
        if(get_reg(i_regs->regmap,CCREG)<0)
          emit_loadreg(CCREG,HOST_CCREG);
        emit_add(HOST_CCREG,ECX,HOST_CCREG);
        emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
        emit_writeword(HOST_CCREG,(int)&Count);
    emit_call((int)memdebug);
    emit_popa();
    //restore_regs(0x100f);
  /**/
}
3582
// Emit host code for COP1 (FPU) memory instructions: LWC1/LDC1 (opcodes
// 0x31/0x35) and SWC1/SDC1 (0x39/0x3D).  Checks that COP1 is usable
// (Status.CU1, emitting an FP_STUB trap on first use), reads the target
// FPR's memory address from reg_cop1_simple/reg_cop1_double, and performs
// the access with the usual fast path + slow-path stub and an invalid_code
// check for the store forms.  With DISABLE_COP1 defined (PCSX build) this
// simply raises the coprocessor-unusable exception.
//   i      - instruction index within the block
//   i_regs - register allocation state at this instruction
void c1ls_assemble(int i,struct regstat *i_regs)
{
#ifndef DISABLE_COP1
  int s,th,tl;
  int temp,ar;
  int map=-1;
  int offset;
  int c=0;
  int jaddr,jaddr2=0,jaddr3,type;
  int agr=AGEN1+(i&1);
  u_int hr,reglist=0;
  // th:tl are the FTEMP data registers; s is the integer base register.
  th=get_reg(i_regs->regmap,FTEMP|64);
  tl=get_reg(i_regs->regmap,FTEMP);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  assert(tl>=0);
  assert(rs1[i]>0);
  assert(temp>=0);
  // Live host registers that a stub must preserve.
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
  {
    // Loads use a temporary register which we need to save
    reglist|=1<<temp;
  }
  // 'ar' is the register that carries the effective memory address.
  if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
    ar=temp;
  else // LWC1/LDC1
    ar=tl;
  //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
  //else c=(i_regs->wasconst>>s)&1;
  if(s>=0) c=(i_regs->wasconst>>s)&1;
  // Check cop1 unusable
  if(!cop1_usable) {
    signed char rs=get_reg(i_regs->regmap,CSREG);
    assert(rs>=0);
    emit_testimm(rs,0x20000000);
    jaddr=(int)out;
    emit_jeq(0);
    add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
    cop1_usable=1;  // only need to emit the check once per block
  }
  // For stores, first fetch the address of the source FPR's backing memory.
  if (opcode[i]==0x39) { // SWC1 (get float address)
    emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
  }
  if (opcode[i]==0x3D) { // SDC1 (get double address)
    emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
  }
  // Generate address + offset
  if(!using_tlb) {
    if(!c)
      emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
  }
  else
  {
    map=get_reg(i_regs->regmap,TLREG);
    assert(map>=0);
    reglist&=~(1<<map);
    if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
      map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
    }
    if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
      map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
    }
  }
  // For stores, read the FPR value through the address fetched above.
  if (opcode[i]==0x39) { // SWC1 (read float)
    emit_readword_indexed(0,tl,tl);
  }
  if (opcode[i]==0x3D) { // SDC1 (read double)
    emit_readword_indexed(4,tl,th);
    emit_readword_indexed(0,tl,tl);
  }
  // For loads, fetch the address of the destination FPR's backing memory.
  if (opcode[i]==0x31) { // LWC1 (get target address)
    emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
  }
  if (opcode[i]==0x35) { // LDC1 (get target address)
    emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
  }
  // Slow-path branch (jaddr2) for out-of-RAM targets.
  if(!using_tlb) {
    if(!c) {
      jaddr2=(int)out;
      emit_jno(0);
    }
    else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80000000+RAM_SIZE) {
      jaddr2=(int)out;
      emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
    }
    #ifdef DESTRUCTIVE_SHIFT
    if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
      if(!offset&&!c&&s>=0) emit_mov(s,ar);
    }
    #endif
  }else{
    if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
      do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
    }
    if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
      do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
    }
  }
  // Fast-path memory access itself.
  if (opcode[i]==0x31) { // LWC1
    //if(s>=0&&!c&&!offset) emit_mov(s,tl);
    //gen_tlb_addr_r(ar,map);
    //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
    #ifdef HOST_IMM_ADDR32
    if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
    else
    #endif
    emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
    type=LOADW_STUB;
  }
  if (opcode[i]==0x35) { // LDC1
    assert(th>=0);
    //if(s>=0&&!c&&!offset) emit_mov(s,tl);
    //gen_tlb_addr_r(ar,map);
    //emit_readword_indexed((int)rdram-0x80000000,tl,th);
    //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
    #ifdef HOST_IMM_ADDR32
    if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
    else
    #endif
    emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
    type=LOADD_STUB;
  }
  if (opcode[i]==0x39) { // SWC1
    //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
    emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
    type=STOREW_STUB;
  }
  if (opcode[i]==0x3D) { // SDC1
    assert(th>=0);
    //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
    //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
    emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
    type=STORED_STUB;
  }
  // Self-modifying-code check for the store forms.
  if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
    if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
      #ifndef DESTRUCTIVE_SHIFT
      temp=offset||c||s<0?ar:s;
      #endif
      #if defined(HOST_IMM8)
      int ir=get_reg(i_regs->regmap,INVCP);
      assert(ir>=0);
      emit_cmpmem_indexedsr12_reg(ir,temp,1);
      #else
      emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
      #endif
      #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
      emit_callne(invalidate_addr_reg[temp]);
      #else
      jaddr3=(int)out;
      emit_jne(0);
      add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
      #endif
    }
  }
  if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
  // For loads, store the fetched value into the FPR's backing memory.
  if (opcode[i]==0x31) { // LWC1 (write float)
    emit_writeword_indexed(tl,0,temp);
  }
  if (opcode[i]==0x35) { // LDC1 (write double)
    emit_writeword_indexed(th,4,temp);
    emit_writeword_indexed(tl,0,temp);
  }
  //if(opcode[i]==0x39)
  /*if(opcode[i]==0x39||opcode[i]==0x31)
  {
    emit_pusha();
        emit_readword((int)&last_count,ECX);
        if(get_reg(i_regs->regmap,CCREG)<0)
          emit_loadreg(CCREG,HOST_CCREG);
        emit_add(HOST_CCREG,ECX,HOST_CCREG);
        emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
        emit_writeword(HOST_CCREG,(int)&Count);
    emit_call((int)memdebug);
    emit_popa();
  }/**/
#else
  cop1_unusable(i, i_regs);
#endif
}
3769
/* Emit host code for a GTE (COP2) memory access: LWC2 (opcode 0x32, load a
 * word into a COP2 data register) or SWC2 (opcode 0x3a, store one to memory).
 * i      - index of the instruction being assembled
 * i_regs - register allocation state at this instruction */
void c2ls_assemble(int i,struct regstat *i_regs)
{
  int s,tl;                         // host regs holding rs1 and FTEMP
  int ar;                           // host reg holding the effective address
  int offset;                       // immediate displacement added to rs1
  int memtarget=0,c=0;              // c: address is a known constant; memtarget: that constant hits RAM
  int jaddr2=0,jaddr3,type;         // branch patch points + stub type for the slow path
  int agr=AGEN1+(i&1);              // address-generation temp, alternates with instruction parity
  int fastio_reg_override=0;        // reg holding the ram_offset-adjusted address, if any
  u_int hr,reglist=0;
  u_int copr=(source[i]>>16)&0x1f;  // COP2 data register number (rt field of the opcode)
  s=get_reg(i_regs->regmap,rs1[i]);
  tl=get_reg(i_regs->regmap,FTEMP);
  offset=imm[i];
  assert(rs1[i]>0);
  assert(tl>=0);
  assert(!using_tlb);

  // Build the set of live host registers that slow-path stubs must preserve.
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  // CCREG is handled separately by the stubs; don't save it twice.
  if(i_regs->regmap[HOST_CCREG]==CCREG)
    reglist&=~(1<<HOST_CCREG);

  // get the address
  if (opcode[i]==0x3a) { // SWC2
    ar=get_reg(i_regs->regmap,agr);
    if(ar<0) ar=get_reg(i_regs->regmap,-1); // fall back to the generic temp
    reglist|=1<<ar;
  } else { // LWC2
    ar=tl; // address and loaded value can share FTEMP
  }
  if(s>=0) c=(i_regs->wasconst>>s)&1;
  memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
  if (!offset&&!c&&s>=0) ar=s; // zero offset: use the rs1 register directly
  assert(ar>=0);

  if (opcode[i]==0x3a) { // SWC2
    cop2_get_dreg(copr,tl,HOST_TEMPREG); // fetch the COP2 reg value to be stored
    type=STOREW_STUB;
  }
  else
    type=LOADW_STUB;

  if(c&&!memtarget) {
    // Constant address known to miss RAM: unconditionally take the slow path.
    jaddr2=(int)out;
    emit_jmp(0); // inline_readstub/inline_writestub?
  }
  else {
    if(!c) {
      // Runtime address: emit the RAM-range check, branching to a stub on miss.
      jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
    }
    else if(ram_offset&&memtarget) {
      // Known-RAM constant address: pre-add the host RAM offset.
      emit_addimm(ar,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    if (opcode[i]==0x32) { // LWC2
      #ifdef HOST_IMM_ADDR32
      if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
      else
      #endif
      int a=ar;
      if(fastio_reg_override) a=fastio_reg_override;
      emit_readword_indexed(0,a,tl);
    }
    if (opcode[i]==0x3a) { // SWC2
      #ifdef DESTRUCTIVE_SHIFT
      if(!offset&&!c&&s>=0) emit_mov(s,ar);
      #endif
      int a=ar;
      if(fastio_reg_override) a=fastio_reg_override;
      emit_writeword_indexed(tl,0,a);
    }
  }
  if(jaddr2)
    add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
  if(opcode[i]==0x3a) // SWC2
  // Self-modifying-code check: if the store may hit a page holding translated
  // code, invalidate that code (skipped when the NO_SMC_CHECK hack is set).
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
#if defined(HOST_IMM8)
    // Small-immediate hosts keep the invalid_code pointer in a register.
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,ar,1);
#else
    emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
#endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[ar]);
    #else
    jaddr3=(int)out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
    #endif
  }
  if (opcode[i]==0x32) { // LWC2
    cop2_put_dreg(copr,tl,HOST_TEMPREG); // write the loaded word into the COP2 reg
  }
}
3867
#ifndef multdiv_assemble
/* Fallback stub used when no architecture-specific mult/div emitter is
 * provided.  The dynarec cannot produce code without one, so report the
 * problem (on stderr, where diagnostics belong) and abort.
 * i      - instruction index (unused here)
 * i_regs - register allocation state (unused here) */
void multdiv_assemble(int i,struct regstat *i_regs)
{
  fprintf(stderr, "Need multdiv_assemble for this architecture.\n");
  exit(1);
}
#endif
3875
3876 void mov_assemble(int i,struct regstat *i_regs)
3877 {
3878   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3879   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3880   if(rt1[i]) {
3881     signed char sh,sl,th,tl;
3882     th=get_reg(i_regs->regmap,rt1[i]|64);
3883     tl=get_reg(i_regs->regmap,rt1[i]);
3884     //assert(tl>=0);
3885     if(tl>=0) {
3886       sh=get_reg(i_regs->regmap,rs1[i]|64);
3887       sl=get_reg(i_regs->regmap,rs1[i]);
3888       if(sl>=0) emit_mov(sl,tl);
3889       else emit_loadreg(rs1[i],tl);
3890       if(th>=0) {
3891         if(sh>=0) emit_mov(sh,th);
3892         else emit_loadreg(rs1[i]|64,th);
3893       }
3894     }
3895   }
3896 }
3897
#ifndef fconv_assemble
/* Fallback stub used when no architecture-specific FP-conversion emitter
 * is provided.  The dynarec cannot produce code without one, so report the
 * problem (on stderr, where diagnostics belong) and abort.
 * i      - instruction index (unused here)
 * i_regs - register allocation state (unused here) */
void fconv_assemble(int i,struct regstat *i_regs)
{
  fprintf(stderr, "Need fconv_assemble for this architecture.\n");
  exit(1);
}
#endif
3905
#if 0
/* Disabled fallback stub for architectures lacking a float_assemble
 * implementation.  Compiled out -- presumably because this PSX core never
 * reaches FPU arithmetic assembly; TODO confirm before re-enabling. */
void float_assemble(int i,struct regstat *i_regs)
{
  printf("Need float_assemble for this architecture.\n");
  exit(1);
}
#endif
3913
3914 void syscall_assemble(int i,struct regstat *i_regs)
3915 {
3916   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3917   assert(ccreg==HOST_CCREG);
3918   assert(!is_delayslot);
3919   emit_movimm(start+i*4,EAX); // Get PC
3920   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3921   emit_jmp((int)jump_syscall_hle); // XXX
3922 }
3923
3924 void hlecall_assemble(int i,struct regstat *i_regs)
3925 {
3926   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3927   assert(ccreg==HOST_CCREG);
3928   assert(!is_delayslot);
3929   emit_movimm(start+i*4+4,0); // Get PC
3930   emit_movimm((int)psxHLEt[source[i]&7],1);
3931   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3932   emit_jmp((int)jump_hlecall);
3933 }
3934
3935 void intcall_assemble(int i,struct regstat *i_regs)
3936 {
3937   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3938   assert(ccreg==HOST_CCREG);
3939   assert(!is_delayslot);
3940   emit_movimm(start+i*4,0); // Get PC
3941   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3942   emit_jmp((int)jump_intcall);
3943 }
3944
3945 void ds_assemble(int i,struct regstat *i_regs)
3946 {
3947   speculate_register_values(i);
3948   is_delayslot=1;
3949   switch(itype[i]) {
3950     case ALU:
3951       alu_assemble(i,i_regs);break;
3952     case IMM16:
3953       imm16_assemble(i,i_regs);break;
3954     case SHIFT:
3955       shift_assemble(i,i_regs);break;
3956     case SHIFTIMM:
3957       shiftimm_assemble(i,i_regs);break;
3958     case LOAD:
3959       load_assemble(i,i_regs);break;
3960     case LOADLR:
3961       loadlr_assemble(i,i_regs);break;
3962     case STORE:
3963       store_assemble(i,i_regs);break;
3964     case STORELR:
3965       storelr_assemble(i,i_regs);break;
3966     case COP0:
3967       cop0_assemble(i,i_regs);break;
3968     case COP1:
3969       cop1_assemble(i,i_regs);break;
3970     case C1LS:
3971       c1ls_assemble(i,i_regs);break;
3972     case COP2: