1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26
27 #include "emu_if.h" //emulator interface
28
29 //#define DISASM
30 //#define assem_debug printf
31 //#define inv_debug printf
32 #define assem_debug(...)
33 #define inv_debug(...)
34
35 #ifdef __i386__
36 #include "assem_x86.h"
37 #endif
38 #ifdef __x86_64__
39 #include "assem_x64.h"
40 #endif
41 #ifdef __arm__
42 #include "assem_arm.h"
43 #endif
44
45 #ifdef __BLACKBERRY_QNX__
46 #undef __clear_cache
47 #define __clear_cache(start,end) msync(start, (size_t)((void*)end - (void*)start), MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
48 #elif defined(__MACH__)
49 #include <libkern/OSCacheControl.h>
50 #define __clear_cache mach_clear_cache
51 static void __clear_cache(void *start, void *end) {
52   size_t len = (char *)end - (char *)start;
53   sys_dcache_flush(start, len);
54   sys_icache_invalidate(start, len);
55 }
56 #endif
57
58 #define MAXBLOCK 4096
59 #define MAX_OUTPUT_BLOCK_SIZE 262144
60
61 struct regstat
62 {
63   signed char regmap_entry[HOST_REGS];
64   signed char regmap[HOST_REGS];
65   uint64_t was32;
66   uint64_t is32;
67   uint64_t wasdirty;
68   uint64_t dirty;
69   uint64_t u;
70   uint64_t uu;
71   u_int wasconst;
72   u_int isconst;
73   u_int loadedconst;             // host regs that have constants loaded
74   u_int waswritten;              // MIPS regs that were used as store base before
75 };
76
77 // note: asm depends on this layout
78 struct ll_entry
79 {
80   u_int vaddr;
81   u_int reg_sv_flags;
82   void *addr;
83   struct ll_entry *next;
84 };
85
86   u_int start;
87   u_int *source;
88   char insn[MAXBLOCK][10];
89   u_char itype[MAXBLOCK];
90   u_char opcode[MAXBLOCK];
91   u_char opcode2[MAXBLOCK];
92   u_char bt[MAXBLOCK];
93   u_char rs1[MAXBLOCK];
94   u_char rs2[MAXBLOCK];
95   u_char rt1[MAXBLOCK];
96   u_char rt2[MAXBLOCK];
97   u_char us1[MAXBLOCK];
98   u_char us2[MAXBLOCK];
99   u_char dep1[MAXBLOCK];
100   u_char dep2[MAXBLOCK];
101   u_char lt1[MAXBLOCK];
102   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
103   static uint64_t gte_rt[MAXBLOCK];
104   static uint64_t gte_unneeded[MAXBLOCK];
105   static u_int smrv[32]; // speculated MIPS register values
106   static u_int smrv_strong; // mask of regs that are likely to have correct values
107   static u_int smrv_weak; // same, but somewhat less likely
108   static u_int smrv_strong_next; // same, but after current insn executes
109   static u_int smrv_weak_next;
110   int imm[MAXBLOCK];
111   u_int ba[MAXBLOCK];
112   char likely[MAXBLOCK];
113   char is_ds[MAXBLOCK];
114   char ooo[MAXBLOCK];
115   uint64_t unneeded_reg[MAXBLOCK];
116   uint64_t unneeded_reg_upper[MAXBLOCK];
117   uint64_t branch_unneeded_reg[MAXBLOCK];
118   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
119   uint64_t p32[MAXBLOCK];
120   uint64_t pr32[MAXBLOCK];
121   signed char regmap_pre[MAXBLOCK][HOST_REGS];
122   static uint64_t current_constmap[HOST_REGS];
123   static uint64_t constmap[MAXBLOCK][HOST_REGS];
124   static struct regstat regs[MAXBLOCK];
125   static struct regstat branch_regs[MAXBLOCK];
126   signed char minimum_free_regs[MAXBLOCK];
127   u_int needed_reg[MAXBLOCK];
128   uint64_t requires_32bit[MAXBLOCK];
129   u_int wont_dirty[MAXBLOCK];
130   u_int will_dirty[MAXBLOCK];
131   int ccadj[MAXBLOCK];
132   int slen;
133   u_int instr_addr[MAXBLOCK];
134   u_int link_addr[MAXBLOCK][3];
135   int linkcount;
136   u_int stubs[MAXBLOCK*3][8];
137   int stubcount;
138   u_int literals[1024][2];
139   int literalcount;
140   int is_delayslot;
141   int cop1_usable;
142   u_char *out;
143   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
144   struct ll_entry *jump_out[4096];
145   struct ll_entry *jump_dirty[4096];
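  /* How these three lists are used (a summary inferred from the code below):
     jump_in[page]    - entry points of compiled blocks, indexed by the 4K page of
                        the MIPS virtual address; searched by get_addr().
     jump_out[page]   - locations of direct jumps that were linked into a page
                        (see add_link()); severed with kill_pointer() when the
                        page is invalidated.
     jump_dirty[page] - blocks that were invalidated but whose source may still be
                        unmodified; get_addr() and clean_blocks() re-validate them
                        with verify_dirty(). */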
146   u_int hash_table[65536][4]  __attribute__((aligned(16)));
147   char shadow[1048576]  __attribute__((aligned(16)));
148   void *copy;
149   int expirep;
150 #ifndef PCSX
151   u_int using_tlb;
152 #else
153   static const u_int using_tlb=0;
154 #endif
155   int new_dynarec_did_compile;
156   int new_dynarec_hacks;
157   u_int stop_after_jal;
158 #ifndef RAM_FIXED
159   static u_int ram_offset;
160 #else
161   static const u_int ram_offset=0;
162 #endif
163   extern u_char restore_candidate[512];
164   extern int cycle_count;
165
166   /* registers that may be allocated */
167   /* 1-31 gpr */
168 #define HIREG 32 // hi
169 #define LOREG 33 // lo
170 #define FSREG 34 // FPU status (FCSR)
171 #define CSREG 35 // Coprocessor status
172 #define CCREG 36 // Cycle count
173 #define INVCP 37 // Pointer to invalid_code
174 #define MMREG 38 // Pointer to memory_map
175 #define ROREG 39 // ram offset (if rdram!=0x80000000)
176 #define TEMPREG 40
177 #define FTEMP 40 // FPU temporary register
178 #define PTEMP 41 // Prefetch temporary register
179 #define TLREG 42 // TLB mapping offset
180 #define RHASH 43 // Return address hash
181 #define RHTBL 44 // Return address hash table address
182 #define RTEMP 45 // JR/JALR address register
183 #define MAXREG 45
184 #define AGEN1 46 // Address generation temporary register
185 #define AGEN2 47 // Address generation temporary register
186 #define MGEN1 48 // Maptable address generation temporary register
187 #define MGEN2 49 // Maptable address generation temporary register
188 #define BTREG 50 // Branch target temporary register
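  /* Note on register numbering as used throughout this file: 1-31 are the MIPS
     GPRs and 32+ are the internal registers defined above.  In regmap[] entries,
     bit 6 (reg|64) marks the upper 32-bit half of a 64-bit register, which is why
     lookups mask with &63 and the upper half is matched with ^64 / |64. */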
189
190   /* instruction types */
191 #define NOP 0     // No operation
192 #define LOAD 1    // Load
193 #define STORE 2   // Store
194 #define LOADLR 3  // Unaligned load
195 #define STORELR 4 // Unaligned store
196 #define MOV 5     // Move 
197 #define ALU 6     // Arithmetic/logic
198 #define MULTDIV 7 // Multiply/divide
199 #define SHIFT 8   // Shift by register
200 #define SHIFTIMM 9// Shift by immediate
201 #define IMM16 10  // 16-bit immediate
202 #define RJUMP 11  // Unconditional jump to register
203 #define UJUMP 12  // Unconditional jump
204 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
205 #define SJUMP 14  // Conditional branch (regimm format)
206 #define COP0 15   // Coprocessor 0
207 #define COP1 16   // Coprocessor 1
208 #define C1LS 17   // Coprocessor 1 load/store
209 #define FJUMP 18  // Conditional branch (floating point)
210 #define FLOAT 19  // Floating point unit
211 #define FCONV 20  // Convert integer to float
212 #define FCOMP 21  // Floating point compare (sets FSREG)
213 #define SYSCALL 22// SYSCALL
214 #define OTHER 23  // Other
215 #define SPAN 24   // Branch/delay slot spans 2 pages
216 #define NI 25     // Not implemented
217 #define HLECALL 26// PCSX fake opcodes for HLE
218 #define COP2 27   // Coprocessor 2 move
219 #define C2LS 28   // Coprocessor 2 load/store
220 #define C2OP 29   // Coprocessor 2 operation
221 #define INTCALL 30// Call interpreter to handle rare corner cases
222
223   /* stubs */
224 #define CC_STUB 1
225 #define FP_STUB 2
226 #define LOADB_STUB 3
227 #define LOADH_STUB 4
228 #define LOADW_STUB 5
229 #define LOADD_STUB 6
230 #define LOADBU_STUB 7
231 #define LOADHU_STUB 8
232 #define STOREB_STUB 9
233 #define STOREH_STUB 10
234 #define STOREW_STUB 11
235 #define STORED_STUB 12
236 #define STORELR_STUB 13
237 #define INVCODE_STUB 14
238
239   /* branch codes */
240 #define TAKEN 1
241 #define NOTTAKEN 2
242 #define NULLDS 3
243
244 // asm linkage
245 int new_recompile_block(int addr);
246 void *get_addr_ht(u_int vaddr);
247 void invalidate_block(u_int block);
248 void invalidate_addr(u_int addr);
249 void remove_hash(int vaddr);
250 void jump_vaddr();
251 void dyna_linker();
252 void dyna_linker_ds();
253 void verify_code();
254 void verify_code_vm();
255 void verify_code_ds();
256 void cc_interrupt();
257 void fp_exception();
258 void fp_exception_ds();
259 void jump_syscall();
260 void jump_syscall_hle();
261 void jump_eret();
262 void jump_hlecall();
263 void jump_intcall();
264 void new_dyna_leave();
265
266 // TLB
267 void TLBWI_new();
268 void TLBWR_new();
269 void read_nomem_new();
270 void read_nomemb_new();
271 void read_nomemh_new();
272 void read_nomemd_new();
273 void write_nomem_new();
274 void write_nomemb_new();
275 void write_nomemh_new();
276 void write_nomemd_new();
277 void write_rdram_new();
278 void write_rdramb_new();
279 void write_rdramh_new();
280 void write_rdramd_new();
281 extern u_int memory_map[1048576];
282
283 // Needed by assembler
284 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
285 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
286 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
287 void load_all_regs(signed char i_regmap[]);
288 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
289 void load_regs_entry(int t);
290 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
291
292 int tracedebug=0;
293
294 //#define DEBUG_CYCLE_COUNT 1
295
296 #define NO_CYCLE_PENALTY_THR 12
297
298 int cycle_multiplier; // 100 for 1.0
299
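// CLOCK_ADJUST scales a cycle count by cycle_multiplier/100, rounding half away
// from zero (s is +1 for non-negative x, -1 for negative x).  For example, with
// cycle_multiplier==150 (1.5x): CLOCK_ADJUST(1) == (150+50)/100 == 2 and
// CLOCK_ADJUST(-1) == (-150-50)/100 == -2.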
300 static int CLOCK_ADJUST(int x)
301 {
302   int s=(x>>31)|1;
303   return (x * cycle_multiplier + s * 50) / 100;
304 }
305
306 static void tlb_hacks()
307 {
308 #ifndef DISABLE_TLB
309   // Goldeneye hack
310   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
311   {
312     u_int addr;
313     int n;
314     switch (ROM_HEADER->Country_code&0xFF) 
315     {
316       case 0x45: // U
317         addr=0x34b30;
318         break;                   
319       case 0x4A: // J 
320         addr=0x34b70;    
321         break;    
322       case 0x50: // E 
323         addr=0x329f0;
324         break;                        
325       default: 
326         // Unknown country code
327         addr=0;
328         break;
329     }
330     u_int rom_addr=(u_int)rom;
331     #ifdef ROM_COPY
332     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
333     // in the lower 4G of memory to use this hack.  Copy it if necessary.
334     if((void *)rom>(void *)0xffffffff) {
335       munmap(ROM_COPY, 67108864);
336       if(mmap(ROM_COPY, 12582912,
337               PROT_READ | PROT_WRITE,
338               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
339               -1, 0) <= 0) {printf("mmap() failed\n");}
340       memcpy(ROM_COPY,rom,12582912);
341       rom_addr=(u_int)ROM_COPY;
342     }
343     #endif
344     if(addr) {
345       for(n=0x7F000;n<0x80000;n++) {
346         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
347       }
348     }
349   }
350 #endif
351 }
352
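// get_page() maps a MIPS virtual address to an index into the 4096-entry
// jump_in/jump_out tables.  In the PCSX build, for example, 0x80030000 and its
// RAM mirror 0x80230000 both reduce to 0x00030000 and land in page 0x30, while
// non-RAM addresses (BIOS, I/O) are folded into pages 2048-4095.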
353 static u_int get_page(u_int vaddr)
354 {
355 #ifndef PCSX
356   u_int page=(vaddr^0x80000000)>>12;
357 #else
358   u_int page=vaddr&~0xe0000000;
359   if (page < 0x1000000)
360     page &= ~0x0e00000; // RAM mirrors
361   page>>=12;
362 #endif
363 #ifndef DISABLE_TLB
364   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
365 #endif
366   if(page>2048) page=2048+(page&2047);
367   return page;
368 }
369
370 #ifndef PCSX
371 static u_int get_vpage(u_int vaddr)
372 {
373   u_int vpage=(vaddr^0x80000000)>>12;
374 #ifndef DISABLE_TLB
375   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
376 #endif
377   if(vpage>2048) vpage=2048+(vpage&2047);
378   return vpage;
379 }
380 #else
381 // no virtual mem in PCSX
382 static u_int get_vpage(u_int vaddr)
383 {
384   return get_page(vaddr);
385 }
386 #endif
387
388 // Get address from virtual address
389 // This is called from the recompiled JR/JALR instructions
390 void *get_addr(u_int vaddr)
391 {
392   u_int page=get_page(vaddr);
393   u_int vpage=get_vpage(vaddr);
394   struct ll_entry *head;
395   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
396   head=jump_in[page];
397   while(head!=NULL) {
398     if(head->vaddr==vaddr) {
399   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
400       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
401       ht_bin[3]=ht_bin[1];
402       ht_bin[2]=ht_bin[0];
403       ht_bin[1]=(int)head->addr;
404       ht_bin[0]=vaddr;
405       return head->addr;
406     }
407     head=head->next;
408   }
409   head=jump_dirty[vpage];
410   while(head!=NULL) {
411     if(head->vaddr==vaddr) {
412       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
413       // Don't restore blocks which are about to expire from the cache
414       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
415       if(verify_dirty(head->addr)) {
416         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
417         invalid_code[vaddr>>12]=0;
418         inv_code_start=inv_code_end=~0;
419 #ifndef DISABLE_TLB
420         memory_map[vaddr>>12]|=0x40000000;
421 #endif
422         if(vpage<2048) {
423 #ifndef DISABLE_TLB
424           if(tlb_LUT_r[vaddr>>12]) {
425             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
426             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
427           }
428 #endif
429           restore_candidate[vpage>>3]|=1<<(vpage&7);
430         }
431         else restore_candidate[page>>3]|=1<<(page&7);
432         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
433         if(ht_bin[0]==vaddr) {
434           ht_bin[1]=(int)head->addr; // Replace existing entry
435         }
436         else
437         {
438           ht_bin[3]=ht_bin[1];
439           ht_bin[2]=ht_bin[0];
440           ht_bin[1]=(int)head->addr;
441           ht_bin[0]=vaddr;
442         }
443         return head->addr;
444       }
445     }
446     head=head->next;
447   }
448   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
449   int r=new_recompile_block(vaddr);
450   if(r==0) return get_addr(vaddr);
451   // Execute in unmapped page, generate pagefault exception
452   Status|=2;
453   Cause=(vaddr<<31)|0x8;
454   EPC=(vaddr&1)?vaddr-5:vaddr;
455   BadVAddr=(vaddr&~1);
456   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
457   EntryHi=BadVAddr&0xFFFFE000;
458   return get_addr_ht(0x80000000);
459 }
460 // Look up address in hash table first
461 void *get_addr_ht(u_int vaddr)
462 {
463   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
464   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
465   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
466   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
467   return get_addr(vaddr);
468 }
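// Each hash_table bin holds two (vaddr, native address) pairs packed as
// {vaddr0, addr0, vaddr1, addr1}, most recently used first: get_addr() demotes
// the old pair when inserting, while check_addr() only fills empty slots so that
// frequently used entries are not evicted.  A minimal sketch of the insert
// pattern used above (illustrative only, never compiled):
#if 0
static void hash_insert_sketch(u_int vaddr, void *addr)
{
  int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
  ht_bin[3]=ht_bin[1]; // demote the previous most-recently-used pair
  ht_bin[2]=ht_bin[0];
  ht_bin[1]=(int)addr; // the new entry becomes the first pair
  ht_bin[0]=vaddr;
}
#endif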
469
470 void clear_all_regs(signed char regmap[])
471 {
472   int hr;
473   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
474 }
475
476 signed char get_reg(signed char regmap[],int r)
477 {
478   int hr;
479   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
480   return -1;
481 }
482
483 // Find a register that is available for two consecutive cycles
484 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
485 {
486   int hr;
487   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
488   return -1;
489 }
490
491 int count_free_regs(signed char regmap[])
492 {
493   int count=0;
494   int hr;
495   for(hr=0;hr<HOST_REGS;hr++)
496   {
497     if(hr!=EXCLUDE_REG) {
498       if(regmap[hr]<0) count++;
499     }
500   }
501   return count;
502 }
503
504 void dirty_reg(struct regstat *cur,signed char reg)
505 {
506   int hr;
507   if(!reg) return;
508   for (hr=0;hr<HOST_REGS;hr++) {
509     if((cur->regmap[hr]&63)==reg) {
510       cur->dirty|=1<<hr;
511     }
512   }
513 }
514
515 // If we dirty the lower half of a 64 bit register which is now being
516 // sign-extended, we need to dump the upper half.
517 // Note: Do this only after completion of the instruction, because
518 // some instructions may need to read the full 64-bit value even if
519 // overwriting it (eg SLTI, DSRA32).
520 static void flush_dirty_uppers(struct regstat *cur)
521 {
522   int hr,reg;
523   for (hr=0;hr<HOST_REGS;hr++) {
524     if((cur->dirty>>hr)&1) {
525       reg=cur->regmap[hr];
526       if(reg>=64) 
527         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
528     }
529   }
530 }
531
532 void set_const(struct regstat *cur,signed char reg,uint64_t value)
533 {
534   int hr;
535   if(!reg) return;
536   for (hr=0;hr<HOST_REGS;hr++) {
537     if(cur->regmap[hr]==reg) {
538       cur->isconst|=1<<hr;
539       current_constmap[hr]=value;
540     }
541     else if((cur->regmap[hr]^64)==reg) {
542       cur->isconst|=1<<hr;
543       current_constmap[hr]=value>>32;
544     }
545   }
546 }
547
548 void clear_const(struct regstat *cur,signed char reg)
549 {
550   int hr;
551   if(!reg) return;
552   for (hr=0;hr<HOST_REGS;hr++) {
553     if((cur->regmap[hr]&63)==reg) {
554       cur->isconst&=~(1<<hr);
555     }
556   }
557 }
558
559 int is_const(struct regstat *cur,signed char reg)
560 {
561   int hr;
562   if(reg<0) return 0;
563   if(!reg) return 1;
564   for (hr=0;hr<HOST_REGS;hr++) {
565     if((cur->regmap[hr]&63)==reg) {
566       return (cur->isconst>>hr)&1;
567     }
568   }
569   return 0;
570 }
571 uint64_t get_const(struct regstat *cur,signed char reg)
572 {
573   int hr;
574   if(!reg) return 0;
575   for (hr=0;hr<HOST_REGS;hr++) {
576     if(cur->regmap[hr]==reg) {
577       return current_constmap[hr];
578     }
579   }
580   SysPrintf("Unknown constant in r%d\n",reg);
581   exit(1);
582 }
583
584 // Least soon needed registers
585 // Look at the next ten instructions and see which registers
586 // will be used.  Try not to reallocate these.
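// On return, hsn[r] holds the distance (in instructions) to the next use of
// register r: 0 means used by the current instruction, larger values mean needed
// later; entries for registers not seen in the lookahead window are left untouched.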
587 void lsn(u_char hsn[], int i, int *preferred_reg)
588 {
589   int j;
590   int b=-1;
591   for(j=0;j<9;j++)
592   {
593     if(i+j>=slen) {
594       j=slen-i-1;
595       break;
596     }
597     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
598     {
599       // Don't go past an unconditional jump
600       j++;
601       break;
602     }
603   }
604   for(;j>=0;j--)
605   {
606     if(rs1[i+j]) hsn[rs1[i+j]]=j;
607     if(rs2[i+j]) hsn[rs2[i+j]]=j;
608     if(rt1[i+j]) hsn[rt1[i+j]]=j;
609     if(rt2[i+j]) hsn[rt2[i+j]]=j;
610     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
611       // Stores can allocate zero
612       hsn[rs1[i+j]]=j;
613       hsn[rs2[i+j]]=j;
614     }
615     // On some architectures stores need invc_ptr
616     #if defined(HOST_IMM8)
617     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
618       hsn[INVCP]=j;
619     }
620     #endif
621     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
622     {
623       hsn[CCREG]=j;
624       b=j;
625     }
626   }
627   if(b>=0)
628   {
629     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
630     {
631       // Follow first branch
632       int t=(ba[i+b]-start)>>2;
633       j=7-b;if(t+j>=slen) j=slen-t-1;
634       for(;j>=0;j--)
635       {
636         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
637         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
638         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
639         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
640       }
641     }
642     // TODO: preferred register based on backward branch
643   }
644   // Delay slot should preferably not overwrite branch conditions or cycle count
645   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
646     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
647     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
648     hsn[CCREG]=1;
649     // ...or hash tables
650     hsn[RHASH]=1;
651     hsn[RHTBL]=1;
652   }
653   // Coprocessor load/store needs FTEMP, even if not declared
654   if(itype[i]==C1LS||itype[i]==C2LS) {
655     hsn[FTEMP]=0;
656   }
657   // Load L/R also uses FTEMP as a temporary register
658   if(itype[i]==LOADLR) {
659     hsn[FTEMP]=0;
660   }
661   // Also SWL/SWR/SDL/SDR
662   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
663     hsn[FTEMP]=0;
664   }
665   // Don't remove the TLB registers either
666   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS || itype[i]==C2LS) {
667     hsn[TLREG]=0;
668   }
669   // Don't remove the miniht registers
670   if(itype[i]==UJUMP||itype[i]==RJUMP)
671   {
672     hsn[RHASH]=0;
673     hsn[RHTBL]=0;
674   }
675 }
676
677 // We only want to allocate registers if we're going to use them again soon
678 int needed_again(int r, int i)
679 {
680   int j;
681   int b=-1;
682   int rn=10;
683   
684   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
685   {
686     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
687       return 0; // Don't need any registers if exiting the block
688   }
689   for(j=0;j<9;j++)
690   {
691     if(i+j>=slen) {
692       j=slen-i-1;
693       break;
694     }
695     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
696     {
697       // Don't go past an unconditional jump
698       j++;
699       break;
700     }
701     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
702     {
703       break;
704     }
705   }
706   for(;j>=1;j--)
707   {
708     if(rs1[i+j]==r) rn=j;
709     if(rs2[i+j]==r) rn=j;
710     if((unneeded_reg[i+j]>>r)&1) rn=10;
711     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
712     {
713       b=j;
714     }
715   }
716   /*
717   if(b>=0)
718   {
719     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
720     {
721       // Follow first branch
722       int o=rn;
723       int t=(ba[i+b]-start)>>2;
724       j=7-b;if(t+j>=slen) j=slen-t-1;
725       for(;j>=0;j--)
726       {
727         if(!((unneeded_reg[t+j]>>r)&1)) {
728           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
729           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
730         }
731         else rn=o;
732       }
733     }
734   }*/
735   if(rn<10) return 1;
736   return 0;
737 }
738
739 // Try to match register allocations at the end of a loop with those
740 // at the beginning
741 int loop_reg(int i, int r, int hr)
742 {
743   int j,k;
744   for(j=0;j<9;j++)
745   {
746     if(i+j>=slen) {
747       j=slen-i-1;
748       break;
749     }
750     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
751     {
752       // Don't go past an unconditional jump
753       j++;
754       break;
755     }
756   }
757   k=0;
758   if(i>0){
759     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
760       k--;
761   }
762   for(;k<j;k++)
763   {
764     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
765     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
766     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
767     {
768       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
769       {
770         int t=(ba[i+k]-start)>>2;
771         int reg=get_reg(regs[t].regmap_entry,r);
772         if(reg>=0) return reg;
773         //reg=get_reg(regs[t+1].regmap_entry,r);
774         //if(reg>=0) return reg;
775       }
776     }
777   }
778   return hr;
779 }
780
781
782 // Allocate every register, preserving source/target regs
783 void alloc_all(struct regstat *cur,int i)
784 {
785   int hr;
786   
787   for(hr=0;hr<HOST_REGS;hr++) {
788     if(hr!=EXCLUDE_REG) {
789       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
790          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
791       {
792         cur->regmap[hr]=-1;
793         cur->dirty&=~(1<<hr);
794       }
795       // Don't need zeros
796       if((cur->regmap[hr]&63)==0)
797       {
798         cur->regmap[hr]=-1;
799         cur->dirty&=~(1<<hr);
800       }
801     }
802   }
803 }
804
805 #ifndef FORCE32
806 void div64(int64_t dividend,int64_t divisor)
807 {
808   lo=dividend/divisor;
809   hi=dividend%divisor;
810   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
811   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
812 }
813 void divu64(uint64_t dividend,uint64_t divisor)
814 {
815   lo=dividend/divisor;
816   hi=dividend%divisor;
817   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
818   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
819 }
820
821 void mult64(uint64_t m1,uint64_t m2)
822 {
823    unsigned long long int op1, op2, op3, op4;
824    unsigned long long int result1, result2, result3, result4;
825    unsigned long long int temp1, temp2, temp3, temp4;
826    int sign = 0;
827    
828    if ((int64_t)m1 < 0) /* params are unsigned; compare as signed */
829      {
830     op2 = -m1;
831     sign = 1 - sign;
832      }
833    else op2 = m1;
834    if ((int64_t)m2 < 0)
835      {
836     op4 = -m2;
837     sign = 1 - sign;
838      }
839    else op4 = m2;
840    
841    op1 = op2 & 0xFFFFFFFF;
842    op2 = (op2 >> 32) & 0xFFFFFFFF;
843    op3 = op4 & 0xFFFFFFFF;
844    op4 = (op4 >> 32) & 0xFFFFFFFF;
845    
846    temp1 = op1 * op3;
847    temp2 = (temp1 >> 32) + op1 * op4;
848    temp3 = op2 * op3;
849    temp4 = (temp3 >> 32) + op2 * op4;
850    
851    result1 = temp1 & 0xFFFFFFFF;
852    result2 = temp2 + (temp3 & 0xFFFFFFFF);
853    result3 = (result2 >> 32) + temp4;
854    result4 = (result3 >> 32);
855    
856    lo = result1 | (result2 << 32);
857    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
858    if (sign)
859      {
860     hi = ~hi;
861     if (!lo) hi++;
862     else lo = ~lo + 1;
863      }
864 }
865
866 void multu64(uint64_t m1,uint64_t m2)
867 {
868    unsigned long long int op1, op2, op3, op4;
869    unsigned long long int result1, result2, result3, result4;
870    unsigned long long int temp1, temp2, temp3, temp4;
871    
872    op1 = m1 & 0xFFFFFFFF;
873    op2 = (m1 >> 32) & 0xFFFFFFFF;
874    op3 = m2 & 0xFFFFFFFF;
875    op4 = (m2 >> 32) & 0xFFFFFFFF;
876    
877    temp1 = op1 * op3;
878    temp2 = (temp1 >> 32) + op1 * op4;
879    temp3 = op2 * op3;
880    temp4 = (temp3 >> 32) + op2 * op4;
881    
882    result1 = temp1 & 0xFFFFFFFF;
883    result2 = temp2 + (temp3 & 0xFFFFFFFF);
884    result3 = (result2 >> 32) + temp4;
885    result4 = (result3 >> 32);
886    
887    lo = result1 | (result2 << 32);
888    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
889    
890   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
891   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
892 }
893
894 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
895 {
896   if(bits) {
897     original<<=64-bits;
898     original>>=64-bits;
899     loaded<<=bits;
900     original|=loaded;
901   }
902   else original=loaded;
903   return original;
904 }
905 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
906 {
907   if(bits^56) {
908     original>>=64-(bits^56);
909     original<<=64-(bits^56);
910     loaded>>=bits^56;
911     original|=loaded;
912   }
913   else original=loaded;
914   return original;
915 }
916 #endif
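// Worked example for the unaligned-load helpers above: ldl_merge() keeps the low
// `bits` bits of the original register and ORs in the newly loaded value shifted
// left by `bits`, e.g. ldl_merge(0x1122334455667788, 0xAABBCCDDEEFF0011, 8)
// == 0x88 | (0xAABBCCDDEEFF0011 << 8) == 0xBBCCDDEEFF001188.  ldr_merge() is the
// mirror image, keeping the high bits of the original and shifting the loaded
// value right instead.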
917
918 #ifdef __i386__
919 #include "assem_x86.c"
920 #endif
921 #ifdef __x86_64__
922 #include "assem_x64.c"
923 #endif
924 #ifdef __arm__
925 #include "assem_arm.c"
926 #endif
927
928 // Add virtual address mapping to linked list
929 void ll_add(struct ll_entry **head,int vaddr,void *addr)
930 {
931   struct ll_entry *new_entry;
932   new_entry=malloc(sizeof(struct ll_entry));
933   assert(new_entry!=NULL);
934   new_entry->vaddr=vaddr;
935   new_entry->reg_sv_flags=0;
936   new_entry->addr=addr;
937   new_entry->next=*head;
938   *head=new_entry;
939 }
940
941 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
942 {
943   ll_add(head,vaddr,addr);
944   (*head)->reg_sv_flags=reg_sv_flags;
945 }
946
947 // Check if an address is already compiled
948 // but don't return addresses which are about to expire from the cache
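// (The shifted subtractions below measure how far a block lies ahead of the
//  current output pointer within the (1<<TARGET_SIZE_2)-byte translation cache;
//  code just ahead of `out` is what gets reclaimed next, so such blocks are
//  treated as about to expire and skipped.)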
949 void *check_addr(u_int vaddr)
950 {
951   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
952   if(ht_bin[0]==vaddr) {
953     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
954       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
955   }
956   if(ht_bin[2]==vaddr) {
957     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
958       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
959   }
960   u_int page=get_page(vaddr);
961   struct ll_entry *head;
962   head=jump_in[page];
963   while(head!=NULL) {
964     if(head->vaddr==vaddr) {
965       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
966         // Update existing entry with current address
967         if(ht_bin[0]==vaddr) {
968           ht_bin[1]=(int)head->addr;
969           return head->addr;
970         }
971         if(ht_bin[2]==vaddr) {
972           ht_bin[3]=(int)head->addr;
973           return head->addr;
974         }
975         // Insert into hash table with low priority.
976         // Don't evict existing entries, as they are probably
977         // addresses that are being accessed frequently.
978         if(ht_bin[0]==-1) {
979           ht_bin[1]=(int)head->addr;
980           ht_bin[0]=vaddr;
981         }else if(ht_bin[2]==-1) {
982           ht_bin[3]=(int)head->addr;
983           ht_bin[2]=vaddr;
984         }
985         return head->addr;
986       }
987     }
988     head=head->next;
989   }
990   return 0;
991 }
992
993 void remove_hash(int vaddr)
994 {
995   //printf("remove hash: %x\n",vaddr);
996   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
997   if(ht_bin[2]==vaddr) {
998     ht_bin[2]=ht_bin[3]=-1;
999   }
1000   if(ht_bin[0]==vaddr) {
1001     ht_bin[0]=ht_bin[2];
1002     ht_bin[1]=ht_bin[3];
1003     ht_bin[2]=ht_bin[3]=-1;
1004   }
1005 }
1006
1007 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1008 {
1009   struct ll_entry *next;
1010   while(*head) {
1011     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1012        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1013     {
1014       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1015       remove_hash((*head)->vaddr);
1016       next=(*head)->next;
1017       free(*head);
1018       *head=next;
1019     }
1020     else
1021     {
1022       head=&((*head)->next);
1023     }
1024   }
1025 }
1026
1027 // Remove all entries from linked list
1028 void ll_clear(struct ll_entry **head)
1029 {
1030   struct ll_entry *cur;
1031   struct ll_entry *next;
1032   if(cur=*head) {
1033     *head=0;
1034     while(cur) {
1035       next=cur->next;
1036       free(cur);
1037       cur=next;
1038     }
1039   }
1040 }
1041
1042 // Dereference the pointers and remove them if they match
1043 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1044 {
1045   while(head) {
1046     int ptr=get_pointer(head->addr);
1047     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1048     if(((ptr>>shift)==(addr>>shift)) ||
1049        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1050     {
1051       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1052       u_int host_addr=(u_int)kill_pointer(head->addr);
1053       #ifdef __arm__
1054         needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1055       #endif
1056     }
1057     head=head->next;
1058   }
1059 }
1060
1061 // This is called when we write to a compiled block (see do_invstub)
1062 void invalidate_page(u_int page)
1063 {
1064   struct ll_entry *head;
1065   struct ll_entry *next;
1066   head=jump_in[page];
1067   jump_in[page]=0;
1068   while(head!=NULL) {
1069     inv_debug("INVALIDATE: %x\n",head->vaddr);
1070     remove_hash(head->vaddr);
1071     next=head->next;
1072     free(head);
1073     head=next;
1074   }
1075   head=jump_out[page];
1076   jump_out[page]=0;
1077   while(head!=NULL) {
1078     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1079     u_int host_addr=(u_int)kill_pointer(head->addr);
1080     #ifdef __arm__
1081       needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1082     #endif
1083     next=head->next;
1084     free(head);
1085     head=next;
1086   }
1087 }
1088
1089 static void invalidate_block_range(u_int block, u_int first, u_int last)
1090 {
1091   u_int page=get_page(block<<12);
1092   //printf("first=%d last=%d\n",first,last);
1093   invalidate_page(page);
1094   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1095   assert(last<page+5);
1096   // Invalidate the adjacent pages if a block crosses a 4K boundary
1097   while(first<page) {
1098     invalidate_page(first);
1099     first++;
1100   }
1101   for(first=page+1;first<last;first++) {
1102     invalidate_page(first);
1103   }
1104   #ifdef __arm__
1105     do_clear_cache();
1106   #endif
1107   
1108   // Don't trap writes
1109   invalid_code[block]=1;
1110 #ifndef DISABLE_TLB
1111   // If there is a valid TLB entry for this page, remove write protect
1112   if(tlb_LUT_w[block]) {
1113     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1114     // CHECK: Is this right?
1115     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1116     u_int real_block=tlb_LUT_w[block]>>12;
1117     invalid_code[real_block]=1;
1118     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1119   }
1120   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1121 #endif
1122
1123   #ifdef USE_MINI_HT
1124   memset(mini_ht,-1,sizeof(mini_ht));
1125   #endif
1126 }
1127
1128 void invalidate_block(u_int block)
1129 {
1130   u_int page=get_page(block<<12);
1131   u_int vpage=get_vpage(block<<12);
1132   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1133   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1134   u_int first,last;
1135   first=last=page;
1136   struct ll_entry *head;
1137   head=jump_dirty[vpage];
1138   //printf("page=%d vpage=%d\n",page,vpage);
1139   while(head!=NULL) {
1140     u_int start,end;
1141     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1142       get_bounds((int)head->addr,&start,&end);
1143       //printf("start: %x end: %x\n",start,end);
1144       if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE) {
1145         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1146           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1147           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1148         }
1149       }
1150 #ifndef DISABLE_TLB
1151       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1152         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1153           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1154           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1155         }
1156       }
1157 #endif
1158     }
1159     head=head->next;
1160   }
1161   invalidate_block_range(block,first,last);
1162 }
1163
1164 void invalidate_addr(u_int addr)
1165 {
1166 #ifdef PCSX
1167   //static int rhits;
1168   // this check is done by the caller
1169   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
1170   u_int page=get_vpage(addr);
1171   if(page<2048) { // RAM
1172     struct ll_entry *head;
1173     u_int addr_min=~0, addr_max=0;
1174     u_int mask=RAM_SIZE-1;
1175     u_int addr_main=0x80000000|(addr&mask);
1176     int pg1;
1177     inv_code_start=addr_main&~0xfff;
1178     inv_code_end=addr_main|0xfff;
1179     pg1=page;
1180     if (pg1>0) {
1181       // must check previous page too because of spans..
1182       pg1--;
1183       inv_code_start-=0x1000;
1184     }
1185     for(;pg1<=page;pg1++) {
1186       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1187         u_int start,end;
1188         get_bounds((int)head->addr,&start,&end);
1189         if(ram_offset) {
1190           start-=ram_offset;
1191           end-=ram_offset;
1192         }
1193         if(start<=addr_main&&addr_main<end) {
1194           if(start<addr_min) addr_min=start;
1195           if(end>addr_max) addr_max=end;
1196         }
1197         else if(addr_main<start) {
1198           if(start<inv_code_end)
1199             inv_code_end=start-1;
1200         }
1201         else {
1202           if(end>inv_code_start)
1203             inv_code_start=end;
1204         }
1205       }
1206     }
1207     if (addr_min!=~0) {
1208       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1209       inv_code_start=inv_code_end=~0;
1210       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1211       return;
1212     }
1213     else {
1214       inv_code_start=(addr&~mask)|(inv_code_start&mask);
1215       inv_code_end=(addr&~mask)|(inv_code_end&mask);
1216       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
1217       return;
1218     }
1219   }
1220 #endif
1221   invalidate_block(addr>>12);
1222 }
1223
1224 // This is called when loading a save state.
1225 // Anything could have changed, so invalidate everything.
1226 void invalidate_all_pages()
1227 {
1228   u_int page,n;
1229   for(page=0;page<4096;page++)
1230     invalidate_page(page);
1231   for(page=0;page<1048576;page++)
1232     if(!invalid_code[page]) {
1233       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1234       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1235     }
1236   #ifdef __arm__
1237   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1238   #endif
1239   #ifdef USE_MINI_HT
1240   memset(mini_ht,-1,sizeof(mini_ht));
1241   #endif
1242   #ifndef DISABLE_TLB
1243   // TLB
1244   for(page=0;page<0x100000;page++) {
1245     if(tlb_LUT_r[page]) {
1246       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1247       if(!tlb_LUT_w[page]||!invalid_code[page])
1248         memory_map[page]|=0x40000000; // Write protect
1249     }
1250     else memory_map[page]=-1;
1251     if(page==0x80000) page=0xC0000;
1252   }
1253   tlb_hacks();
1254   #endif
1255 }
1256
1257 // Add an entry to jump_out after making a link
1258 void add_link(u_int vaddr,void *src)
1259 {
1260   u_int page=get_page(vaddr);
1261   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1262   int *ptr=(int *)(src+4);
1263   assert((*ptr&0x0fff0000)==0x059f0000);
1264   ll_add(jump_out+page,vaddr,src);
1265   //int ptr=get_pointer(src);
1266   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1267 }
1268
1269 // If a code block was found to be unmodified (bit was set in
1270 // restore_candidate) and it remains unmodified (bit is clear
1271 // in invalid_code) then move the entries for that 4K page from
1272 // the dirty list to the clean list.
1273 void clean_blocks(u_int page)
1274 {
1275   struct ll_entry *head;
1276   inv_debug("INV: clean_blocks page=%d\n",page);
1277   head=jump_dirty[page];
1278   while(head!=NULL) {
1279     if(!invalid_code[head->vaddr>>12]) {
1280       // Don't restore blocks which are about to expire from the cache
1281       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1282         u_int start,end;
1283         if(verify_dirty((int)head->addr)) {
1284           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1285           u_int i;
1286           u_int inv=0;
1287           get_bounds((int)head->addr,&start,&end);
1288           if(start-(u_int)rdram<RAM_SIZE) {
1289             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1290               inv|=invalid_code[i];
1291             }
1292           }
1293 #ifndef DISABLE_TLB
1294           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1295             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1296             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1297             if(addr<start||addr>=end) inv=1;
1298           }
1299 #endif
1300           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1301             inv=1;
1302           }
1303           if(!inv) {
1304             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1305             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1306               u_int ppage=page;
1307 #ifndef DISABLE_TLB
1308               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1309 #endif
1310               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1311               //printf("page=%x, addr=%x\n",page,head->vaddr);
1312               //assert(head->vaddr>>12==(page|0x80000));
1313               ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
1314               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1315               if(ht_bin[0]==head->vaddr) {
1316                 ht_bin[1]=(int)clean_addr; // Replace existing entry
1317               }
1318               if(ht_bin[2]==head->vaddr) {
1319                 ht_bin[3]=(int)clean_addr; // Replace existing entry
1320               }
1321             }
1322           }
1323         }
1324       }
1325     }
1326     head=head->next;
1327   }
1328 }
1329
1330
1331 void mov_alloc(struct regstat *current,int i)
1332 {
1333   // Note: Don't need to actually alloc the source registers
1334   if((~current->is32>>rs1[i])&1) {
1335     //alloc_reg64(current,i,rs1[i]);
1336     alloc_reg64(current,i,rt1[i]);
1337     current->is32&=~(1LL<<rt1[i]);
1338   } else {
1339     //alloc_reg(current,i,rs1[i]);
1340     alloc_reg(current,i,rt1[i]);
1341     current->is32|=(1LL<<rt1[i]);
1342   }
1343   clear_const(current,rs1[i]);
1344   clear_const(current,rt1[i]);
1345   dirty_reg(current,rt1[i]);
1346 }
1347
1348 void shiftimm_alloc(struct regstat *current,int i)
1349 {
1350   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1351   {
1352     if(rt1[i]) {
1353       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1354       else lt1[i]=rs1[i];
1355       alloc_reg(current,i,rt1[i]);
1356       current->is32|=1LL<<rt1[i];
1357       dirty_reg(current,rt1[i]);
1358       if(is_const(current,rs1[i])) {
1359         int v=get_const(current,rs1[i]);
1360         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1361         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1362         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1363       }
1364       else clear_const(current,rt1[i]);
1365     }
1366   }
1367   else
1368   {
1369     clear_const(current,rs1[i]);
1370     clear_const(current,rt1[i]);
1371   }
1372
1373   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1374   {
1375     if(rt1[i]) {
1376       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1377       alloc_reg64(current,i,rt1[i]);
1378       current->is32&=~(1LL<<rt1[i]);
1379       dirty_reg(current,rt1[i]);
1380     }
1381   }
1382   if(opcode2[i]==0x3c) // DSLL32
1383   {
1384     if(rt1[i]) {
1385       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1386       alloc_reg64(current,i,rt1[i]);
1387       current->is32&=~(1LL<<rt1[i]);
1388       dirty_reg(current,rt1[i]);
1389     }
1390   }
1391   if(opcode2[i]==0x3e) // DSRL32
1392   {
1393     if(rt1[i]) {
1394       alloc_reg64(current,i,rs1[i]);
1395       if(imm[i]==32) {
1396         alloc_reg64(current,i,rt1[i]);
1397         current->is32&=~(1LL<<rt1[i]);
1398       } else {
1399         alloc_reg(current,i,rt1[i]);
1400         current->is32|=1LL<<rt1[i];
1401       }
1402       dirty_reg(current,rt1[i]);
1403     }
1404   }
1405   if(opcode2[i]==0x3f) // DSRA32
1406   {
1407     if(rt1[i]) {
1408       alloc_reg64(current,i,rs1[i]);
1409       alloc_reg(current,i,rt1[i]);
1410       current->is32|=1LL<<rt1[i];
1411       dirty_reg(current,rt1[i]);
1412     }
1413   }
1414 }
1415
1416 void shift_alloc(struct regstat *current,int i)
1417 {
1418   if(rt1[i]) {
1419     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1420     {
1421       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1422       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1423       alloc_reg(current,i,rt1[i]);
1424       if(rt1[i]==rs2[i]) {
1425         alloc_reg_temp(current,i,-1);
1426         minimum_free_regs[i]=1;
1427       }
1428       current->is32|=1LL<<rt1[i];
1429     } else { // DSLLV/DSRLV/DSRAV
1430       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1431       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1432       alloc_reg64(current,i,rt1[i]);
1433       current->is32&=~(1LL<<rt1[i]);
1434       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1435       {
1436         alloc_reg_temp(current,i,-1);
1437         minimum_free_regs[i]=1;
1438       }
1439     }
1440     clear_const(current,rs1[i]);
1441     clear_const(current,rs2[i]);
1442     clear_const(current,rt1[i]);
1443     dirty_reg(current,rt1[i]);
1444   }
1445 }
1446
1447 void alu_alloc(struct regstat *current,int i)
1448 {
1449   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1450     if(rt1[i]) {
1451       if(rs1[i]&&rs2[i]) {
1452         alloc_reg(current,i,rs1[i]);
1453         alloc_reg(current,i,rs2[i]);
1454       }
1455       else {
1456         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1457         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1458       }
1459       alloc_reg(current,i,rt1[i]);
1460     }
1461     current->is32|=1LL<<rt1[i];
1462   }
1463   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1464     if(rt1[i]) {
1465       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1466       {
1467         alloc_reg64(current,i,rs1[i]);
1468         alloc_reg64(current,i,rs2[i]);
1469         alloc_reg(current,i,rt1[i]);
1470       } else {
1471         alloc_reg(current,i,rs1[i]);
1472         alloc_reg(current,i,rs2[i]);
1473         alloc_reg(current,i,rt1[i]);
1474       }
1475     }
1476     current->is32|=1LL<<rt1[i];
1477   }
1478   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1479     if(rt1[i]) {
1480       if(rs1[i]&&rs2[i]) {
1481         alloc_reg(current,i,rs1[i]);
1482         alloc_reg(current,i,rs2[i]);
1483       }
1484       else
1485       {
1486         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1487         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1488       }
1489       alloc_reg(current,i,rt1[i]);
1490       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1491       {
1492         if(!((current->uu>>rt1[i])&1)) {
1493           alloc_reg64(current,i,rt1[i]);
1494         }
1495         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1496           if(rs1[i]&&rs2[i]) {
1497             alloc_reg64(current,i,rs1[i]);
1498             alloc_reg64(current,i,rs2[i]);
1499           }
1500           else
1501           {
1502             // Is it really worth it to keep 64-bit values in registers?
1503             #ifdef NATIVE_64BIT
1504             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1505             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1506             #endif
1507           }
1508         }
1509         current->is32&=~(1LL<<rt1[i]);
1510       } else {
1511         current->is32|=1LL<<rt1[i];
1512       }
1513     }
1514   }
1515   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1516     if(rt1[i]) {
1517       if(rs1[i]&&rs2[i]) {
1518         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1519           alloc_reg64(current,i,rs1[i]);
1520           alloc_reg64(current,i,rs2[i]);
1521           alloc_reg64(current,i,rt1[i]);
1522         } else {
1523           alloc_reg(current,i,rs1[i]);
1524           alloc_reg(current,i,rs2[i]);
1525           alloc_reg(current,i,rt1[i]);
1526         }
1527       }
1528       else {
1529         alloc_reg(current,i,rt1[i]);
1530         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1531           // DADD used as move, or zeroing
1532           // If we have a 64-bit source, then make the target 64 bits too
1533           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1534             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1535             alloc_reg64(current,i,rt1[i]);
1536           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1537             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1538             alloc_reg64(current,i,rt1[i]);
1539           }
1540           if(opcode2[i]>=0x2e&&rs2[i]) {
1541             // DSUB used as negation - 64-bit result
1542             // If we have a 32-bit register, extend it to 64 bits
1543             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1544             alloc_reg64(current,i,rt1[i]);
1545           }
1546         }
1547       }
1548       if(rs1[i]&&rs2[i]) {
1549         current->is32&=~(1LL<<rt1[i]);
1550       } else if(rs1[i]) {
1551         current->is32&=~(1LL<<rt1[i]);
1552         if((current->is32>>rs1[i])&1)
1553           current->is32|=1LL<<rt1[i];
1554       } else if(rs2[i]) {
1555         current->is32&=~(1LL<<rt1[i]);
1556         if((current->is32>>rs2[i])&1)
1557           current->is32|=1LL<<rt1[i];
1558       } else {
1559         current->is32|=1LL<<rt1[i];
1560       }
1561     }
1562   }
1563   clear_const(current,rs1[i]);
1564   clear_const(current,rs2[i]);
1565   clear_const(current,rt1[i]);
1566   dirty_reg(current,rt1[i]);
1567 }
1568
1569 void imm16_alloc(struct regstat *current,int i)
1570 {
1571   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1572   else lt1[i]=rs1[i];
1573   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1574   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1575     current->is32&=~(1LL<<rt1[i]);
1576     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1577       // TODO: Could preserve the 32-bit flag if the immediate is zero
1578       alloc_reg64(current,i,rt1[i]);
1579       alloc_reg64(current,i,rs1[i]);
1580     }
1581     clear_const(current,rs1[i]);
1582     clear_const(current,rt1[i]);
1583   }
1584   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1585     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1586     current->is32|=1LL<<rt1[i];
1587     clear_const(current,rs1[i]);
1588     clear_const(current,rt1[i]);
1589   }
1590   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1591     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1592       if(rs1[i]!=rt1[i]) {
1593         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1594         alloc_reg64(current,i,rt1[i]);
1595         current->is32&=~(1LL<<rt1[i]);
1596       }
1597     }
1598     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1599     if(is_const(current,rs1[i])) {
1600       int v=get_const(current,rs1[i]);
1601       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1602       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1603       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1604     }
1605     else clear_const(current,rt1[i]);
1606   }
1607   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1608     if(is_const(current,rs1[i])) {
1609       int v=get_const(current,rs1[i]);
1610       set_const(current,rt1[i],v+imm[i]);
1611     }
1612     else clear_const(current,rt1[i]);
1613     current->is32|=1LL<<rt1[i];
1614   }
1615   else {
1616     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1617     current->is32|=1LL<<rt1[i];
1618   }
1619   dirty_reg(current,rt1[i]);
1620 }
1621
1622 void load_alloc(struct regstat *current,int i)
1623 {
1624   clear_const(current,rt1[i]);
1625   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1626   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1627   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1628   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1629     alloc_reg(current,i,rt1[i]);
1630     assert(get_reg(current->regmap,rt1[i])>=0);
1631     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1632     {
1633       current->is32&=~(1LL<<rt1[i]);
1634       alloc_reg64(current,i,rt1[i]);
1635     }
1636     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1637     {
1638       current->is32&=~(1LL<<rt1[i]);
1639       alloc_reg64(current,i,rt1[i]);
1640       alloc_all(current,i);
1641       alloc_reg64(current,i,FTEMP);
1642       minimum_free_regs[i]=HOST_REGS;
1643     }
1644     else current->is32|=1LL<<rt1[i];
1645     dirty_reg(current,rt1[i]);
1646     // If using TLB, need a register for pointer to the mapping table
1647     if(using_tlb) alloc_reg(current,i,TLREG);
1648     // LWL/LWR need a temporary register for the old value
1649     if(opcode[i]==0x22||opcode[i]==0x26)
1650     {
1651       alloc_reg(current,i,FTEMP);
1652       alloc_reg_temp(current,i,-1);
1653       minimum_free_regs[i]=1;
1654     }
1655   }
1656   else
1657   {
1658     // Load to r0 or unneeded register (dummy load)
1659     // but we still need a register to calculate the address
1660     if(opcode[i]==0x22||opcode[i]==0x26)
1661     {
1662       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1663     }
1664     // If using TLB, need a register for pointer to the mapping table
1665     if(using_tlb) alloc_reg(current,i,TLREG);
1666     alloc_reg_temp(current,i,-1);
1667     minimum_free_regs[i]=1;
1668     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1669     {
1670       alloc_all(current,i);
1671       alloc_reg64(current,i,FTEMP);
1672       minimum_free_regs[i]=HOST_REGS;
1673     }
1674   }
1675 }
1676
1677 void store_alloc(struct regstat *current,int i)
1678 {
1679   clear_const(current,rs2[i]);
1680   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1681   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1682   alloc_reg(current,i,rs2[i]);
1683   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1684     alloc_reg64(current,i,rs2[i]);
1685     if(rs2[i]) alloc_reg(current,i,FTEMP);
1686   }
1687   // If using TLB, need a register for pointer to the mapping table
1688   if(using_tlb) alloc_reg(current,i,TLREG);
1689   #if defined(HOST_IMM8)
1690   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1691   else alloc_reg(current,i,INVCP);
1692   #endif
1693   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1694     alloc_reg(current,i,FTEMP);
1695   }
1696   // We need a temporary register for address generation
1697   alloc_reg_temp(current,i,-1);
1698   minimum_free_regs[i]=1;
1699 }
1700
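// COP1 loads/stores (LWC1/LDC1/SWC1/SDC1): the coprocessor status register
// (CSREG) is allocated so the enable check can be emitted, FTEMP carries the
// data, and a temporary is reserved for address generation.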
1701 void c1ls_alloc(struct regstat *current,int i)
1702 {
1703   //clear_const(current,rs1[i]); // FIXME
1704   clear_const(current,rt1[i]);
1705   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1706   alloc_reg(current,i,CSREG); // Status
1707   alloc_reg(current,i,FTEMP);
1708   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1709     alloc_reg64(current,i,FTEMP);
1710   }
1711   // If using TLB, need a register for pointer to the mapping table
1712   if(using_tlb) alloc_reg(current,i,TLREG);
1713   #if defined(HOST_IMM8)
1714   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1715   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1716     alloc_reg(current,i,INVCP);
1717   #endif
1718   // We need a temporary register for address generation
1719   alloc_reg_temp(current,i,-1);
1720 }
1721
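// COP2 loads/stores (LWC2/SWC2, the GTE on the PSX): similar to c1ls_alloc
// but without the status register; FTEMP carries the data and a temporary is
// reserved for address generation.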
1722 void c2ls_alloc(struct regstat *current,int i)
1723 {
1724   clear_const(current,rt1[i]);
1725   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1726   alloc_reg(current,i,FTEMP);
1727   // If using TLB, need a register for pointer to the mapping table
1728   if(using_tlb) alloc_reg(current,i,TLREG);
1729   #if defined(HOST_IMM8)
1730   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1731   else if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1732     alloc_reg(current,i,INVCP);
1733   #endif
1734   // We need a temporary register for address generation
1735   alloc_reg_temp(current,i,-1);
1736   minimum_free_regs[i]=1;
1737 }
1738
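// Default allocator for the MULT/DIV group (an architecture-specific
// assembler may override it by defining multdiv_alloc).  Results go to HI/LO,
// which are marked needed and dirty here; the 64-bit forms allocate
// everything, presumably because they are handled out of line.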
1739 #ifndef multdiv_alloc
1740 void multdiv_alloc(struct regstat *current,int i)
1741 {
1742   //  case 0x18: MULT
1743   //  case 0x19: MULTU
1744   //  case 0x1A: DIV
1745   //  case 0x1B: DIVU
1746   //  case 0x1C: DMULT
1747   //  case 0x1D: DMULTU
1748   //  case 0x1E: DDIV
1749   //  case 0x1F: DDIVU
1750   clear_const(current,rs1[i]);
1751   clear_const(current,rs2[i]);
1752   if(rs1[i]&&rs2[i])
1753   {
1754     if((opcode2[i]&4)==0) // 32-bit
1755     {
1756       current->u&=~(1LL<<HIREG);
1757       current->u&=~(1LL<<LOREG);
1758       alloc_reg(current,i,HIREG);
1759       alloc_reg(current,i,LOREG);
1760       alloc_reg(current,i,rs1[i]);
1761       alloc_reg(current,i,rs2[i]);
1762       current->is32|=1LL<<HIREG;
1763       current->is32|=1LL<<LOREG;
1764       dirty_reg(current,HIREG);
1765       dirty_reg(current,LOREG);
1766     }
1767     else // 64-bit
1768     {
1769       current->u&=~(1LL<<HIREG);
1770       current->u&=~(1LL<<LOREG);
1771       current->uu&=~(1LL<<HIREG);
1772       current->uu&=~(1LL<<LOREG);
1773       alloc_reg64(current,i,HIREG);
1774       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1775       alloc_reg64(current,i,rs1[i]);
1776       alloc_reg64(current,i,rs2[i]);
1777       alloc_all(current,i);
1778       current->is32&=~(1LL<<HIREG);
1779       current->is32&=~(1LL<<LOREG);
1780       dirty_reg(current,HIREG);
1781       dirty_reg(current,LOREG);
1782       minimum_free_regs[i]=HOST_REGS;
1783     }
1784   }
1785   else
1786   {
1787     // Multiply by zero is zero.
1788     // MIPS does not have a divide by zero exception.
1789     // The result is undefined, we return zero.
1790     alloc_reg(current,i,HIREG);
1791     alloc_reg(current,i,LOREG);
1792     current->is32|=1LL<<HIREG;
1793     current->is32|=1LL<<LOREG;
1794     dirty_reg(current,HIREG);
1795     dirty_reg(current,LOREG);
1796   }
1797 }
1798 #endif
1799
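// COP0 (system coprocessor) instructions.  MFC0/MTC0 and the TLB/ERET ops can
// have wide side effects (status changes, interrupts), so everything is
// flushed with alloc_all() and minimum_free_regs is set to HOST_REGS.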
1800 void cop0_alloc(struct regstat *current,int i)
1801 {
1802   if(opcode2[i]==0) // MFC0
1803   {
1804     if(rt1[i]) {
1805       clear_const(current,rt1[i]);
1806       alloc_all(current,i);
1807       alloc_reg(current,i,rt1[i]);
1808       current->is32|=1LL<<rt1[i];
1809       dirty_reg(current,rt1[i]);
1810     }
1811   }
1812   else if(opcode2[i]==4) // MTC0
1813   {
1814     if(rs1[i]){
1815       clear_const(current,rs1[i]);
1816       alloc_reg(current,i,rs1[i]);
1817       alloc_all(current,i);
1818     }
1819     else {
1820       alloc_all(current,i); // FIXME: Keep r0
1821       current->u&=~1LL;
1822       alloc_reg(current,i,0);
1823     }
1824   }
1825   else
1826   {
1827     // TLBR/TLBWI/TLBWR/TLBP/ERET
1828     assert(opcode2[i]==0x10);
1829     alloc_all(current,i);
1830   }
1831   minimum_free_regs[i]=HOST_REGS;
1832 }
1833
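// COP1 (FPU) register moves: MFC1/DMFC1/CFC1 write a GPR (64-bit for DMFC1),
// MTC1/DMTC1/CTC1 read one.  CSREG is loaded first so the coprocessor-enable
// check can be done, and a temporary register is reserved for the emitter.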
1834 void cop1_alloc(struct regstat *current,int i)
1835 {
1836   alloc_reg(current,i,CSREG); // Load status
1837   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1838   {
1839     if(rt1[i]){
1840       clear_const(current,rt1[i]);
1841       if(opcode2[i]==1) {
1842         alloc_reg64(current,i,rt1[i]); // DMFC1
1843         current->is32&=~(1LL<<rt1[i]);
1844       }else{
1845         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1846         current->is32|=1LL<<rt1[i];
1847       }
1848       dirty_reg(current,rt1[i]);
1849     }
1850     alloc_reg_temp(current,i,-1);
1851   }
1852   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1853   {
1854     if(rs1[i]){
1855       clear_const(current,rs1[i]);
1856       if(opcode2[i]==5)
1857         alloc_reg64(current,i,rs1[i]); // DMTC1
1858       else
1859         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1860       alloc_reg_temp(current,i,-1);
1861     }
1862     else {
1863       current->u&=~1LL;
1864       alloc_reg(current,i,0);
1865       alloc_reg_temp(current,i,-1);
1866     }
1867   }
1868   minimum_free_regs[i]=1;
1869 }
1870 void fconv_alloc(struct regstat *current,int i)
1871 {
1872   alloc_reg(current,i,CSREG); // Load status
1873   alloc_reg_temp(current,i,-1);
1874   minimum_free_regs[i]=1;
1875 }
1876 void float_alloc(struct regstat *current,int i)
1877 {
1878   alloc_reg(current,i,CSREG); // Load status
1879   alloc_reg_temp(current,i,-1);
1880   minimum_free_regs[i]=1;
1881 }
1882 void c2op_alloc(struct regstat *current,int i)
1883 {
1884   alloc_reg_temp(current,i,-1);
1885 }
1886 void fcomp_alloc(struct regstat *current,int i)
1887 {
1888   alloc_reg(current,i,CSREG); // Load status
1889   alloc_reg(current,i,FSREG); // Load flags
1890   dirty_reg(current,FSREG); // Flag will be modified
1891   alloc_reg_temp(current,i,-1);
1892   minimum_free_regs[i]=1;
1893 }
1894
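// SYSCALL and similar traps leave the recompiled code entirely, so the cycle
// count register (CCREG) is allocated and dirtied and everything else is
// flushed with alloc_all().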
1895 void syscall_alloc(struct regstat *current,int i)
1896 {
1897   alloc_cc(current,i);
1898   dirty_reg(current,CCREG);
1899   alloc_all(current,i);
1900   minimum_free_regs[i]=HOST_REGS;
1901   current->isconst=0;
1902 }
1903
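// Allocate registers for the instruction in a branch delay slot by
// dispatching on its type.  A jump in the delay slot is not supported; when
// one is found, stop_after_jal is set so speculative precompilation stops.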
1904 void delayslot_alloc(struct regstat *current,int i)
1905 {
1906   switch(itype[i]) {
1907     case UJUMP:
1908     case CJUMP:
1909     case SJUMP:
1910     case RJUMP:
1911     case FJUMP:
1912     case SYSCALL:
1913     case HLECALL:
1914     case SPAN:
1915       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1916       SysPrintf("Disabled speculative precompilation\n");
1917       stop_after_jal=1;
1918       break;
1919     case IMM16:
1920       imm16_alloc(current,i);
1921       break;
1922     case LOAD:
1923     case LOADLR:
1924       load_alloc(current,i);
1925       break;
1926     case STORE:
1927     case STORELR:
1928       store_alloc(current,i);
1929       break;
1930     case ALU:
1931       alu_alloc(current,i);
1932       break;
1933     case SHIFT:
1934       shift_alloc(current,i);
1935       break;
1936     case MULTDIV:
1937       multdiv_alloc(current,i);
1938       break;
1939     case SHIFTIMM:
1940       shiftimm_alloc(current,i);
1941       break;
1942     case MOV:
1943       mov_alloc(current,i);
1944       break;
1945     case COP0:
1946       cop0_alloc(current,i);
1947       break;
1948     case COP1:
1949     case COP2:
1950       cop1_alloc(current,i);
1951       break;
1952     case C1LS:
1953       c1ls_alloc(current,i);
1954       break;
1955     case C2LS:
1956       c2ls_alloc(current,i);
1957       break;
1958     case FCONV:
1959       fconv_alloc(current,i);
1960       break;
1961     case FLOAT:
1962       float_alloc(current,i);
1963       break;
1964     case FCOMP:
1965       fcomp_alloc(current,i);
1966       break;
1967     case C2OP:
1968       c2op_alloc(current,i);
1969       break;
1970   }
1971 }
1972
1973 // Special case where a branch and delay slot span two pages in virtual memory
1974 static void pagespan_alloc(struct regstat *current,int i)
1975 {
1976   current->isconst=0;
1977   current->wasconst=0;
1978   regs[i].wasconst=0;
1979   minimum_free_regs[i]=HOST_REGS;
1980   alloc_all(current,i);
1981   alloc_cc(current,i);
1982   dirty_reg(current,CCREG);
1983   if(opcode[i]==3) // JAL
1984   {
1985     alloc_reg(current,i,31);
1986     dirty_reg(current,31);
1987   }
1988   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1989   {
1990     alloc_reg(current,i,rs1[i]);
1991     if (rt1[i]!=0) {
1992       alloc_reg(current,i,rt1[i]);
1993       dirty_reg(current,rt1[i]);
1994     }
1995   }
1996   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1997   {
1998     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1999     if(rs2[i]) alloc_reg(current,i,rs2[i]);
2000     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
2001     {
2002       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
2003       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
2004     }
2005   }
2006   else
2007   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
2008   {
2009     if(rs1[i]) alloc_reg(current,i,rs1[i]);
2010     if(!((current->is32>>rs1[i])&1))
2011     {
2012       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
2013     }
2014   }
2015   else
2016   if(opcode[i]==0x11) // BC1
2017   {
2018     alloc_reg(current,i,FSREG);
2019     alloc_reg(current,i,CSREG);
2020   }
2021   //else ...
2022 }
2023
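// Queue an out-of-line stub to be generated after the main block.  Each entry
// records the stub type, the address of the branch to patch, the return
// address, and up to five type-specific arguments (for the load/store stubs:
// instruction index, address register, regstat pointer, cycle adjustment and
// register list).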
2024 void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
2025 {
2026   stubs[stubcount][0]=type;
2027   stubs[stubcount][1]=addr;
2028   stubs[stubcount][2]=retaddr;
2029   stubs[stubcount][3]=a;
2030   stubs[stubcount][4]=b;
2031   stubs[stubcount][5]=c;
2032   stubs[stubcount][6]=d;
2033   stubs[stubcount][7]=e;
2034   stubcount++;
2035 }
2036
2037 // Write out a single register
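// (regmap entries with bit 6 set, i.e. reg|64, denote the upper 32 bits of a
// 64-bit MIPS register; on FORCE32 builds the sign-extension store of the
// upper half is skipped.)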
2038 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
2039 {
2040   int hr;
2041   for(hr=0;hr<HOST_REGS;hr++) {
2042     if(hr!=EXCLUDE_REG) {
2043       if((regmap[hr]&63)==r) {
2044         if((dirty>>hr)&1) {
2045           if(regmap[hr]<64) {
2046             emit_storereg(r,hr);
2047 #ifndef FORCE32
2048             if((is32>>regmap[hr])&1) {
2049               emit_sarimm(hr,31,hr);
2050               emit_storereg(r|64,hr);
2051             }
2052 #endif
2053           }else{
2054             emit_storereg(r|64,hr);
2055           }
2056         }
2057       }
2058     }
2059   }
2060 }
2061
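// Debug/trace helpers: mchecksum() checksums RDRAM, rchecksum() checksums the
// GPR file, rlist() dumps the registers, and memdebug() prints a trace line
// for a hard-coded cycle window.  These appear to be reached only from the
// commented-out debug hooks further down in this file.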
2062 int mchecksum()
2063 {
2064   //if(!tracedebug) return 0;
2065   int i;
2066   int sum=0;
2067   for(i=0;i<2097152;i++) {
2068     unsigned int temp=sum;
2069     sum<<=1;
2070     sum|=(~temp)>>31;
2071     sum^=((u_int *)rdram)[i];
2072   }
2073   return sum;
2074 }
2075 int rchecksum()
2076 {
2077   int i;
2078   int sum=0;
2079   for(i=0;i<64;i++)
2080     sum^=((u_int *)reg)[i];
2081   return sum;
2082 }
2083 void rlist()
2084 {
2085   int i;
2086   printf("TRACE: ");
2087   for(i=0;i<32;i++)
2088     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
2089   printf("\n");
2090 #ifndef DISABLE_COP1
2091   printf("TRACE: ");
2092   for(i=0;i<32;i++)
2093     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
2094   printf("\n");
2095 #endif
2096 }
2097
2098 void enabletrace()
2099 {
2100   tracedebug=1;
2101 }
2102
2103 void memdebug(int i)
2104 {
2105   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
2106   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
2107   //rlist();
2108   //if(tracedebug) {
2109   //if(Count>=-2084597794) {
2110   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
2111   //if(0) {
2112     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
2113     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2114     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2115     rlist();
2116     #ifdef __i386__
2117     printf("TRACE: %x\n",(&i)[-1]);
2118     #endif
2119     #ifdef __arm__
2120     int j;
2121     printf("TRACE: %x \n",(&j)[10]);
2122     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2123     #endif
2124     //fflush(stdout);
2125   }
2126   //printf("TRACE: %x\n",(&i)[-1]);
2127 }
2128
2129 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2130 {
2131   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2132 }
2133
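// Emit host code for the register-register ALU group.  Operands are looked up
// with get_reg(); a negative result means the value is not in a host register
// and must be loaded (or the target was dropped as unneeded).  The |64 lookups
// fetch the host register holding the upper 32 bits when the value is not
// known to be 32-bit (was32).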
2134 void alu_assemble(int i,struct regstat *i_regs)
2135 {
2136   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2137     if(rt1[i]) {
2138       signed char s1,s2,t;
2139       t=get_reg(i_regs->regmap,rt1[i]);
2140       if(t>=0) {
2141         s1=get_reg(i_regs->regmap,rs1[i]);
2142         s2=get_reg(i_regs->regmap,rs2[i]);
2143         if(rs1[i]&&rs2[i]) {
2144           assert(s1>=0);
2145           assert(s2>=0);
2146           if(opcode2[i]&2) emit_sub(s1,s2,t);
2147           else emit_add(s1,s2,t);
2148         }
2149         else if(rs1[i]) {
2150           if(s1>=0) emit_mov(s1,t);
2151           else emit_loadreg(rs1[i],t);
2152         }
2153         else if(rs2[i]) {
2154           if(s2>=0) {
2155             if(opcode2[i]&2) emit_neg(s2,t);
2156             else emit_mov(s2,t);
2157           }
2158           else {
2159             emit_loadreg(rs2[i],t);
2160             if(opcode2[i]&2) emit_neg(t,t);
2161           }
2162         }
2163         else emit_zeroreg(t);
2164       }
2165     }
2166   }
2167   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2168     if(rt1[i]) {
2169       signed char s1l,s2l,s1h,s2h,tl,th;
2170       tl=get_reg(i_regs->regmap,rt1[i]);
2171       th=get_reg(i_regs->regmap,rt1[i]|64);
2172       if(tl>=0) {
2173         s1l=get_reg(i_regs->regmap,rs1[i]);
2174         s2l=get_reg(i_regs->regmap,rs2[i]);
2175         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2176         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2177         if(rs1[i]&&rs2[i]) {
2178           assert(s1l>=0);
2179           assert(s2l>=0);
2180           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2181           else emit_adds(s1l,s2l,tl);
2182           if(th>=0) {
2183             #ifdef INVERTED_CARRY
2184             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2185             #else
2186             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2187             #endif
2188             else emit_add(s1h,s2h,th);
2189           }
2190         }
2191         else if(rs1[i]) {
2192           if(s1l>=0) emit_mov(s1l,tl);
2193           else emit_loadreg(rs1[i],tl);
2194           if(th>=0) {
2195             if(s1h>=0) emit_mov(s1h,th);
2196             else emit_loadreg(rs1[i]|64,th);
2197           }
2198         }
2199         else if(rs2[i]) {
2200           if(s2l>=0) {
2201             if(opcode2[i]&2) emit_negs(s2l,tl);
2202             else emit_mov(s2l,tl);
2203           }
2204           else {
2205             emit_loadreg(rs2[i],tl);
2206             if(opcode2[i]&2) emit_negs(tl,tl);
2207           }
2208           if(th>=0) {
2209             #ifdef INVERTED_CARRY
2210             if(s2h>=0) emit_mov(s2h,th);
2211             else emit_loadreg(rs2[i]|64,th);
2212             if(opcode2[i]&2) {
2213               emit_adcimm(-1,th); // x86 has inverted carry flag
2214               emit_not(th,th);
2215             }
2216             #else
2217             if(opcode2[i]&2) {
2218               if(s2h>=0) emit_rscimm(s2h,0,th);
2219               else {
2220                 emit_loadreg(rs2[i]|64,th);
2221                 emit_rscimm(th,0,th);
2222               }
2223             }else{
2224               if(s2h>=0) emit_mov(s2h,th);
2225               else emit_loadreg(rs2[i]|64,th);
2226             }
2227             #endif
2228           }
2229         }
2230         else {
2231           emit_zeroreg(tl);
2232           if(th>=0) emit_zeroreg(th);
2233         }
2234       }
2235     }
2236   }
2237   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2238     if(rt1[i]) {
2239       signed char s1l,s1h,s2l,s2h,t;
2240       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2241       {
2242         t=get_reg(i_regs->regmap,rt1[i]);
2243         //assert(t>=0);
2244         if(t>=0) {
2245           s1l=get_reg(i_regs->regmap,rs1[i]);
2246           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2247           s2l=get_reg(i_regs->regmap,rs2[i]);
2248           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2249           if(rs2[i]==0) // rx<r0
2250           {
2251             assert(s1h>=0);
2252             if(opcode2[i]==0x2a) // SLT
2253               emit_shrimm(s1h,31,t);
2254             else // SLTU (unsigned can not be less than zero)
2255               emit_zeroreg(t);
2256           }
2257           else if(rs1[i]==0) // r0<rx
2258           {
2259             assert(s2h>=0);
2260             if(opcode2[i]==0x2a) // SLT
2261               emit_set_gz64_32(s2h,s2l,t);
2262             else // SLTU (set if not zero)
2263               emit_set_nz64_32(s2h,s2l,t);
2264           }
2265           else {
2266             assert(s1l>=0);assert(s1h>=0);
2267             assert(s2l>=0);assert(s2h>=0);
2268             if(opcode2[i]==0x2a) // SLT
2269               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2270             else // SLTU
2271               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2272           }
2273         }
2274       } else {
2275         t=get_reg(i_regs->regmap,rt1[i]);
2276         //assert(t>=0);
2277         if(t>=0) {
2278           s1l=get_reg(i_regs->regmap,rs1[i]);
2279           s2l=get_reg(i_regs->regmap,rs2[i]);
2280           if(rs2[i]==0) // rx<r0
2281           {
2282             assert(s1l>=0);
2283             if(opcode2[i]==0x2a) // SLT
2284               emit_shrimm(s1l,31,t);
2285             else // SLTU (unsigned can not be less than zero)
2286               emit_zeroreg(t);
2287           }
2288           else if(rs1[i]==0) // r0<rx
2289           {
2290             assert(s2l>=0);
2291             if(opcode2[i]==0x2a) // SLT
2292               emit_set_gz32(s2l,t);
2293             else // SLTU (set if not zero)
2294               emit_set_nz32(s2l,t);
2295           }
2296           else{
2297             assert(s1l>=0);assert(s2l>=0);
2298             if(opcode2[i]==0x2a) // SLT
2299               emit_set_if_less32(s1l,s2l,t);
2300             else // SLTU
2301               emit_set_if_carry32(s1l,s2l,t);
2302           }
2303         }
2304       }
2305     }
2306   }
2307   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2308     if(rt1[i]) {
2309       signed char s1l,s1h,s2l,s2h,th,tl;
2310       tl=get_reg(i_regs->regmap,rt1[i]);
2311       th=get_reg(i_regs->regmap,rt1[i]|64);
2312       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2313       {
2314         assert(tl>=0);
2315         if(tl>=0) {
2316           s1l=get_reg(i_regs->regmap,rs1[i]);
2317           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2318           s2l=get_reg(i_regs->regmap,rs2[i]);
2319           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2320           if(rs1[i]&&rs2[i]) {
2321             assert(s1l>=0);assert(s1h>=0);
2322             assert(s2l>=0);assert(s2h>=0);
2323             if(opcode2[i]==0x24) { // AND
2324               emit_and(s1l,s2l,tl);
2325               emit_and(s1h,s2h,th);
2326             } else
2327             if(opcode2[i]==0x25) { // OR
2328               emit_or(s1l,s2l,tl);
2329               emit_or(s1h,s2h,th);
2330             } else
2331             if(opcode2[i]==0x26) { // XOR
2332               emit_xor(s1l,s2l,tl);
2333               emit_xor(s1h,s2h,th);
2334             } else
2335             if(opcode2[i]==0x27) { // NOR
2336               emit_or(s1l,s2l,tl);
2337               emit_or(s1h,s2h,th);
2338               emit_not(tl,tl);
2339               emit_not(th,th);
2340             }
2341           }
2342           else
2343           {
2344             if(opcode2[i]==0x24) { // AND
2345               emit_zeroreg(tl);
2346               emit_zeroreg(th);
2347             } else
2348             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2349               if(rs1[i]){
2350                 if(s1l>=0) emit_mov(s1l,tl);
2351                 else emit_loadreg(rs1[i],tl);
2352                 if(s1h>=0) emit_mov(s1h,th);
2353                 else emit_loadreg(rs1[i]|64,th);
2354               }
2355               else
2356               if(rs2[i]){
2357                 if(s2l>=0) emit_mov(s2l,tl);
2358                 else emit_loadreg(rs2[i],tl);
2359                 if(s2h>=0) emit_mov(s2h,th);
2360                 else emit_loadreg(rs2[i]|64,th);
2361               }
2362               else{
2363                 emit_zeroreg(tl);
2364                 emit_zeroreg(th);
2365               }
2366             } else
2367             if(opcode2[i]==0x27) { // NOR
2368               if(rs1[i]){
2369                 if(s1l>=0) emit_not(s1l,tl);
2370                 else{
2371                   emit_loadreg(rs1[i],tl);
2372                   emit_not(tl,tl);
2373                 }
2374                 if(s1h>=0) emit_not(s1h,th);
2375                 else{
2376                   emit_loadreg(rs1[i]|64,th);
2377                   emit_not(th,th);
2378                 }
2379               }
2380               else
2381               if(rs2[i]){
2382                 if(s2l>=0) emit_not(s2l,tl);
2383                 else{
2384                   emit_loadreg(rs2[i],tl);
2385                   emit_not(tl,tl);
2386                 }
2387                 if(s2h>=0) emit_not(s2h,th);
2388                 else{
2389                   emit_loadreg(rs2[i]|64,th);
2390                   emit_not(th,th);
2391                 }
2392               }
2393               else {
2394                 emit_movimm(-1,tl);
2395                 emit_movimm(-1,th);
2396               }
2397             }
2398           }
2399         }
2400       }
2401       else
2402       {
2403         // 32 bit
2404         if(tl>=0) {
2405           s1l=get_reg(i_regs->regmap,rs1[i]);
2406           s2l=get_reg(i_regs->regmap,rs2[i]);
2407           if(rs1[i]&&rs2[i]) {
2408             assert(s1l>=0);
2409             assert(s2l>=0);
2410             if(opcode2[i]==0x24) { // AND
2411               emit_and(s1l,s2l,tl);
2412             } else
2413             if(opcode2[i]==0x25) { // OR
2414               emit_or(s1l,s2l,tl);
2415             } else
2416             if(opcode2[i]==0x26) { // XOR
2417               emit_xor(s1l,s2l,tl);
2418             } else
2419             if(opcode2[i]==0x27) { // NOR
2420               emit_or(s1l,s2l,tl);
2421               emit_not(tl,tl);
2422             }
2423           }
2424           else
2425           {
2426             if(opcode2[i]==0x24) { // AND
2427               emit_zeroreg(tl);
2428             } else
2429             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2430               if(rs1[i]){
2431                 if(s1l>=0) emit_mov(s1l,tl);
2432                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2433               }
2434               else
2435               if(rs2[i]){
2436                 if(s2l>=0) emit_mov(s2l,tl);
2437                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2438               }
2439               else emit_zeroreg(tl);
2440             } else
2441             if(opcode2[i]==0x27) { // NOR
2442               if(rs1[i]){
2443                 if(s1l>=0) emit_not(s1l,tl);
2444                 else {
2445                   emit_loadreg(rs1[i],tl);
2446                   emit_not(tl,tl);
2447                 }
2448               }
2449               else
2450               if(rs2[i]){
2451                 if(s2l>=0) emit_not(s2l,tl);
2452                 else {
2453                   emit_loadreg(rs2[i],tl);
2454                   emit_not(tl,tl);
2455                 }
2456               }
2457               else emit_movimm(-1,tl);
2458             }
2459           }
2460         }
2461       }
2462     }
2463   }
2464 }
2465
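// Emit host code for the immediate group (LUI, ADDI/ADDIU, DADDI/DADDIU,
// SLTI/SLTIU, ANDI/ORI/XORI).  Targets whose value is already known constant
// (isconst) are skipped; when the source was constant (wasconst) the result
// is materialized directly with emit_movimm().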
2466 void imm16_assemble(int i,struct regstat *i_regs)
2467 {
2468   if (opcode[i]==0x0f) { // LUI
2469     if(rt1[i]) {
2470       signed char t;
2471       t=get_reg(i_regs->regmap,rt1[i]);
2472       //assert(t>=0);
2473       if(t>=0) {
2474         if(!((i_regs->isconst>>t)&1))
2475           emit_movimm(imm[i]<<16,t);
2476       }
2477     }
2478   }
2479   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2480     if(rt1[i]) {
2481       signed char s,t;
2482       t=get_reg(i_regs->regmap,rt1[i]);
2483       s=get_reg(i_regs->regmap,rs1[i]);
2484       if(rs1[i]) {
2485         //assert(t>=0);
2486         //assert(s>=0);
2487         if(t>=0) {
2488           if(!((i_regs->isconst>>t)&1)) {
2489             if(s<0) {
2490               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2491               emit_addimm(t,imm[i],t);
2492             }else{
2493               if(!((i_regs->wasconst>>s)&1))
2494                 emit_addimm(s,imm[i],t);
2495               else
2496                 emit_movimm(constmap[i][s]+imm[i],t);
2497             }
2498           }
2499         }
2500       } else {
2501         if(t>=0) {
2502           if(!((i_regs->isconst>>t)&1))
2503             emit_movimm(imm[i],t);
2504         }
2505       }
2506     }
2507   }
2508   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2509     if(rt1[i]) {
2510       signed char sh,sl,th,tl;
2511       th=get_reg(i_regs->regmap,rt1[i]|64);
2512       tl=get_reg(i_regs->regmap,rt1[i]);
2513       sh=get_reg(i_regs->regmap,rs1[i]|64);
2514       sl=get_reg(i_regs->regmap,rs1[i]);
2515       if(tl>=0) {
2516         if(rs1[i]) {
2517           assert(sh>=0);
2518           assert(sl>=0);
2519           if(th>=0) {
2520             emit_addimm64_32(sh,sl,imm[i],th,tl);
2521           }
2522           else {
2523             emit_addimm(sl,imm[i],tl);
2524           }
2525         } else {
2526           emit_movimm(imm[i],tl);
2527           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2528         }
2529       }
2530     }
2531   }
2532   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2533     if(rt1[i]) {
2534       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2535       signed char sh,sl,t;
2536       t=get_reg(i_regs->regmap,rt1[i]);
2537       sh=get_reg(i_regs->regmap,rs1[i]|64);
2538       sl=get_reg(i_regs->regmap,rs1[i]);
2539       //assert(t>=0);
2540       if(t>=0) {
2541         if(rs1[i]>0) {
2542           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2543           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2544             if(opcode[i]==0x0a) { // SLTI
2545               if(sl<0) {
2546                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2547                 emit_slti32(t,imm[i],t);
2548               }else{
2549                 emit_slti32(sl,imm[i],t);
2550               }
2551             }
2552             else { // SLTIU
2553               if(sl<0) {
2554                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2555                 emit_sltiu32(t,imm[i],t);
2556               }else{
2557                 emit_sltiu32(sl,imm[i],t);
2558               }
2559             }
2560           }else{ // 64-bit
2561             assert(sl>=0);
2562             if(opcode[i]==0x0a) // SLTI
2563               emit_slti64_32(sh,sl,imm[i],t);
2564             else // SLTIU
2565               emit_sltiu64_32(sh,sl,imm[i],t);
2566           }
2567         }else{
2568           // SLTI(U) with r0 is just stupid,
2569           // nonetheless examples can be found
2570           if(opcode[i]==0x0a) // SLTI
2571             if(0<imm[i]) emit_movimm(1,t);
2572             else emit_zeroreg(t);
2573           else // SLTIU
2574           {
2575             if(imm[i]) emit_movimm(1,t);
2576             else emit_zeroreg(t);
2577           }
2578         }
2579       }
2580     }
2581   }
2582   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2583     if(rt1[i]) {
2584       signed char sh,sl,th,tl;
2585       th=get_reg(i_regs->regmap,rt1[i]|64);
2586       tl=get_reg(i_regs->regmap,rt1[i]);
2587       sh=get_reg(i_regs->regmap,rs1[i]|64);
2588       sl=get_reg(i_regs->regmap,rs1[i]);
2589       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2590         if(opcode[i]==0x0c) //ANDI
2591         {
2592           if(rs1[i]) {
2593             if(sl<0) {
2594               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2595               emit_andimm(tl,imm[i],tl);
2596             }else{
2597               if(!((i_regs->wasconst>>sl)&1))
2598                 emit_andimm(sl,imm[i],tl);
2599               else
2600                 emit_movimm(constmap[i][sl]&imm[i],tl);
2601             }
2602           }
2603           else
2604             emit_zeroreg(tl);
2605           if(th>=0) emit_zeroreg(th);
2606         }
2607         else
2608         {
2609           if(rs1[i]) {
2610             if(sl<0) {
2611               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2612             }
2613             if(th>=0) {
2614               if(sh<0) {
2615                 emit_loadreg(rs1[i]|64,th);
2616               }else{
2617                 emit_mov(sh,th);
2618               }
2619             }
2620             if(opcode[i]==0x0d) //ORI
2621             if(sl<0) {
2622               emit_orimm(tl,imm[i],tl);
2623             }else{
2624               if(!((i_regs->wasconst>>sl)&1))
2625                 emit_orimm(sl,imm[i],tl);
2626               else
2627                 emit_movimm(constmap[i][sl]|imm[i],tl);
2628             }
2629             if(opcode[i]==0x0e) //XORI
2630             if(sl<0) {
2631               emit_xorimm(tl,imm[i],tl);
2632             }else{
2633               if(!((i_regs->wasconst>>sl)&1))
2634                 emit_xorimm(sl,imm[i],tl);
2635               else
2636                 emit_movimm(constmap[i][sl]^imm[i],tl);
2637             }
2638           }
2639           else {
2640             emit_movimm(imm[i],tl);
2641             if(th>=0) emit_zeroreg(th);
2642           }
2643         }
2644       }
2645     }
2646   }
2647 }
2648
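// Emit host code for shift-by-immediate.  SLL/SRL/SRA are plain 32-bit
// shifts; DSLL/DSRL/DSRA shift a 64-bit value held in a low/high register
// pair, and the *32 forms move one half into the other before shifting by
// the remaining amount.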
2649 void shiftimm_assemble(int i,struct regstat *i_regs)
2650 {
2651   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2652   {
2653     if(rt1[i]) {
2654       signed char s,t;
2655       t=get_reg(i_regs->regmap,rt1[i]);
2656       s=get_reg(i_regs->regmap,rs1[i]);
2657       //assert(t>=0);
2658       if(t>=0&&!((i_regs->isconst>>t)&1)){
2659         if(rs1[i]==0)
2660         {
2661           emit_zeroreg(t);
2662         }
2663         else
2664         {
2665           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2666           if(imm[i]) {
2667             if(opcode2[i]==0) // SLL
2668             {
2669               emit_shlimm(s<0?t:s,imm[i],t);
2670             }
2671             if(opcode2[i]==2) // SRL
2672             {
2673               emit_shrimm(s<0?t:s,imm[i],t);
2674             }
2675             if(opcode2[i]==3) // SRA
2676             {
2677               emit_sarimm(s<0?t:s,imm[i],t);
2678             }
2679           }else{
2680             // Shift by zero
2681             if(s>=0 && s!=t) emit_mov(s,t);
2682           }
2683         }
2684       }
2685       //emit_storereg(rt1[i],t); //DEBUG
2686     }
2687   }
2688   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2689   {
2690     if(rt1[i]) {
2691       signed char sh,sl,th,tl;
2692       th=get_reg(i_regs->regmap,rt1[i]|64);
2693       tl=get_reg(i_regs->regmap,rt1[i]);
2694       sh=get_reg(i_regs->regmap,rs1[i]|64);
2695       sl=get_reg(i_regs->regmap,rs1[i]);
2696       if(tl>=0) {
2697         if(rs1[i]==0)
2698         {
2699           emit_zeroreg(tl);
2700           if(th>=0) emit_zeroreg(th);
2701         }
2702         else
2703         {
2704           assert(sl>=0);
2705           assert(sh>=0);
2706           if(imm[i]) {
2707             if(opcode2[i]==0x38) // DSLL
2708             {
2709               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2710               emit_shlimm(sl,imm[i],tl);
2711             }
2712             if(opcode2[i]==0x3a) // DSRL
2713             {
2714               emit_shrdimm(sl,sh,imm[i],tl);
2715               if(th>=0) emit_shrimm(sh,imm[i],th);
2716             }
2717             if(opcode2[i]==0x3b) // DSRA
2718             {
2719               emit_shrdimm(sl,sh,imm[i],tl);
2720               if(th>=0) emit_sarimm(sh,imm[i],th);
2721             }
2722           }else{
2723             // Shift by zero
2724             if(sl!=tl) emit_mov(sl,tl);
2725             if(th>=0&&sh!=th) emit_mov(sh,th);
2726           }
2727         }
2728       }
2729     }
2730   }
2731   if(opcode2[i]==0x3c) // DSLL32
2732   {
2733     if(rt1[i]) {
2734       signed char sl,tl,th;
2735       tl=get_reg(i_regs->regmap,rt1[i]);
2736       th=get_reg(i_regs->regmap,rt1[i]|64);
2737       sl=get_reg(i_regs->regmap,rs1[i]);
2738       if(th>=0||tl>=0){
2739         assert(tl>=0);
2740         assert(th>=0);
2741         assert(sl>=0);
2742         emit_mov(sl,th);
2743         emit_zeroreg(tl);
2744         if(imm[i]>32)
2745         {
2746           emit_shlimm(th,imm[i]&31,th);
2747         }
2748       }
2749     }
2750   }
2751   if(opcode2[i]==0x3e) // DSRL32
2752   {
2753     if(rt1[i]) {
2754       signed char sh,tl,th;
2755       tl=get_reg(i_regs->regmap,rt1[i]);
2756       th=get_reg(i_regs->regmap,rt1[i]|64);
2757       sh=get_reg(i_regs->regmap,rs1[i]|64);
2758       if(tl>=0){
2759         assert(sh>=0);
2760         emit_mov(sh,tl);
2761         if(th>=0) emit_zeroreg(th);
2762         if(imm[i]>32)
2763         {
2764           emit_shrimm(tl,imm[i]&31,tl);
2765         }
2766       }
2767     }
2768   }
2769   if(opcode2[i]==0x3f) // DSRA32
2770   {
2771     if(rt1[i]) {
2772       signed char sh,tl;
2773       tl=get_reg(i_regs->regmap,rt1[i]);
2774       sh=get_reg(i_regs->regmap,rs1[i]|64);
2775       if(tl>=0){
2776         assert(sh>=0);
2777         emit_mov(sh,tl);
2778         if(imm[i]>32)
2779         {
2780           emit_sarimm(tl,imm[i]&31,tl);
2781         }
2782       }
2783     }
2784   }
2785 }
2786
2787 #ifndef shift_assemble
2788 void shift_assemble(int i,struct regstat *i_regs)
2789 {
2790   printf("Need shift_assemble for this architecture.\n");
2791   exit(1);
2792 }
2793 #endif
2794
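// Emit host code for loads.  If the effective address is known constant (c)
// and inside RAM (memtarget), the access is inlined directly; a constant
// address outside RAM goes through inline_readstub().  Otherwise a fast path
// with a range check is emitted and a LOADx_STUB is queued for the slow
// (I/O / out-of-range) case.  On BIG_ENDIAN_MIPS builds the address is xor'd
// with 3 or 2 for byte/halfword accesses to match the host layout.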
2795 void load_assemble(int i,struct regstat *i_regs)
2796 {
2797   int s,th,tl,addr,map=-1;
2798   int offset;
2799   int jaddr=0;
2800   int memtarget=0,c=0;
2801   int fastload_reg_override=0;
2802   u_int hr,reglist=0;
2803   th=get_reg(i_regs->regmap,rt1[i]|64);
2804   tl=get_reg(i_regs->regmap,rt1[i]);
2805   s=get_reg(i_regs->regmap,rs1[i]);
2806   offset=imm[i];
2807   for(hr=0;hr<HOST_REGS;hr++) {
2808     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2809   }
2810   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2811   if(s>=0) {
2812     c=(i_regs->wasconst>>s)&1;
2813     if (c) {
2814       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2815       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2816     }
2817   }
2818   //printf("load_assemble: c=%d\n",c);
2819   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2820   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2821 #ifdef PCSX
2822   if(tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80)
2823     ||rt1[i]==0) {
2824       // could be a hardware FIFO (0x1f80xxxx), so the read must still be performed
2825       // (the rt1[i]==0 case is a dummy read to r0)
2826       assem_debug("(forced read)\n");
2827       tl=get_reg(i_regs->regmap,-1);
2828       assert(tl>=0);
2829   }
2830 #endif
2831   if(offset||s<0||c) addr=tl;
2832   else addr=s;
2833   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2834  if(tl>=0) {
2835   //printf("load_assemble: c=%d\n",c);
2836   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2837   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2838   reglist&=~(1<<tl);
2839   if(th>=0) reglist&=~(1<<th);
2840   if(!using_tlb) {
2841     if(!c) {
2842       #ifdef RAM_OFFSET
2843       map=get_reg(i_regs->regmap,ROREG);
2844       if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2845       #endif
2846 //#define R29_HACK 1
2847       #ifdef R29_HACK
2848       // Strmnnrmn's speed hack
2849       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2850       #endif
2851       {
2852         jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2853       }
2854     }
2855     else if(ram_offset&&memtarget) {
2856       emit_addimm(addr,ram_offset,HOST_TEMPREG);
2857       fastload_reg_override=HOST_TEMPREG;
2858     }
2859   }else{ // using tlb
2860     int x=0;
2861     if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2862     if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2863     map=get_reg(i_regs->regmap,TLREG);
2864     assert(map>=0);
2865     reglist&=~(1<<map);
2866     map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2867     do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2868   }
2869   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2870   if (opcode[i]==0x20) { // LB
2871     if(!c||memtarget) {
2872       if(!dummy) {
2873         #ifdef HOST_IMM_ADDR32
2874         if(c)
2875           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2876         else
2877         #endif
2878         {
2879           //emit_xorimm(addr,3,tl);
2880           //gen_tlb_addr_r(tl,map);
2881           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2882           int x=0,a=tl;
2883 #ifdef BIG_ENDIAN_MIPS
2884           if(!c) emit_xorimm(addr,3,tl);
2885           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2886 #else
2887           if(!c) a=addr;
2888 #endif
2889           if(fastload_reg_override) a=fastload_reg_override;
2890
2891           emit_movsbl_indexed_tlb(x,a,map,tl);
2892         }
2893       }
2894       if(jaddr)
2895         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2896     }
2897     else
2898       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2899   }
2900   if (opcode[i]==0x21) { // LH
2901     if(!c||memtarget) {
2902       if(!dummy) {
2903         #ifdef HOST_IMM_ADDR32
2904         if(c)
2905           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2906         else
2907         #endif
2908         {
2909           int x=0,a=tl;
2910 #ifdef BIG_ENDIAN_MIPS
2911           if(!c) emit_xorimm(addr,2,tl);
2912           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2913 #else
2914           if(!c) a=addr;
2915 #endif
2916           if(fastload_reg_override) a=fastload_reg_override;
2917           //#ifdef
2918           //emit_movswl_indexed_tlb(x,tl,map,tl);
2919           //else
2920           if(map>=0) {
2921             gen_tlb_addr_r(a,map);
2922             emit_movswl_indexed(x,a,tl);
2923           }else{
2924             #if 1 //def RAM_OFFSET
2925             emit_movswl_indexed(x,a,tl);
2926             #else
2927             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2928             #endif
2929           }
2930         }
2931       }
2932       if(jaddr)
2933         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2934     }
2935     else
2936       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2937   }
2938   if (opcode[i]==0x23) { // LW
2939     if(!c||memtarget) {
2940       if(!dummy) {
2941         int a=addr;
2942         if(fastload_reg_override) a=fastload_reg_override;
2943         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2944         #ifdef HOST_IMM_ADDR32
2945         if(c)
2946           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2947         else
2948         #endif
2949         emit_readword_indexed_tlb(0,a,map,tl);
2950       }
2951       if(jaddr)
2952         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2953     }
2954     else
2955       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2956   }
2957   if (opcode[i]==0x24) { // LBU
2958     if(!c||memtarget) {
2959       if(!dummy) {
2960         #ifdef HOST_IMM_ADDR32
2961         if(c)
2962           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2963         else
2964         #endif
2965         {
2966           //emit_xorimm(addr,3,tl);
2967           //gen_tlb_addr_r(tl,map);
2968           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2969           int x=0,a=tl;
2970 #ifdef BIG_ENDIAN_MIPS
2971           if(!c) emit_xorimm(addr,3,tl);
2972           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2973 #else
2974           if(!c) a=addr;
2975 #endif
2976           if(fastload_reg_override) a=fastload_reg_override;
2977
2978           emit_movzbl_indexed_tlb(x,a,map,tl);
2979         }
2980       }
2981       if(jaddr)
2982         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2983     }
2984     else
2985       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2986   }
2987   if (opcode[i]==0x25) { // LHU
2988     if(!c||memtarget) {
2989       if(!dummy) {
2990         #ifdef HOST_IMM_ADDR32
2991         if(c)
2992           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2993         else
2994         #endif
2995         {
2996           int x=0,a=tl;
2997 #ifdef BIG_ENDIAN_MIPS
2998           if(!c) emit_xorimm(addr,2,tl);
2999           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3000 #else
3001           if(!c) a=addr;
3002 #endif
3003           if(fastload_reg_override) a=fastload_reg_override;
3004           //#ifdef
3005           //emit_movzwl_indexed_tlb(x,tl,map,tl);
3006           //#else
3007           if(map>=0) {
3008             gen_tlb_addr_r(a,map);
3009             emit_movzwl_indexed(x,a,tl);
3010           }else{
3011             #if 1 //def RAM_OFFSET
3012             emit_movzwl_indexed(x,a,tl);
3013             #else
3014             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
3015             #endif
3016           }
3017         }
3018       }
3019       if(jaddr)
3020         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3021     }
3022     else
3023       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3024   }
3025   if (opcode[i]==0x27) { // LWU
3026     assert(th>=0);
3027     if(!c||memtarget) {
3028       if(!dummy) {
3029         int a=addr;
3030         if(fastload_reg_override) a=fastload_reg_override;
3031         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
3032         #ifdef HOST_IMM_ADDR32
3033         if(c)
3034           emit_readword_tlb(constmap[i][s]+offset,map,tl);
3035         else
3036         #endif
3037         emit_readword_indexed_tlb(0,a,map,tl);
3038       }
3039       if(jaddr)
3040         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3041     }
3042     else {
3043       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3044     }
3045     emit_zeroreg(th);
3046   }
3047   if (opcode[i]==0x37) { // LD
3048     if(!c||memtarget) {
3049       if(!dummy) {
3050         int a=addr;
3051         if(fastload_reg_override) a=fastload_reg_override;
3052         //gen_tlb_addr_r(tl,map);
3053         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
3054         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
3055         #ifdef HOST_IMM_ADDR32
3056         if(c)
3057           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3058         else
3059         #endif
3060         emit_readdword_indexed_tlb(0,a,map,th,tl);
3061       }
3062       if(jaddr)
3063         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3064     }
3065     else
3066       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3067   }
3068  }
3069   //emit_storereg(rt1[i],tl); // DEBUG
3070   //if(opcode[i]==0x23)
3071   //if(opcode[i]==0x24)
3072   //if(opcode[i]==0x23||opcode[i]==0x24)
3073   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
3074   {
3075     //emit_pusha();
3076     save_regs(0x100f);
3077         emit_readword((int)&last_count,ECX);
3078         #ifdef __i386__
3079         if(get_reg(i_regs->regmap,CCREG)<0)
3080           emit_loadreg(CCREG,HOST_CCREG);
3081         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3082         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3083         emit_writeword(HOST_CCREG,(int)&Count);
3084         #endif
3085         #ifdef __arm__
3086         if(get_reg(i_regs->regmap,CCREG)<0)
3087           emit_loadreg(CCREG,0);
3088         else
3089           emit_mov(HOST_CCREG,0);
3090         emit_add(0,ECX,0);
3091         emit_addimm(0,2*ccadj[i],0);
3092         emit_writeword(0,(int)&Count);
3093         #endif
3094     emit_call((int)memdebug);
3095     //emit_popa();
3096     restore_regs(0x100f);
3097   }/**/
3098 }
3099
3100 #ifndef loadlr_assemble
3101 void loadlr_assemble(int i,struct regstat *i_regs)
3102 {
3103   printf("Need loadlr_assemble for this architecture.\n");
3104   exit(1);
3105 }
3106 #endif
3107
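// Emit host code for stores, mirroring load_assemble: inline fast path for
// known-RAM targets, a STOREx_STUB for the slow path, inline_writestub() for
// constant addresses outside RAM.  After the write, invalid_code is checked
// so that stores into already-translated blocks trigger invalidation
// (self-modifying code), unless NDHACK_NO_SMC_CHECK disables the check.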
3108 void store_assemble(int i,struct regstat *i_regs)
3109 {
3110   int s,th,tl,map=-1;
3111   int addr,temp;
3112   int offset;
3113   int jaddr=0,jaddr2,type;
3114   int memtarget=0,c=0;
3115   int agr=AGEN1+(i&1);
3116   int faststore_reg_override=0;
3117   u_int hr,reglist=0;
3118   th=get_reg(i_regs->regmap,rs2[i]|64);
3119   tl=get_reg(i_regs->regmap,rs2[i]);
3120   s=get_reg(i_regs->regmap,rs1[i]);
3121   temp=get_reg(i_regs->regmap,agr);
3122   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3123   offset=imm[i];
3124   if(s>=0) {
3125     c=(i_regs->wasconst>>s)&1;
3126     if(c) {
3127       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3128       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3129     }
3130   }
3131   assert(tl>=0);
3132   assert(temp>=0);
3133   for(hr=0;hr<HOST_REGS;hr++) {
3134     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3135   }
3136   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3137   if(offset||s<0||c) addr=temp;
3138   else addr=s;
3139   if(!using_tlb) {
3140     if(!c) {
3141       #ifndef PCSX
3142       #ifdef R29_HACK
3143       // Strmnnrmn's speed hack
3144       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3145       #endif
3146       emit_cmpimm(addr,RAM_SIZE);
3147       #ifdef DESTRUCTIVE_SHIFT
3148       if(s==addr) emit_mov(s,temp);
3149       #endif
3150       #ifdef R29_HACK
3151       memtarget=1;
3152       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3153       #endif
3154       {
3155         jaddr=(int)out;
3156         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
3157         // Hint to branch predictor that the branch is unlikely to be taken
3158         if(rs1[i]>=28)
3159           emit_jno_unlikely(0);
3160         else
3161         #endif
3162         emit_jno(0);
3163       }
3164       #else
3165         jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
3166       #endif
3167     }
3168     else if(ram_offset&&memtarget) {
3169       emit_addimm(addr,ram_offset,HOST_TEMPREG);
3170       faststore_reg_override=HOST_TEMPREG;
3171     }
3172   }else{ // using tlb
3173     int x=0;
3174     if (opcode[i]==0x28) x=3; // SB
3175     if (opcode[i]==0x29) x=2; // SH
3176     map=get_reg(i_regs->regmap,TLREG);
3177     assert(map>=0);
3178     reglist&=~(1<<map);
3179     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
3180     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3181   }
3182
3183   if (opcode[i]==0x28) { // SB
3184     if(!c||memtarget) {
3185       int x=0,a=temp;
3186 #ifdef BIG_ENDIAN_MIPS
3187       if(!c) emit_xorimm(addr,3,temp);
3188       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3189 #else
3190       if(!c) a=addr;
3191 #endif
3192       if(faststore_reg_override) a=faststore_reg_override;
3193       //gen_tlb_addr_w(temp,map);
3194       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3195       emit_writebyte_indexed_tlb(tl,x,a,map,a);
3196     }
3197     type=STOREB_STUB;
3198   }
3199   if (opcode[i]==0x29) { // SH
3200     if(!c||memtarget) {
3201       int x=0,a=temp;
3202 #ifdef BIG_ENDIAN_MIPS
3203       if(!c) emit_xorimm(addr,2,temp);
3204       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3205 #else
3206       if(!c) a=addr;
3207 #endif
3208       if(faststore_reg_override) a=faststore_reg_override;
3209       //#ifdef
3210       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3211       //#else
3212       if(map>=0) {
3213         gen_tlb_addr_w(a,map);
3214         emit_writehword_indexed(tl,x,a);
3215       }else
3216         //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
3217         emit_writehword_indexed(tl,x,a);
3218     }
3219     type=STOREH_STUB;
3220   }
3221   if (opcode[i]==0x2B) { // SW
3222     if(!c||memtarget) {
3223       int a=addr;
3224       if(faststore_reg_override) a=faststore_reg_override;
3225       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3226       emit_writeword_indexed_tlb(tl,0,a,map,temp);
3227     }
3228     type=STOREW_STUB;
3229   }
3230   if (opcode[i]==0x3F) { // SD
3231     if(!c||memtarget) {
3232       int a=addr;
3233       if(faststore_reg_override) a=faststore_reg_override;
3234       if(rs2[i]) {
3235         assert(th>=0);
3236         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3237         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3238         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
3239       }else{
3240         // Store zero
3241         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3242         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3243         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
3244       }
3245     }
3246     type=STORED_STUB;
3247   }
3248 #ifdef PCSX
3249   if(jaddr) {
3250     // PCSX store handlers don't check invcode again
3251     reglist|=1<<addr;
3252     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3253     jaddr=0;
3254   }
3255 #endif
3256   if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3257     if(!c||memtarget) {
3258       #ifdef DESTRUCTIVE_SHIFT
3259       // The x86 shift operation is 'destructive'; it overwrites the
3260       // source register, so we need to make a copy first and use that.
3261       addr=temp;
3262       #endif
3263       #if defined(HOST_IMM8)
3264       int ir=get_reg(i_regs->regmap,INVCP);
3265       assert(ir>=0);
3266       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3267       #else
3268       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3269       #endif
3270       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3271       emit_callne(invalidate_addr_reg[addr]);
3272       #else
3273       jaddr2=(int)out;
3274       emit_jne(0);
3275       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3276       #endif
3277     }
3278   }
3279   u_int addr_val=constmap[i][s]+offset;
3280   if(jaddr) {
3281     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3282   } else if(c&&!memtarget) {
3283     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
3284   }
3285   // Basic detection of a store that modifies the current block;
3286   // we don't look backwards, as earlier code should already be in the MIPS i-cache.
3287   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
3288     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
3289     assert(i_regs->regmap==regs[i].regmap); // not delay slot
3290     if(i_regs->regmap==regs[i].regmap) {
3291       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
3292       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
3293       emit_movimm(start+i*4+4,0);
3294       emit_writeword(0,(int)&pcaddr);
3295       emit_jmp((int)do_interrupt);
3296     }
3297   }
3298   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3299   //if(opcode[i]==0x2B || opcode[i]==0x28)
3300   //if(opcode[i]==0x2B || opcode[i]==0x29)
3301   //if(opcode[i]==0x2B)
3302   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3303   {
3304     #ifdef __i386__
3305     emit_pusha();
3306     #endif
3307     #ifdef __arm__
3308     save_regs(0x100f);
3309     #endif
3310         emit_readword((int)&last_count,ECX);
3311         #ifdef __i386__
3312         if(get_reg(i_regs->regmap,CCREG)<0)
3313           emit_loadreg(CCREG,HOST_CCREG);
3314         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3315         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3316         emit_writeword(HOST_CCREG,(int)&Count);
3317         #endif
3318         #ifdef __arm__
3319         if(get_reg(i_regs->regmap,CCREG)<0)
3320           emit_loadreg(CCREG,0);
3321         else
3322           emit_mov(HOST_CCREG,0);
3323         emit_add(0,ECX,0);
3324         emit_addimm(0,2*ccadj[i],0);
3325         emit_writeword(0,(int)&Count);
3326         #endif
3327     emit_call((int)memdebug);
3328     #ifdef __i386__
3329     emit_popa();
3330     #endif
3331     #ifdef __arm__
3332     restore_regs(0x100f);
3333     #endif
3334   }/**/
3335 }
3336
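// Unaligned stores (SWL/SWR/SDL/SDR).  The low two bits of the address
// (xor'd with 3 on non-BIG_ENDIAN_MIPS builds so the same case code can be
// reused) select one of four cases; emit_testimm on bits 1 and 0 dispatches
// to the case1/case2/case3 labels, each of which writes only the affected
// bytes and halfwords.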
3337 void storelr_assemble(int i,struct regstat *i_regs)
3338 {
3339   int s,th,tl;
3340   int temp;
3341   int temp2;
3342   int offset;
3343   int jaddr=0,jaddr2;
3344   int case1,case2,case3;
3345   int done0,done1,done2;
3346   int memtarget=0,c=0;
3347   int agr=AGEN1+(i&1);
3348   u_int hr,reglist=0;
3349   th=get_reg(i_regs->regmap,rs2[i]|64);
3350   tl=get_reg(i_regs->regmap,rs2[i]);
3351   s=get_reg(i_regs->regmap,rs1[i]);
3352   temp=get_reg(i_regs->regmap,agr);
3353   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3354   offset=imm[i];
3355   if(s>=0) {
3356     c=(i_regs->isconst>>s)&1;
3357     if(c) {
3358       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3359       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3360     }
3361   }
3362   assert(tl>=0);
3363   for(hr=0;hr<HOST_REGS;hr++) {
3364     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3365   }
3366   assert(temp>=0);
3367   if(!using_tlb) {
3368     if(!c) {
3369       emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3370       if(!offset&&s!=temp) emit_mov(s,temp);
3371       jaddr=(int)out;
3372       emit_jno(0);
3373     }
3374     else
3375     {
3376       if(!memtarget||!rs1[i]) {
3377         jaddr=(int)out;
3378         emit_jmp(0);
3379       }
3380     }
3381     #ifdef RAM_OFFSET
3382     int map=get_reg(i_regs->regmap,ROREG);
3383     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3384     gen_tlb_addr_w(temp,map);
3385     #else
3386     if((u_int)rdram!=0x80000000) 
3387       emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3388     #endif
3389   }else{ // using tlb
3390     int map=get_reg(i_regs->regmap,TLREG);
3391     assert(map>=0);
3392     reglist&=~(1<<map);
3393     map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3394     if(!c&&!offset&&s>=0) emit_mov(s,temp);
3395     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3396     if(!jaddr&&!memtarget) {
3397       jaddr=(int)out;
3398       emit_jmp(0);
3399     }
3400     gen_tlb_addr_w(temp,map);
3401   }
3402
3403   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3404     temp2=get_reg(i_regs->regmap,FTEMP);
3405     if(!rs2[i]) temp2=th=tl;
3406   }
3407
3408 #ifndef BIG_ENDIAN_MIPS
3409     emit_xorimm(temp,3,temp);
3410 #endif
3411   emit_testimm(temp,2);
3412   case2=(int)out;
3413   emit_jne(0);
3414   emit_testimm(temp,1);
3415   case1=(int)out;
3416   emit_jne(0);
3417   // 0
3418   if (opcode[i]==0x2A) { // SWL
3419     emit_writeword_indexed(tl,0,temp);
3420   }
3421   if (opcode[i]==0x2E) { // SWR
3422     emit_writebyte_indexed(tl,3,temp);
3423   }
3424   if (opcode[i]==0x2C) { // SDL
3425     emit_writeword_indexed(th,0,temp);
3426     if(rs2[i]) emit_mov(tl,temp2);
3427   }
3428   if (opcode[i]==0x2D) { // SDR
3429     emit_writebyte_indexed(tl,3,temp);
3430     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3431   }
3432   done0=(int)out;
3433   emit_jmp(0);
3434   // 1
3435   set_jump_target(case1,(int)out);
3436   if (opcode[i]==0x2A) { // SWL
3437     // Write 3 msb into three least significant bytes
3438     if(rs2[i]) emit_rorimm(tl,8,tl);
3439     emit_writehword_indexed(tl,-1,temp);
3440     if(rs2[i]) emit_rorimm(tl,16,tl);
3441     emit_writebyte_indexed(tl,1,temp);
3442     if(rs2[i]) emit_rorimm(tl,8,tl);
3443   }
3444   if (opcode[i]==0x2E) { // SWR
3445     // Write two lsb into two most significant bytes
3446     emit_writehword_indexed(tl,1,temp);
3447   }
3448   if (opcode[i]==0x2C) { // SDL
3449     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3450     // Write 3 msb into three least significant bytes
3451     if(rs2[i]) emit_rorimm(th,8,th);
3452     emit_writehword_indexed(th,-1,temp);
3453     if(rs2[i]) emit_rorimm(th,16,th);
3454     emit_writebyte_indexed(th,1,temp);
3455     if(rs2[i]) emit_rorimm(th,8,th);
3456   }
3457   if (opcode[i]==0x2D) { // SDR
3458     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3459     // Write two lsb into two most significant bytes
3460     emit_writehword_indexed(tl,1,temp);
3461   }
3462   done1=(int)out;
3463   emit_jmp(0);
3464   // 2
3465   set_jump_target(case2,(int)out);
3466   emit_testimm(temp,1);
3467   case3=(int)out;
3468   emit_jne(0);
3469   if (opcode[i]==0x2A) { // SWL
3470     // Write two msb into two least significant bytes
3471     if(rs2[i]) emit_rorimm(tl,16,tl);
3472     emit_writehword_indexed(tl,-2,temp);
3473     if(rs2[i]) emit_rorimm(tl,16,tl);
3474   }
3475   if (opcode[i]==0x2E) { // SWR
3476     // Write 3 lsb into three most significant bytes
3477     emit_writebyte_indexed(tl,-1,temp);
3478     if(rs2[i]) emit_rorimm(tl,8,tl);
3479     emit_writehword_indexed(tl,0,temp);
3480     if(rs2[i]) emit_rorimm(tl,24,tl);
3481   }
3482   if (opcode[i]==0x2C) { // SDL
3483     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3484     // Write two msb into two least significant bytes
3485     if(rs2[i]) emit_rorimm(th,16,th);
3486     emit_writehword_indexed(th,-2,temp);
3487     if(rs2[i]) emit_rorimm(th,16,th);
3488   }
3489   if (opcode[i]==0x2D) { // SDR
3490     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3491     // Write 3 lsb into three most significant bytes
3492     emit_writebyte_indexed(tl,-1,temp);
3493     if(rs2[i]) emit_rorimm(tl,8,tl);
3494     emit_writehword_indexed(tl,0,temp);
3495     if(rs2[i]) emit_rorimm(tl,24,tl);
3496   }
3497   done2=(int)out;
3498   emit_jmp(0);
3499   // 3
3500   set_jump_target(case3,(int)out);
3501   if (opcode[i]==0x2A) { // SWL
3502     // Write msb into least significant byte
3503     if(rs2[i]) emit_rorimm(tl,24,tl);
3504     emit_writebyte_indexed(tl,-3,temp);
3505     if(rs2[i]) emit_rorimm(tl,8,tl);
3506   }
3507   if (opcode[i]==0x2E) { // SWR
3508     // Write entire word
3509     emit_writeword_indexed(tl,-3,temp);
3510   }
3511   if (opcode[i]==0x2C) { // SDL
3512     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3513     // Write msb into least significant byte
3514     if(rs2[i]) emit_rorimm(th,24,th);
3515     emit_writebyte_indexed(th,-3,temp);
3516     if(rs2[i]) emit_rorimm(th,8,th);
3517   }
3518   if (opcode[i]==0x2D) { // SDR
3519     if(rs2[i]) emit_mov(th,temp2);
3520     // Write entire word
3521     emit_writeword_indexed(tl,-3,temp);
3522   }
3523   set_jump_target(done0,(int)out);
3524   set_jump_target(done1,(int)out);
3525   set_jump_target(done2,(int)out);
3526   if (opcode[i]==0x2C) { // SDL
3527     emit_testimm(temp,4);
3528     done0=(int)out;
3529     emit_jne(0);
3530     emit_andimm(temp,~3,temp);
3531     emit_writeword_indexed(temp2,4,temp);
3532     set_jump_target(done0,(int)out);
3533   }
3534   if (opcode[i]==0x2D) { // SDR
3535     emit_testimm(temp,4);
3536     done0=(int)out;
3537     emit_jeq(0);
3538     emit_andimm(temp,~3,temp);
3539     emit_writeword_indexed(temp2,-4,temp);
3540     set_jump_target(done0,(int)out);
3541   }
3542   if(!c||!memtarget)
3543     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3544   if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3545     #ifdef RAM_OFFSET
3546     int map=get_reg(i_regs->regmap,ROREG);
3547     if(map<0) map=HOST_TEMPREG;
3548     gen_orig_addr_w(temp,map);
3549     #else
3550     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3551     #endif
3552     #if defined(HOST_IMM8)
3553     int ir=get_reg(i_regs->regmap,INVCP);
3554     assert(ir>=0);
3555     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3556     #else
3557     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3558     #endif
3559     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3560     emit_callne(invalidate_addr_reg[temp]);
3561     #else
3562     jaddr2=(int)out;
3563     emit_jne(0);
3564     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3565     #endif
3566   }
3567   /*
3568     emit_pusha();
3569     //save_regs(0x100f);
3570         emit_readword((int)&last_count,ECX);
3571         if(get_reg(i_regs->regmap,CCREG)<0)
3572           emit_loadreg(CCREG,HOST_CCREG);
3573         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3574         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3575         emit_writeword(HOST_CCREG,(int)&Count);
3576     emit_call((int)memdebug);
3577     emit_popa();
3578     //restore_regs(0x100f);
3579   /**/
3580 }
3581
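// Assemble COP1 loads/stores (LWC1/LDC1/SWC1/SDC1).  The COP1-usable bit
// in the status register is tested first and an FP_STUB raises the
// coprocessor-unusable exception if it is clear; then the FPR pointer is
// fetched from reg_cop1_simple/reg_cop1_double and the memory access is
// range-checked and stubbed like an ordinary load/store.  When COP1
// support is compiled out (DISABLE_COP1) this falls back to cop1_unusable().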
3582 void c1ls_assemble(int i,struct regstat *i_regs)
3583 {
3584 #ifndef DISABLE_COP1
3585   int s,th,tl;
3586   int temp,ar;
3587   int map=-1;
3588   int offset;
3589   int c=0;
3590   int jaddr,jaddr2=0,jaddr3,type;
3591   int agr=AGEN1+(i&1);
3592   u_int hr,reglist=0;
3593   th=get_reg(i_regs->regmap,FTEMP|64);
3594   tl=get_reg(i_regs->regmap,FTEMP);
3595   s=get_reg(i_regs->regmap,rs1[i]);
3596   temp=get_reg(i_regs->regmap,agr);
3597   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3598   offset=imm[i];
3599   assert(tl>=0);
3600   assert(rs1[i]>0);
3601   assert(temp>=0);
3602   for(hr=0;hr<HOST_REGS;hr++) {
3603     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3604   }
3605   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3606   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3607   {
3608     // Loads use a temporary register which we need to save
3609     reglist|=1<<temp;
3610   }
3611   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3612     ar=temp;
3613   else // LWC1/LDC1
3614     ar=tl;
3615   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3616   //else c=(i_regs->wasconst>>s)&1;
3617   if(s>=0) c=(i_regs->wasconst>>s)&1;
3618   // Check cop1 unusable
3619   if(!cop1_usable) {
3620     signed char rs=get_reg(i_regs->regmap,CSREG);
3621     assert(rs>=0);
3622     emit_testimm(rs,0x20000000);
3623     jaddr=(int)out;
3624     emit_jeq(0);
3625     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3626     cop1_usable=1;
3627   }
3628   if (opcode[i]==0x39) { // SWC1 (get float address)
3629     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3630   }
3631   if (opcode[i]==0x3D) { // SDC1 (get double address)
3632     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3633   }
3634   // Generate address + offset
3635   if(!using_tlb) {
3636     if(!c)
3637       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3638   }
3639   else
3640   {
3641     map=get_reg(i_regs->regmap,TLREG);
3642     assert(map>=0);
3643     reglist&=~(1<<map);
3644     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3645       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3646     }
3647     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3648       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3649     }
3650   }
3651   if (opcode[i]==0x39) { // SWC1 (read float)
3652     emit_readword_indexed(0,tl,tl);
3653   }
3654   if (opcode[i]==0x3D) { // SDC1 (read double)
3655     emit_readword_indexed(4,tl,th);
3656     emit_readword_indexed(0,tl,tl);
3657   }
3658   if (opcode[i]==0x31) { // LWC1 (get target address)
3659     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3660   }
3661   if (opcode[i]==0x35) { // LDC1 (get target address)
3662     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3663   }
3664   if(!using_tlb) {
3665     if(!c) {
3666       jaddr2=(int)out;
3667       emit_jno(0);
3668     }
3669     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80000000+RAM_SIZE) {
3670       jaddr2=(int)out;
3671       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3672     }
3673     #ifdef DESTRUCTIVE_SHIFT
3674     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3675       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3676     }
3677     #endif
3678   }else{
3679     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3680       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3681     }
3682     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3683       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3684     }
3685   }
3686   if (opcode[i]==0x31) { // LWC1
3687     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3688     //gen_tlb_addr_r(ar,map);
3689     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3690     #ifdef HOST_IMM_ADDR32
3691     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3692     else
3693     #endif
3694     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3695     type=LOADW_STUB;
3696   }
3697   if (opcode[i]==0x35) { // LDC1
3698     assert(th>=0);
3699     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3700     //gen_tlb_addr_r(ar,map);
3701     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3702     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3703     #ifdef HOST_IMM_ADDR32
3704     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3705     else
3706     #endif
3707     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3708     type=LOADD_STUB;
3709   }
3710   if (opcode[i]==0x39) { // SWC1
3711     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3712     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3713     type=STOREW_STUB;
3714   }
3715   if (opcode[i]==0x3D) { // SDC1
3716     assert(th>=0);
3717     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3718     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3719     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3720     type=STORED_STUB;
3721   }
3722   if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3723     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3724       #ifndef DESTRUCTIVE_SHIFT
3725       temp=offset||c||s<0?ar:s;
3726       #endif
3727       #if defined(HOST_IMM8)
3728       int ir=get_reg(i_regs->regmap,INVCP);
3729       assert(ir>=0);
3730       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3731       #else
3732       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3733       #endif
3734       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3735       emit_callne(invalidate_addr_reg[temp]);
3736       #else
3737       jaddr3=(int)out;
3738       emit_jne(0);
3739       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3740       #endif
3741     }
3742   }
3743   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3744   if (opcode[i]==0x31) { // LWC1 (write float)
3745     emit_writeword_indexed(tl,0,temp);
3746   }
3747   if (opcode[i]==0x35) { // LDC1 (write double)
3748     emit_writeword_indexed(th,4,temp);
3749     emit_writeword_indexed(tl,0,temp);
3750   }
3751   //if(opcode[i]==0x39)
3752   /*if(opcode[i]==0x39||opcode[i]==0x31)
3753   {
3754     emit_pusha();
3755         emit_readword((int)&last_count,ECX);
3756         if(get_reg(i_regs->regmap,CCREG)<0)
3757           emit_loadreg(CCREG,HOST_CCREG);
3758         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3759         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3760         emit_writeword(HOST_CCREG,(int)&Count);
3761     emit_call((int)memdebug);
3762     emit_popa();
3763   }/**/
3764 #else
3765   cop1_unusable(i, i_regs);
3766 #endif
3767 }
3768
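// Assemble GTE (COP2) loads/stores for the PSX.  LWC2 reads a word from
// memory and stores it into a GTE data register via cop2_put_dreg; SWC2
// fetches the register with cop2_get_dreg and writes it to memory.  Only
// word accesses exist, so one LOADW/STOREW stub covers the slow path, and
// SWC2 gets the usual invalid_code (self-modifying code) check.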
3769 void c2ls_assemble(int i,struct regstat *i_regs)
3770 {
3771   int s,tl;
3772   int ar;
3773   int offset;
3774   int memtarget=0,c=0;
3775   int jaddr2=0,jaddr3,type;
3776   int agr=AGEN1+(i&1);
3777   int fastio_reg_override=0;
3778   u_int hr,reglist=0;
3779   u_int copr=(source[i]>>16)&0x1f;
3780   s=get_reg(i_regs->regmap,rs1[i]);
3781   tl=get_reg(i_regs->regmap,FTEMP);
3782   offset=imm[i];
3783   assert(rs1[i]>0);
3784   assert(tl>=0);
3785   assert(!using_tlb);
3786
3787   for(hr=0;hr<HOST_REGS;hr++) {
3788     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3789   }
3790   if(i_regs->regmap[HOST_CCREG]==CCREG)
3791     reglist&=~(1<<HOST_CCREG);
3792
3793   // get the address
3794   if (opcode[i]==0x3a) { // SWC2
3795     ar=get_reg(i_regs->regmap,agr);
3796     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3797     reglist|=1<<ar;
3798   } else { // LWC2
3799     ar=tl;
3800   }
3801   if(s>=0) c=(i_regs->wasconst>>s)&1;
3802   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3803   if (!offset&&!c&&s>=0) ar=s;
3804   assert(ar>=0);
3805
3806   if (opcode[i]==0x3a) { // SWC2
3807     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3808     type=STOREW_STUB;
3809   }
3810   else
3811     type=LOADW_STUB;
3812
3813   if(c&&!memtarget) {
3814     jaddr2=(int)out;
3815     emit_jmp(0); // inline_readstub/inline_writestub?
3816   }
3817   else {
3818     if(!c) {
3819       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3820     }
3821     else if(ram_offset&&memtarget) {
3822       emit_addimm(ar,ram_offset,HOST_TEMPREG);
3823       fastio_reg_override=HOST_TEMPREG;
3824     }
3825     if (opcode[i]==0x32) { // LWC2
3826       #ifdef HOST_IMM_ADDR32
3827       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3828       else
3829       #endif
3830       int a=ar;
3831       if(fastio_reg_override) a=fastio_reg_override;
3832       emit_readword_indexed(0,a,tl);
3833     }
3834     if (opcode[i]==0x3a) { // SWC2
3835       #ifdef DESTRUCTIVE_SHIFT
3836       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3837       #endif
3838       int a=ar;
3839       if(fastio_reg_override) a=fastio_reg_override;
3840       emit_writeword_indexed(tl,0,a);
3841     }
3842   }
3843   if(jaddr2)
3844     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3845   if(opcode[i]==0x3a) // SWC2
3846   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3847 #if defined(HOST_IMM8)
3848     int ir=get_reg(i_regs->regmap,INVCP);
3849     assert(ir>=0);
3850     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3851 #else
3852     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3853 #endif
3854     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3855     emit_callne(invalidate_addr_reg[ar]);
3856     #else
3857     jaddr3=(int)out;
3858     emit_jne(0);
3859     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3860     #endif
3861   }
3862   if (opcode[i]==0x32) { // LWC2
3863     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3864   }
3865 }
3866
3867 #ifndef multdiv_assemble
3868 void multdiv_assemble(int i,struct regstat *i_regs)
3869 {
3870   printf("Need multdiv_assemble for this architecture.\n");
3871   exit(1);
3872 }
3873 #endif
3874
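// MFHI/MFLO/MTHI/MTLO are implemented as plain register moves: copy the
// (possibly 64-bit) source into the destination, falling back to
// emit_loadreg when the source is not currently held in a host register.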
3875 void mov_assemble(int i,struct regstat *i_regs)
3876 {
3877   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3878   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3879   if(rt1[i]) {
3880     signed char sh,sl,th,tl;
3881     th=get_reg(i_regs->regmap,rt1[i]|64);
3882     tl=get_reg(i_regs->regmap,rt1[i]);
3883     //assert(tl>=0);
3884     if(tl>=0) {
3885       sh=get_reg(i_regs->regmap,rs1[i]|64);
3886       sl=get_reg(i_regs->regmap,rs1[i]);
3887       if(sl>=0) emit_mov(sl,tl);
3888       else emit_loadreg(rs1[i],tl);
3889       if(th>=0) {
3890         if(sh>=0) emit_mov(sh,th);
3891         else emit_loadreg(rs1[i]|64,th);
3892       }
3893     }
3894   }
3895 }
3896
3897 #ifndef fconv_assemble
3898 void fconv_assemble(int i,struct regstat *i_regs)
3899 {
3900   printf("Need fconv_assemble for this architecture.\n");
3901   exit(1);
3902 }
3903 #endif
3904
3905 #if 0
3906 void float_assemble(int i,struct regstat *i_regs)
3907 {
3908   printf("Need float_assemble for this architecture.\n");
3909   exit(1);
3910 }
3911 #endif
3912
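// SYSCALL, HLE calls and INTCALL all leave the block the same way: load
// the relevant PC into a host register, add the cycles consumed so far to
// HOST_CCREG, and jump to the matching handler in the emulator core
// (jump_syscall_hle, jump_hlecall, jump_intcall).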
3913 void syscall_assemble(int i,struct regstat *i_regs)
3914 {
3915   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3916   assert(ccreg==HOST_CCREG);
3917   assert(!is_delayslot);
3918   emit_movimm(start+i*4,EAX); // Get PC
3919   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3920   emit_jmp((int)jump_syscall_hle); // XXX
3921 }
3922
3923 void hlecall_assemble(int i,struct regstat *i_regs)
3924 {
3925   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3926   assert(ccreg==HOST_CCREG);
3927   assert(!is_delayslot);
3928   emit_movimm(start+i*4+4,0); // Get PC
3929   emit_movimm((int)psxHLEt[source[i]&7],1);
3930   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3931   emit_jmp((int)jump_hlecall);
3932 }
3933
3934 void intcall_assemble(int i,struct regstat *i_regs)
3935 {
3936   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3937   assert(ccreg==HOST_CCREG);
3938   assert(!is_delayslot);
3939   emit_movimm(start+i*4,0); // Get PC
3940   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3941   emit_jmp((int)jump_intcall);
3942 }
3943
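// Assemble the instruction in a branch delay slot.  The dispatch is the
// same as in the main assembler loop, but is_delayslot is set so helpers
// that care (e.g. the FP exception stub) know where they were emitted.
// A jump type here means the delay slot itself contains a branch, which
// the recompiler does not support (hence the warning below).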
3944 void ds_assemble(int i,struct regstat *i_regs)
3945 {
3946   speculate_register_values(i);
3947   is_delayslot=1;
3948   switch(itype[i]) {
3949     case ALU:
3950       alu_assemble(i,i_regs);break;
3951     case IMM16:
3952       imm16_assemble(i,i_regs);break;
3953     case SHIFT:
3954       shift_assemble(i,i_regs);break;
3955     case SHIFTIMM:
3956       shiftimm_assemble(i,i_regs);break;
3957     case LOAD:
3958       load_assemble(i,i_regs);break;
3959     case LOADLR:
3960       loadlr_assemble(i,i_regs);break;
3961     case STORE:
3962       store_assemble(i,i_regs);break;
3963     case STORELR:
3964       storelr_assemble(i,i_regs);break;
3965     case COP0:
3966       cop0_assemble(i,i_regs);break;
3967     case COP1:
3968       cop1_assemble(i,i_regs);break;
3969     case C1LS:
3970       c1ls_assemble(i,i_regs);break;
3971     case COP2:
3972       cop2_assemble(i,i_regs);break;
3973     case C2LS:
3974       c2ls_assemble(i,i_regs);break;
3975     case C2OP:
3976       c2op_assemble(i,i_regs);break;
3977     case FCONV:
3978       fconv_assemble(i,i_regs);break;
3979     case FLOAT:
3980       float_assemble(i,i_regs);break;
3981     case FCOMP:
3982       fcomp_assemble(i,i_regs);break;
3983     case MULTDIV:
3984       multdiv_assemble(i,i_regs);break;
3985     case MOV:
3986       mov_assemble(i,i_regs);break;
3987     case SYSCALL:
3988     case HLECALL:
3989     case INTCALL:
3990     case SPAN:
3991     case UJUMP:
3992     case RJUMP:
3993     case CJUMP:
3994     case SJUMP:
3995     case FJUMP:
3996       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
3997   }
3998   is_delayslot=0;
3999 }
4000
4001 // Is the branch target a valid internal jump?
4002 int internal_branch(uint64_t i_is32,int addr)
4003 {
4004   if(addr&1) return 0; // Indirect (register) jump
4005   if(addr>=start && addr<start+slen*4-4)
4006   {
4007     int t=(addr-start)>>2;
4008     // Delay slots are not valid branch targets
4009     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4010     // 64 -> 32 bit transition requires a recompile
4011     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
4012     {
4013       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
4014       else printf("optimizable: yes\n");
4015     }*/
4016     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4017 #ifndef FORCE32
4018     if(requires_32bit[t]&~i_is32) return 0;
4019     else
4020 #endif
4021       return 1;
4022   }
4023   return 0;
4024 }
4025
4026 #ifndef wb_invalidate
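// Write back host registers whose mapping is about to change: any dirty
// register that is not carried over into the new allocation is stored to
// the register file (with sign-extension handling for 32-bit values), and
// registers that merely move to a different host register are copied
// without a writeback.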
4027 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
4028   uint64_t u,uint64_t uu)
4029 {
4030   int hr;
4031   for(hr=0;hr<HOST_REGS;hr++) {
4032     if(hr!=EXCLUDE_REG) {
4033       if(pre[hr]!=entry[hr]) {
4034         if(pre[hr]>=0) {
4035           if((dirty>>hr)&1) {
4036             if(get_reg(entry,pre[hr])<0) {
4037               if(pre[hr]<64) {
4038                 if(!((u>>pre[hr])&1)) {
4039                   emit_storereg(pre[hr],hr);
4040                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
4041                     emit_sarimm(hr,31,hr);
4042                     emit_storereg(pre[hr]|64,hr);
4043                   }
4044                 }
4045               }else{
4046                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
4047                   emit_storereg(pre[hr],hr);
4048                 }
4049               }
4050             }
4051           }
4052         }
4053       }
4054     }
4055   }
4056   // Move from one register to another (no writeback)
4057   for(hr=0;hr<HOST_REGS;hr++) {
4058     if(hr!=EXCLUDE_REG) {
4059       if(pre[hr]!=entry[hr]) {
4060         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
4061           int nr;
4062           if((nr=get_reg(entry,pre[hr]))>=0) {
4063             emit_mov(hr,nr);
4064           }
4065         }
4066       }
4067     }
4068   }
4069 }
4070 #endif
4071
4072 // Load the specified registers
4073 // This only loads the registers given as arguments because
4074 // we don't want to load things that will be overwritten
4075 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
4076 {
4077   int hr;
4078   // Load 32-bit regs
4079   for(hr=0;hr<HOST_REGS;hr++) {
4080     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4081       if(entry[hr]!=regmap[hr]) {
4082         if(regmap[hr]==rs1||regmap[hr]==rs2)
4083         {
4084           if(regmap[hr]==0) {
4085             emit_zeroreg(hr);
4086           }
4087           else
4088           {
4089             emit_loadreg(regmap[hr],hr);
4090           }
4091         }
4092       }
4093     }
4094   }
4095   // Load 64-bit regs
4096   for(hr=0;hr<HOST_REGS;hr++) {
4097     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4098       if(entry[hr]!=regmap[hr]) {
4099         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
4100         {
4101           assert(regmap[hr]!=64);
4102           if((is32>>(regmap[hr]&63))&1) {
4103             int lr=get_reg(regmap,regmap[hr]-64);
4104             if(lr>=0)
4105               emit_sarimm(lr,31,hr);
4106             else
4107               emit_loadreg(regmap[hr],hr);
4108           }
4109           else
4110           {
4111             emit_loadreg(regmap[hr],hr);
4112           }
4113         }
4114       }
4115     }
4116   }
4117 }
4118
4119 // Load registers prior to the start of a loop
4120 // so that they are not loaded within the loop
4121 static void loop_preload(signed char pre[],signed char entry[])
4122 {
4123   int hr;
4124   for(hr=0;hr<HOST_REGS;hr++) {
4125     if(hr!=EXCLUDE_REG) {
4126       if(pre[hr]!=entry[hr]) {
4127         if(entry[hr]>=0) {
4128           if(get_reg(pre,entry[hr])<0) {
4129             assem_debug("loop preload:\n");
4130             //printf("loop preload: %d\n",hr);
4131             if(entry[hr]==0) {
4132               emit_zeroreg(hr);
4133             }
4134             else if(entry[hr]<TEMPREG)
4135             {
4136               emit_loadreg(entry[hr],hr);
4137             }
4138             else if(entry[hr]-64<TEMPREG)
4139             {
4140               emit_loadreg(entry[hr],hr);
4141             }
4142           }
4143         }
4144       }
4145     }
4146   }
4147 }
4148
4149 // Generate address for load/store instruction
4150 // The address goes to AGEN for stores, and to FTEMP for LOADLR and cop1/cop2 loads
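// Two AGEN (and MGEN) temporaries, selected by (i&1), alternate between
// consecutive instructions so the address for instruction i+1 can be
// precomputed below without clobbering the one generated for i.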
4151 void address_generation(int i,struct regstat *i_regs,signed char entry[])
4152 {
4153   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
4154     int ra=-1;
4155     int agr=AGEN1+(i&1);
4156     int mgr=MGEN1+(i&1);
4157     if(itype[i]==LOAD) {
4158       ra=get_reg(i_regs->regmap,rt1[i]);
4159       if(ra<0) ra=get_reg(i_regs->regmap,-1); 
4160       assert(ra>=0);
4161     }
4162     if(itype[i]==LOADLR) {
4163       ra=get_reg(i_regs->regmap,FTEMP);
4164     }
4165     if(itype[i]==STORE||itype[i]==STORELR) {
4166       ra=get_reg(i_regs->regmap,agr);
4167       if(ra<0) ra=get_reg(i_regs->regmap,-1);
4168     }
4169     if(itype[i]==C1LS||itype[i]==C2LS) {
4170       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
4171         ra=get_reg(i_regs->regmap,FTEMP);
4172       else { // SWC1/SDC1/SWC2/SDC2
4173         ra=get_reg(i_regs->regmap,agr);
4174         if(ra<0) ra=get_reg(i_regs->regmap,-1);
4175       }
4176     }
4177     int rs=get_reg(i_regs->regmap,rs1[i]);
4178     int rm=get_reg(i_regs->regmap,TLREG);
4179     if(ra>=0) {
4180       int offset=imm[i];
4181       int c=(i_regs->wasconst>>rs)&1;
4182       if(rs1[i]==0) {
4183         // Using r0 as a base address
4184         /*if(rm>=0) {
4185           if(!entry||entry[rm]!=mgr) {
4186             generate_map_const(offset,rm);
4187           } // else did it in the previous cycle
4188         }*/
4189         if(!entry||entry[ra]!=agr) {
4190           if (opcode[i]==0x22||opcode[i]==0x26) {
4191             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4192           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4193             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4194           }else{
4195             emit_movimm(offset,ra);
4196           }
4197         } // else did it in the previous cycle
4198       }
4199       else if(rs<0) {
4200         if(!entry||entry[ra]!=rs1[i])
4201           emit_loadreg(rs1[i],ra);
4202         //if(!entry||entry[ra]!=rs1[i])
4203         //  printf("poor load scheduling!\n");
4204       }
4205       else if(c) {
4206 #ifndef DISABLE_TLB
4207         if(rm>=0) {
4208           if(!entry||entry[rm]!=mgr) {
4209             if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a) {
4210               // Stores to memory go thru the mapper to detect self-modifying
4211               // code, loads don't.
4212               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
4213                  (unsigned int)(constmap[i][rs]+offset)<0x80000000+RAM_SIZE )
4214                 generate_map_const(constmap[i][rs]+offset,rm);
4215             }else{
4216               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
4217                 generate_map_const(constmap[i][rs]+offset,rm);
4218             }
4219           }
4220         }
4221 #endif
4222         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
4223           if(!entry||entry[ra]!=agr) {
4224             if (opcode[i]==0x22||opcode[i]==0x26) {
4225               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4226             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4227               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4228             }else{
4229               #ifdef HOST_IMM_ADDR32
4230               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4231                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
4232               #endif
4233               emit_movimm(constmap[i][rs]+offset,ra);
4234               regs[i].loadedconst|=1<<ra;
4235             }
4236           } // else did it in the previous cycle
4237         } // else load_consts already did it
4238       }
4239       if(offset&&!c&&rs1[i]) {
4240         if(rs>=0) {
4241           emit_addimm(rs,offset,ra);
4242         }else{
4243           emit_addimm(ra,offset,ra);
4244         }
4245       }
4246     }
4247   }
4248   // Preload constants for next instruction
4249   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
4250     int agr,ra;
4251     #if !defined(HOST_IMM_ADDR32) && !defined(DISABLE_TLB)
4252     // Mapper entry
4253     agr=MGEN1+((i+1)&1);
4254     ra=get_reg(i_regs->regmap,agr);
4255     if(ra>=0) {
4256       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4257       int offset=imm[i+1];
4258       int c=(regs[i+1].wasconst>>rs)&1;
4259       if(c) {
4260         if(itype[i+1]==STORE||itype[i+1]==STORELR
4261            ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1, SWC2/SDC2
4262           // Stores to memory go thru the mapper to detect self-modifying
4263           // code, loads don't.
4264           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
4265              (unsigned int)(constmap[i+1][rs]+offset)<0x80000000+RAM_SIZE )
4266             generate_map_const(constmap[i+1][rs]+offset,ra);
4267         }else{
4268           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
4269             generate_map_const(constmap[i+1][rs]+offset,ra);
4270         }
4271       }
4272       /*else if(rs1[i]==0) {
4273         generate_map_const(offset,ra);
4274       }*/
4275     }
4276     #endif
4277     // Actual address
4278     agr=AGEN1+((i+1)&1);
4279     ra=get_reg(i_regs->regmap,agr);
4280     if(ra>=0) {
4281       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4282       int offset=imm[i+1];
4283       int c=(regs[i+1].wasconst>>rs)&1;
4284       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
4285         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4286           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4287         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4288           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4289         }else{
4290           #ifdef HOST_IMM_ADDR32
4291           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4292              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
4293           #endif
4294           emit_movimm(constmap[i+1][rs]+offset,ra);
4295           regs[i+1].loadedconst|=1<<ra;
4296         }
4297       }
4298       else if(rs1[i+1]==0) {
4299         // Using r0 as a base address
4300         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4301           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4302         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4303           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4304         }else{
4305           emit_movimm(offset,ra);
4306         }
4307       }
4308     }
4309   }
4310 }
4311
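// For a host register that currently holds a constant, scan forward while
// subsequent instructions keep the same register mapped to the same
// (still constant) value and return the final value it needs to hold, so
// one immediate load can serve the whole run.  If the constant is only
// used as the base of an immediately following load, the precomputed load
// address is returned instead.  Returns 0 when the value turns out to be
// unneeded.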
4312 int get_final_value(int hr, int i, int *value)
4313 {
4314   int reg=regs[i].regmap[hr];
4315   while(i<slen-1) {
4316     if(regs[i+1].regmap[hr]!=reg) break;
4317     if(!((regs[i+1].isconst>>hr)&1)) break;
4318     if(bt[i+1]) break;
4319     i++;
4320   }
4321   if(i<slen-1) {
4322     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
4323       *value=constmap[i][hr];
4324       return 1;
4325     }
4326     if(!bt[i+1]) {
4327       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
4328         // Load in delay slot, out-of-order execution
4329         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
4330         {
4331           #ifdef HOST_IMM_ADDR32
4332           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
4333           #endif
4334           // Precompute load address
4335           *value=constmap[i][hr]+imm[i+2];
4336           return 1;
4337         }
4338       }
4339       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
4340       {
4341         #ifdef HOST_IMM_ADDR32
4342         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
4343         #endif
4344         // Precompute load address
4345         *value=constmap[i][hr]+imm[i+1];
4346         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
4347         return 1;
4348       }
4349     }
4350   }
4351   *value=constmap[i][hr];
4352   //printf("c=%x\n",(int)constmap[i][hr]);
4353   if(i==slen-1) return 1;
4354   if(reg<64) {
4355     return !((unneeded_reg[i+1]>>reg)&1);
4356   }else{
4357     return !((unneeded_reg_upper[i+1]>>reg)&1);
4358   }
4359 }
4360
4361 // Load registers with known constants
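// Constants already sitting in the right host register (tracked by the
// loadedconst bits propagated below) are not reloaded, and when another
// register holds a similar value the new constant is derived from it with
// emit_movimm_from rather than materialized from scratch.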
4362 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
4363 {
4364   int hr,hr2;
4365   // propagate loaded constant flags
4366   if(i==0||bt[i])
4367     regs[i].loadedconst=0;
4368   else {
4369     for(hr=0;hr<HOST_REGS;hr++) {
4370       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
4371          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
4372       {
4373         regs[i].loadedconst|=1<<hr;
4374       }
4375     }
4376   }
4377   // Load 32-bit regs
4378   for(hr=0;hr<HOST_REGS;hr++) {
4379     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4380       //if(entry[hr]!=regmap[hr]) {
4381       if(!((regs[i].loadedconst>>hr)&1)) {
4382         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4383           int value,similar=0;
4384           if(get_final_value(hr,i,&value)) {
4385             // see if some other register has similar value
4386             for(hr2=0;hr2<HOST_REGS;hr2++) {
4387               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
4388                 if(is_similar_value(value,constmap[i][hr2])) {
4389                   similar=1;
4390                   break;
4391                 }
4392               }
4393             }
4394             if(similar) {
4395               int value2;
4396               if(get_final_value(hr2,i,&value2)) // is this needed?
4397                 emit_movimm_from(value2,hr2,value,hr);
4398               else
4399                 emit_movimm(value,hr);
4400             }
4401             else if(value==0) {
4402               emit_zeroreg(hr);
4403             }
4404             else {
4405               emit_movimm(value,hr);
4406             }
4407           }
4408           regs[i].loadedconst|=1<<hr;
4409         }
4410       }
4411     }
4412   }
4413   // Load 64-bit regs
4414   for(hr=0;hr<HOST_REGS;hr++) {
4415     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4416       //if(entry[hr]!=regmap[hr]) {
4417       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4418         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4419           if((is32>>(regmap[hr]&63))&1) {
4420             int lr=get_reg(regmap,regmap[hr]-64);
4421             assert(lr>=0);
4422             emit_sarimm(lr,31,hr);
4423           }
4424           else
4425           {
4426             int value;
4427             if(get_final_value(hr,i,&value)) {
4428               if(value==0) {
4429                 emit_zeroreg(hr);
4430               }
4431               else {
4432                 emit_movimm(value,hr);
4433               }
4434             }
4435           }
4436         }
4437       }
4438     }
4439   }
4440 }
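// Like load_consts, but reloads every dirty constant register for
// instruction i without any loadedconst tracking.  Used on exceptional
// paths such as the self-modifying-code interrupt sequence earlier in
// this file.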
4441 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4442 {
4443   int hr;
4444   // Load 32-bit regs
4445   for(hr=0;hr<HOST_REGS;hr++) {
4446     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4447       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4448         int value=constmap[i][hr];
4449         if(value==0) {
4450           emit_zeroreg(hr);
4451         }
4452         else {
4453           emit_movimm(value,hr);
4454         }
4455       }
4456     }
4457   }
4458   // Load 64-bit regs
4459   for(hr=0;hr<HOST_REGS;hr++) {
4460     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4461       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4462         if((is32>>(regmap[hr]&63))&1) {
4463           int lr=get_reg(regmap,regmap[hr]-64);
4464           assert(lr>=0);
4465           emit_sarimm(lr,31,hr);
4466         }
4467         else
4468         {
4469           int value=constmap[i][hr];
4470           if(value==0) {
4471             emit_zeroreg(hr);
4472           }
4473           else {
4474             emit_movimm(value,hr);
4475           }
4476         }
4477       }
4478     }
4479   }
4480 }
4481
4482 // Write out all dirty registers (except cycle count)
4483 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4484 {
4485   int hr;
4486   for(hr=0;hr<HOST_REGS;hr++) {
4487     if(hr!=EXCLUDE_REG) {
4488       if(i_regmap[hr]>0) {
4489         if(i_regmap[hr]!=CCREG) {
4490           if((i_dirty>>hr)&1) {
4491             if(i_regmap[hr]<64) {
4492               emit_storereg(i_regmap[hr],hr);
4493 #ifndef FORCE32
4494               if( ((i_is32>>i_regmap[hr])&1) ) {
4495                 #ifdef DESTRUCTIVE_WRITEBACK
4496                 emit_sarimm(hr,31,hr);
4497                 emit_storereg(i_regmap[hr]|64,hr);
4498                 #else
4499                 emit_sarimm(hr,31,HOST_TEMPREG);
4500                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4501                 #endif
4502               }
4503 #endif
4504             }else{
4505               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4506                 emit_storereg(i_regmap[hr],hr);
4507               }
4508             }
4509           }
4510         }
4511       }
4512     }
4513   }
4514 }
4515 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4516 // This writes the registers not written by store_regs_bt
4517 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4518 {
4519   int hr;
4520   int t=(addr-start)>>2;
4521   for(hr=0;hr<HOST_REGS;hr++) {
4522     if(hr!=EXCLUDE_REG) {
4523       if(i_regmap[hr]>0) {
4524         if(i_regmap[hr]!=CCREG) {
4525           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4526             if((i_dirty>>hr)&1) {
4527               if(i_regmap[hr]<64) {
4528                 emit_storereg(i_regmap[hr],hr);
4529 #ifndef FORCE32
4530                 if( ((i_is32>>i_regmap[hr])&1) ) {
4531                   #ifdef DESTRUCTIVE_WRITEBACK
4532                   emit_sarimm(hr,31,hr);
4533                   emit_storereg(i_regmap[hr]|64,hr);
4534                   #else
4535                   emit_sarimm(hr,31,HOST_TEMPREG);
4536                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4537                   #endif
4538                 }
4539 #endif
4540               }else{
4541                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4542                   emit_storereg(i_regmap[hr],hr);
4543                 }
4544               }
4545             }
4546           }
4547         }
4548       }
4549     }
4550   }
4551 }
4552
4553 // Load all registers (except cycle count)
4554 void load_all_regs(signed char i_regmap[])
4555 {
4556   int hr;
4557   for(hr=0;hr<HOST_REGS;hr++) {
4558     if(hr!=EXCLUDE_REG) {
4559       if(i_regmap[hr]==0) {
4560         emit_zeroreg(hr);
4561       }
4562       else
4563       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4564       {
4565         emit_loadreg(i_regmap[hr],hr);
4566       }
4567     }
4568   }
4569 }
4570
4571 // Load all current registers also needed by next instruction
4572 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4573 {
4574   int hr;
4575   for(hr=0;hr<HOST_REGS;hr++) {
4576     if(hr!=EXCLUDE_REG) {
4577       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4578         if(i_regmap[hr]==0) {
4579           emit_zeroreg(hr);
4580         }
4581         else
4582         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4583         {
4584           emit_loadreg(i_regmap[hr],hr);
4585         }
4586       }
4587     }
4588   }
4589 }
4590
4591 // Load all regs, storing cycle count if necessary
4592 void load_regs_entry(int t)
4593 {
4594   int hr;
4595   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4596   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4597   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4598     emit_storereg(CCREG,HOST_CCREG);
4599   }
4600   // Load 32-bit regs
4601   for(hr=0;hr<HOST_REGS;hr++) {
4602     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4603       if(regs[t].regmap_entry[hr]==0) {
4604         emit_zeroreg(hr);
4605       }
4606       else if(regs[t].regmap_entry[hr]!=CCREG)
4607       {
4608         emit_loadreg(regs[t].regmap_entry[hr],hr);
4609       }
4610     }
4611   }
4612   // Load 64-bit regs
4613   for(hr=0;hr<HOST_REGS;hr++) {
4614     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4615       assert(regs[t].regmap_entry[hr]!=64);
4616       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4617         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4618         if(lr<0) {
4619           emit_loadreg(regs[t].regmap_entry[hr],hr);
4620         }
4621         else
4622         {
4623           emit_sarimm(lr,31,hr);
4624         }
4625       }
4626       else
4627       {
4628         emit_loadreg(regs[t].regmap_entry[hr],hr);
4629       }
4630     }
4631   }
4632 }
4633
4634 // Store dirty registers prior to branch
4635 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4636 {
4637   if(internal_branch(i_is32,addr))
4638   {
4639     int t=(addr-start)>>2;
4640     int hr;
4641     for(hr=0;hr<HOST_REGS;hr++) {
4642       if(hr!=EXCLUDE_REG) {
4643         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4644           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4645             if((i_dirty>>hr)&1) {
4646               if(i_regmap[hr]<64) {
4647                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4648                   emit_storereg(i_regmap[hr],hr);
4649                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4650                     #ifdef DESTRUCTIVE_WRITEBACK
4651                     emit_sarimm(hr,31,hr);
4652                     emit_storereg(i_regmap[hr]|64,hr);
4653                     #else
4654                     emit_sarimm(hr,31,HOST_TEMPREG);
4655                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4656                     #endif
4657                   }
4658                 }
4659               }else{
4660                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4661                   emit_storereg(i_regmap[hr],hr);
4662                 }
4663               }
4664             }
4665           }
4666         }
4667       }
4668     }
4669   }
4670   else
4671   {
4672     // Branch out of this block, write out all dirty regs
4673     wb_dirtys(i_regmap,i_is32,i_dirty);
4674   }
4675 }
4676
4677 // Load all needed registers for branch target
4678 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4679 {
4680   //if(addr>=start && addr<(start+slen*4))
4681   if(internal_branch(i_is32,addr))
4682   {
4683     int t=(addr-start)>>2;
4684     int hr;
4685     // Store the cycle count before loading something else
4686     if(i_regmap[HOST_CCREG]!=CCREG) {
4687       assert(i_regmap[HOST_CCREG]==-1);
4688     }
4689     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4690       emit_storereg(CCREG,HOST_CCREG);
4691     }
4692     // Load 32-bit regs
4693     for(hr=0;hr<HOST_REGS;hr++) {
4694       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4695         #ifdef DESTRUCTIVE_WRITEBACK
4696         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4697         #else
4698         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4699         #endif
4700           if(regs[t].regmap_entry[hr]==0) {
4701             emit_zeroreg(hr);
4702           }
4703           else if(regs[t].regmap_entry[hr]!=CCREG)
4704           {
4705             emit_loadreg(regs[t].regmap_entry[hr],hr);
4706           }
4707         }
4708       }
4709     }
4710     // Load 64-bit regs
4711     for(hr=0;hr<HOST_REGS;hr++) {
4712       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4713         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4714           assert(regs[t].regmap_entry[hr]!=64);
4715           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4716             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4717             if(lr<0) {
4718               emit_loadreg(regs[t].regmap_entry[hr],hr);
4719             }
4720             else
4721             {
4722               emit_sarimm(lr,31,hr);
4723             }
4724           }
4725           else
4726           {
4727             emit_loadreg(regs[t].regmap_entry[hr],hr);
4728           }
4729         }
4730         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4731           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4732           assert(lr>=0);
4733           emit_sarimm(lr,31,hr);
4734         }
4735       }
4736     }
4737   }
4738 }
4739
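// Decide whether a branch can jump directly to the code compiled at
// 'addr': the register mapping, dirty bits and 32/64-bit state expected
// at the target entry must be compatible with what this branch leaves
// behind, otherwise the caller must go through writeback and reload.
// Targets outside the block only match if nothing except the cycle count
// is dirty; delay-slot targets never match.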
4740 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4741 {
4742   if(addr>=start && addr<start+slen*4-4)
4743   {
4744     int t=(addr-start)>>2;
4745     int hr;
4746     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4747     for(hr=0;hr<HOST_REGS;hr++)
4748     {
4749       if(hr!=EXCLUDE_REG)
4750       {
4751         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4752         {
4753           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4754           {
4755             return 0;
4756           }
4757           else 
4758           if((i_dirty>>hr)&1)
4759           {
4760             if(i_regmap[hr]<TEMPREG)
4761             {
4762               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4763                 return 0;
4764             }
4765             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4766             {
4767               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4768                 return 0;
4769             }
4770           }
4771         }
4772         else // Same register but is it 32-bit or dirty?
4773         if(i_regmap[hr]>=0)
4774         {
4775           if(!((regs[t].dirty>>hr)&1))
4776           {
4777             if((i_dirty>>hr)&1)
4778             {
4779               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4780               {
4781                 //printf("%x: dirty no match\n",addr);
4782                 return 0;
4783               }
4784             }
4785           }
4786           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4787           {
4788             //printf("%x: is32 no match\n",addr);
4789             return 0;
4790           }
4791         }
4792       }
4793     }
4794     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4795 #ifndef FORCE32
4796     if(requires_32bit[t]&~i_is32) return 0;
4797 #endif
4798     // Delay slots are not valid branch targets
4799     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4800     // Delay slots require additional processing, so do not match
4801     if(is_ds[t]) return 0;
4802   }
4803   else
4804   {
4805     int hr;
4806     for(hr=0;hr<HOST_REGS;hr++)
4807     {
4808       if(hr!=EXCLUDE_REG)
4809       {
4810         if(i_regmap[hr]>=0)
4811         {
4812           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4813           {
4814             if((i_dirty>>hr)&1)
4815             {
4816               return 0;
4817             }
4818           }
4819         }
4820       }
4821     }
4822   }
4823   return 1;
4824 }
4825
4826 // Used when a branch jumps into the delay slot of another branch
4827 void ds_assemble_entry(int i)
4828 {
4829   int t=(ba[i]-start)>>2;
4830   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4831   assem_debug("Assemble delay slot at %x\n",ba[i]);
4832   assem_debug("<->\n");
4833   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4834     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4835   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4836   address_generation(t,&regs[t],regs[t].regmap_entry);
4837   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4838     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4839   cop1_usable=0;
4840   is_delayslot=0;
4841   switch(itype[t]) {
4842     case ALU:
4843       alu_assemble(t,&regs[t]);break;
4844     case IMM16:
4845       imm16_assemble(t,&regs[t]);break;
4846     case SHIFT:
4847       shift_assemble(t,&regs[t]);break;
4848     case SHIFTIMM:
4849       shiftimm_assemble(t,&regs[t]);break;
4850     case LOAD:
4851       load_assemble(t,&regs[t]);break;
4852     case LOADLR:
4853       loadlr_assemble(t,&regs[t]);break;
4854     case STORE:
4855       store_assemble(t,&regs[t]);break;
4856     case STORELR:
4857       storelr_assemble(t,&regs[t]);break;
4858     case COP0:
4859       cop0_assemble(t,&regs[t]);break;
4860     case COP1:
4861       cop1_assemble(t,&regs[t]);break;
4862     case C1LS:
4863       c1ls_assemble(t,&regs[t]);break;
4864     case COP2:
4865       cop2_assemble(t,&regs[t]);break;
4866     case C2LS:
4867       c2ls_assemble(t,&regs[t]);break;
4868     case C2OP:
4869       c2op_assemble(t,&regs[t]);break;
4870     case FCONV:
4871       fconv_assemble(t,&regs[t]);break;
4872     case FLOAT:
4873       float_assemble(t,&regs[t]);break;
4874     case FCOMP:
4875       fcomp_assemble(t,&regs[t]);break;
4876     case MULTDIV:
4877       multdiv_assemble(t,&regs[t]);break;
4878     case MOV:
4879       mov_assemble(t,&regs[t]);break;
4880     case SYSCALL:
4881     case HLECALL:
4882     case INTCALL:
4883     case SPAN:
4884     case UJUMP:
4885     case RJUMP:
4886     case CJUMP:
4887     case SJUMP:
4888     case FJUMP:
4889       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4890   }
4891   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4892   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4893   if(internal_branch(regs[t].is32,ba[i]+4))
4894     assem_debug("branch: internal\n");
4895   else
4896     assem_debug("branch: external\n");
4897   assert(internal_branch(regs[t].is32,ba[i]+4));
4898   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4899   emit_jmp(0);
4900 }
4901
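// Emit the cycle-count check in front of a branch: add the cycles used by
// this segment to HOST_CCREG and enter a CC_STUB when the counter expires
// so pending events can be processed.  A branch to itself with a NOP in
// the delay slot is recognized as an idle loop and handled specially so
// the host does not spin through every iteration.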
4902 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4903 {
4904   int count;
4905   int jaddr;
4906   int idle=0;
4907   int t=0;
4908   if(itype[i]==RJUMP)
4909   {
4910     *adj=0;
4911   }
4912   //if(ba[i]>=start && ba[i]<(start+slen*4))
4913   if(internal_branch(branch_regs[i].is32,ba[i]))
4914   {
4915     t=(ba[i]-start)>>2;
4916     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4917     else *adj=ccadj[t];
4918   }
4919   else
4920   {
4921     *adj=0;
4922   }
4923   count=ccadj[i];
4924   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4925     // Idle loop
4926     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4927     idle=(int)out;
4928     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4929     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4930     jaddr=(int)out;
4931     emit_jmp(0);
4932   }
4933   else if(*adj==0||invert) {
4934     int cycles=CLOCK_ADJUST(count+2);
4935     // faster loop HACK
4936     if (t&&*adj) {
4937       int rel=t-i;
4938       if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
4939         cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
4940     }
4941     emit_addimm_and_set_flags(cycles,HOST_CCREG);
4942     jaddr=(int)out;
4943     emit_jns(0);
4944   }
4945   else
4946   {
4947     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
4948     jaddr=(int)out;
4949     emit_jns(0);
4950   }
4951   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4952 }
4953
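// Out-of-line code for the cycle-count checks emitted by do_cc: write
// back dirty registers and record where execution should resume.  When
// the resume address depends on a conditional branch that has not been
// resolved yet, the comparison is redone here with cmp/cmov sequences to
// pick between the taken and fall-through addresses.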
4954 void do_ccstub(int n)
4955 {
4956   literal_pool(256);
4957   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4958   set_jump_target(stubs[n][1],(int)out);
4959   int i=stubs[n][4];
4960   if(stubs[n][6]==NULLDS) {
4961     // Delay slot instruction is nullified ("likely" branch)
4962     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4963   }
4964   else if(stubs[n][6]!=TAKEN) {
4965     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4966   }
4967   else {
4968     if(internal_branch(branch_regs[i].is32,ba[i]))
4969       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4970   }
4971   if(stubs[n][5]!=-1)
4972   {
4973     // Save PC as return address
4974     emit_movimm(stubs[n][5],EAX);
4975     emit_writeword(EAX,(int)&pcaddr);
4976   }
4977   else
4978   {
4979     // Return address depends on which way the branch goes
4980     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4981     {
4982       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4983       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4984       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4985       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4986       if(rs1[i]==0)
4987       {
4988         s1l=s2l;s1h=s2h;
4989         s2l=s2h=-1;
4990       }
4991       else if(rs2[i]==0)
4992       {
4993         s2l=s2h=-1;
4994       }
4995       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4996         s1h=s2h=-1;
4997       }
4998       assert(s1l>=0);
4999       #ifdef DESTRUCTIVE_WRITEBACK
5000       if(rs1[i]) {
5001         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
5002           emit_loadreg(rs1[i],s1l);
5003       } 
5004       else {
5005         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
5006           emit_loadreg(rs2[i],s1l);
5007       }
5008       if(s2l>=0)
5009         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
5010           emit_loadreg(rs2[i],s2l);
5011       #endif
5012       int hr=0;
5013       int addr=-1,alt=-1,ntaddr=-1;
5014       while(hr<HOST_REGS)
5015       {
5016         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5017            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5018            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5019         {
5020           addr=hr++;break;
5021         }
5022         hr++;
5023       }
5024       while(hr<HOST_REGS)
5025       {
5026         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5027            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5028            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5029         {
5030           alt=hr++;break;
5031         }
5032         hr++;
5033       }
5034       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5035       {
5036         while(hr<HOST_REGS)
5037         {
5038           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5039              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5040              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5041           {
5042             ntaddr=hr;break;
5043           }
5044           hr++;
5045         }
5046         assert(hr<HOST_REGS);
5047       }
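           // Materialize both possible return addresses and select one with
           // conditional moves; if the operands aren't known to be 32-bit,
           // the upper halves (s1h/s2h) are compared as well.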
5048       if((opcode[i]&0x2f)==4) // BEQ
5049       {
5050         #ifdef HAVE_CMOV_IMM
5051         if(s1h<0) {
5052           if(s2l>=0) emit_cmp(s1l,s2l);
5053           else emit_test(s1l,s1l);
5054           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5055         }
5056         else
5057         #endif
5058         {
5059           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5060           if(s1h>=0) {
5061             if(s2h>=0) emit_cmp(s1h,s2h);
5062             else emit_test(s1h,s1h);
5063             emit_cmovne_reg(alt,addr);
5064           }
5065           if(s2l>=0) emit_cmp(s1l,s2l);
5066           else emit_test(s1l,s1l);
5067           emit_cmovne_reg(alt,addr);
5068         }
5069       }
5070       if((opcode[i]&0x2f)==5) // BNE
5071       {
5072         #ifdef HAVE_CMOV_IMM
5073         if(s1h<0) {
5074           if(s2l>=0) emit_cmp(s1l,s2l);
5075           else emit_test(s1l,s1l);
5076           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5077         }
5078         else
5079         #endif
5080         {
5081           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5082           if(s1h>=0) {
5083             if(s2h>=0) emit_cmp(s1h,s2h);
5084             else emit_test(s1h,s1h);
5085             emit_cmovne_reg(alt,addr);
5086           }
5087           if(s2l>=0) emit_cmp(s1l,s2l);
5088           else emit_test(s1l,s1l);
5089           emit_cmovne_reg(alt,addr);
5090         }
5091       }
5092       if((opcode[i]&0x2f)==6) // BLEZ
5093       {
5094         //emit_movimm(ba[i],alt);
5095         //emit_movimm(start+i*4+8,addr);
5096         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5097         emit_cmpimm(s1l,1);
5098         if(s1h>=0) emit_mov(addr,ntaddr);
5099         emit_cmovl_reg(alt,addr);
5100         if(s1h>=0) {
5101           emit_test(s1h,s1h);
5102           emit_cmovne_reg(ntaddr,addr);
5103           emit_cmovs_reg(alt,addr);
5104         }
5105       }
5106       if((opcode[i]&0x2f)==7) // BGTZ
5107       {
5108         //emit_movimm(ba[i],addr);
5109         //emit_movimm(start+i*4+8,ntaddr);
5110         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5111         emit_cmpimm(s1l,1);
5112         if(s1h>=0) emit_mov(addr,alt);
5113         emit_cmovl_reg(ntaddr,addr);
5114         if(s1h>=0) {
5115           emit_test(s1h,s1h);
5116           emit_cmovne_reg(alt,addr);
5117           emit_cmovs_reg(ntaddr,addr);
5118         }
5119       }
5120       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
5121       {
5122         //emit_movimm(ba[i],alt);
5123         //emit_movimm(start+i*4+8,addr);
5124         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5125         if(s1h>=0) emit_test(s1h,s1h);
5126         else emit_test(s1l,s1l);
5127         emit_cmovs_reg(alt,addr);
5128       }
5129       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
5130       {
5131         //emit_movimm(ba[i],addr);
5132         //emit_movimm(start+i*4+8,alt);
5133         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5134         if(s1h>=0) emit_test(s1h,s1h);
5135         else emit_test(s1l,s1l);
5136         emit_cmovs_reg(alt,addr);
5137       }
5138       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
5139         if(source[i]&0x10000) // BC1T
5140         {
5141           //emit_movimm(ba[i],alt);
5142           //emit_movimm(start+i*4+8,addr);
5143           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5144           emit_testimm(s1l,0x800000);
5145           emit_cmovne_reg(alt,addr);
5146         }
5147         else // BC1F
5148         {
5149           //emit_movimm(ba[i],addr);
5150           //emit_movimm(start+i*4+8,alt);
5151           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5152           emit_testimm(s1l,0x800000);
5153           emit_cmovne_reg(alt,addr);
5154         }
5155       }
5156       emit_writeword(addr,(int)&pcaddr);
5157     }
5158     else
5159     if(itype[i]==RJUMP)
5160     {
5161       int r=get_reg(branch_regs[i].regmap,rs1[i]);
5162       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5163         r=get_reg(branch_regs[i].regmap,RTEMP);
5164       }
5165       emit_writeword(r,(int)&pcaddr);
5166     }
5167     else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
5168   }
5169   // Update cycle count
5170   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
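       // Temporarily apply this stub's cycle adjustment so cc_interrupt sees
       // an up-to-date count, then undo it after the call.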
5171   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
5172   emit_call((int)cc_interrupt);
5173   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
5174   if(stubs[n][6]==TAKEN) {
5175     if(internal_branch(branch_regs[i].is32,ba[i]))
5176       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
5177     else if(itype[i]==RJUMP) {
5178       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
5179         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
5180       else
5181         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
5182     }
5183   }else if(stubs[n][6]==NOTTAKEN) {
5184     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
5185     else load_all_regs(branch_regs[i].regmap);
5186   }else if(stubs[n][6]==NULLDS) {
5187     // Delay slot instruction is nullified ("likely" branch)
5188     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
5189     else load_all_regs(regs[i].regmap);
5190   }else{
5191     load_all_regs(branch_regs[i].regmap);
5192   }
5193   emit_jmp(stubs[n][2]); // return address
5194   
5195   /* This works but uses a lot of memory...
5196   emit_readword((int)&last_count,ECX);
5197   emit_add(HOST_CCREG,ECX,EAX);
5198   emit_writeword(EAX,(int)&Count);
5199   emit_call((int)gen_interupt);
5200   emit_readword((int)&Count,HOST_CCREG);
5201   emit_readword((int)&next_interupt,EAX);
5202   emit_readword((int)&pending_exception,EBX);
5203   emit_writeword(EAX,(int)&last_count);
5204   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
5205   emit_test(EBX,EBX);
5206   int jne_instr=(int)out;
5207   emit_jne(0);
5208   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
5209   load_all_regs(branch_regs[i].regmap);
5210   emit_jmp(stubs[n][2]); // return address
5211   set_jump_target(jne_instr,(int)out);
5212   emit_readword((int)&pcaddr,EAX);
5213   // Call get_addr_ht instead of doing the hash table here.
5214   // This code is executed infrequently and takes up a lot of space
5215   // so smaller is better.
5216   emit_storereg(CCREG,HOST_CCREG);
5217   emit_pushreg(EAX);
5218   emit_call((int)get_addr_ht);
5219   emit_loadreg(CCREG,HOST_CCREG);
5220   emit_addimm(ESP,4,ESP);
5221   emit_jmpreg(EAX);*/
5222 }
5223
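     // Record a jump site to be patched to its target once the block has been
     // assembled (a nonzero third argument marks a branch internal to this block).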
5224 void add_to_linker(int addr,int target,int ext)
5225 {
5226   link_addr[linkcount][0]=addr;
5227   link_addr[linkcount][1]=target;
5228   link_addr[linkcount][2]=ext;  
5229   linkcount++;
5230 }
5231
5232 static void ujump_assemble_write_ra(int i)
5233 {
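       // Write the return address (the instruction after the delay slot,
       // i.e. PC+8) into the link register.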
5234   int rt;
5235   unsigned int return_address;
5236   rt=get_reg(branch_regs[i].regmap,31);
5237   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5238   //assert(rt>=0);
5239   return_address=start+i*4+8;
5240   if(rt>=0) {
5241     #ifdef USE_MINI_HT
5242     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
5243       int temp=-1; // note: must be ds-safe
5244       #ifdef HOST_TEMPREG
5245       temp=HOST_TEMPREG;
5246       #endif
5247       if(temp>=0) do_miniht_insert(return_address,rt,temp);
5248       else emit_movimm(return_address,rt);
5249     }
5250     else
5251     #endif
5252     {
5253       #ifdef REG_PREFETCH
5254       // 'temp' is not defined in this function; assume the PTEMP mapping was intended
5255       int temp=get_reg(branch_regs[i].regmap,PTEMP);
5256       if(temp>=0&&regs[i].regmap[temp]!=PTEMP)
5257         emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5258       #endif
5259       emit_movimm(return_address,rt); // PC into link register
5260       #ifdef IMM_PREFETCH
5261       emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5262       #endif
5263     }
5264   }
5265 }
5266
5267 void ujump_assemble(int i,struct regstat *i_regs)
5268 {
5269   signed char *i_regmap=i_regs->regmap;
5270   int ra_done=0;
5271   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5272   address_generation(i+1,i_regs,regs[i].regmap_entry);
5273   #ifdef REG_PREFETCH
5274   int temp=get_reg(branch_regs[i].regmap,PTEMP);
5275   if(rt1[i]==31&&temp>=0) 
5276   {
5277     int return_address=start+i*4+8;
5278     if(get_reg(branch_regs[i].regmap,31)>0) 
5279     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5280   }
5281   #endif
5282   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5283     ujump_assemble_write_ra(i); // writeback ra for DS
5284     ra_done=1;
5285   }
5286   ds_assemble(i+1,i_regs);
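       // Mark r0 and the jump's destination register (the link register for
       // JAL) as unneeded so they are skipped by the dirty-register writeback.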
5287   uint64_t bc_unneeded=branch_regs[i].u;
5288   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5289   bc_unneeded|=1|(1LL<<rt1[i]);
5290   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5291   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5292                 bc_unneeded,bc_unneeded_upper);
5293   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5294   if(!ra_done&&rt1[i]==31)
5295     ujump_assemble_write_ra(i);
5296   int cc,adj;
5297   cc=get_reg(branch_regs[i].regmap,CCREG);
5298   assert(cc==HOST_CCREG);
5299   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5300   #ifdef REG_PREFETCH
5301   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5302   #endif
5303   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5304   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5305   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5306   if(internal_branch(branch_regs[i].is32,ba[i]))
5307     assem_debug("branch: internal\n");
5308   else
5309     assem_debug("branch: external\n");
5310   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
5311     ds_assemble_entry(i);
5312   }
5313   else {
5314     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
5315     emit_jmp(0);
5316   }
5317 }
5318
5319 static void rjump_assemble_write_ra(int i)
5320 {
5321   int rt,return_address;
5322   assert(rt1[i+1]!=rt1[i]);
5323   assert(rt2[i+1]!=rt1[i]);
5324   rt=get_reg(branch_regs[i].regmap,rt1[i]);
5325   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5326   assert(rt>=0);
5327   return_address=start+i*4+8;
5328   #ifdef REG_PREFETCH
5329   // 'temp' is not defined in this function; assume the PTEMP mapping was intended
5330   int temp=get_reg(branch_regs[i].regmap,PTEMP);
5331   if(temp>=0&&regs[i].regmap[temp]!=PTEMP)
5332     emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5333   #endif
5334   emit_movimm(return_address,rt); // PC into link register
5335   #ifdef IMM_PREFETCH
5336   emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5337   #endif
5338 }
5339
5340 void rjump_assemble(int i,struct regstat *i_regs)
5341 {
5342   signed char *i_regmap=i_regs->regmap;
5343   int temp;
5344   int rs,cc,adj;
5345   int ra_done=0;
5346   rs=get_reg(branch_regs[i].regmap,rs1[i]);
5347   assert(rs>=0);
5348   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5349     // Delay slot abuse, make a copy of the branch address register
5350     temp=get_reg(branch_regs[i].regmap,RTEMP);
5351     assert(temp>=0);
5352     assert(regs[i].regmap[temp]==RTEMP);
5353     emit_mov(rs,temp);
5354     rs=temp;
5355   }
5356   address_generation(i+1,i_regs,regs[i].regmap_entry);
5357   #ifdef REG_PREFETCH
5358   if(rt1[i]==31) 
5359   {
5360     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
5361       int return_address=start+i*4+8;
5362       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5363     }
5364   }
5365   #endif
5366   #ifdef USE_MINI_HT
5367   if(rs1[i]==31) {
5368     int rh=get_reg(regs[i].regmap,RHASH);
5369     if(rh>=0) do_preload_rhash(rh);
5370   }
5371   #endif
5372   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5373     rjump_assemble_write_ra(i);
5374     ra_done=1;
5375   }
5376   ds_assemble(i+1,i_regs);
5377   uint64_t bc_unneeded=branch_regs[i].u;
5378   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5379   bc_unneeded|=1|(1LL<<rt1[i]);
5380   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5381   bc_unneeded&=~(1LL<<rs1[i]);
5382   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5383                 bc_unneeded,bc_unneeded_upper);
5384   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
5385   if(!ra_done&&rt1[i]!=0)
5386     rjump_assemble_write_ra(i);
5387   cc=get_reg(branch_regs[i].regmap,CCREG);
5388   assert(cc==HOST_CCREG);
5389   #ifdef USE_MINI_HT
5390   int rh=get_reg(branch_regs[i].regmap,RHASH);
5391   int ht=get_reg(branch_regs[i].regmap,RHTBL);
5392   if(rs1[i]==31) {
5393     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
5394     do_preload_rhtbl(ht);
5395     do_rhash(rs,rh);
5396   }
5397   #endif
5398   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5399   #ifdef DESTRUCTIVE_WRITEBACK
5400   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
5401     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
5402       emit_loadreg(rs1[i],rs);
5403     }
5404   }
5405   #endif
5406   #ifdef REG_PREFETCH
5407   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5408   #endif
5409   #ifdef USE_MINI_HT
5410   if(rs1[i]==31) {
5411     do_miniht_load(ht,rh);
5412   }
5413   #endif
5414   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
5415   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
5416   //assert(adj==0);
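       // Charge this block's cycles; if the counter becomes non-negative an
       // event is due and the CC stub (cc_interrupt) runs before the jump is
       // dispatched.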
5417   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5418   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
5419 #ifdef PCSX
5420   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
5421     // special case for RFE
5422     emit_jmp(0);
5423   else
5424 #endif
5425   emit_jns(0);
5426   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5427   #ifdef USE_MINI_HT
5428   if(rs1[i]==31) {
5429     do_miniht_jump(rs,rh,ht);
5430   }
5431   else
5432   #endif
5433   {
5434     //if(rs!=EAX) emit_mov(rs,EAX);
5435     //emit_jmp((int)jump_vaddr_eax);
5436     emit_jmp(jump_vaddr_reg[rs]);
5437   }
5438   /* Check hash table
5439   temp=!rs;
5440   emit_mov(rs,temp);
5441   emit_shrimm(rs,16,rs);
5442   emit_xor(temp,rs,rs);
5443   emit_movzwl_reg(rs,rs);
5444   emit_shlimm(rs,4,rs);
5445   emit_cmpmem_indexed((int)hash_table,rs,temp);
5446   emit_jne((int)out+14);
5447   emit_readword_indexed((int)hash_table+4,rs,rs);
5448   emit_jmpreg(rs);
5449   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
5450   emit_addimm_no_flags(8,rs);
5451   emit_jeq((int)out-17);
5452   // No hit on hash table, call compiler
5453   emit_pushreg(temp);
5454 //DEBUG >
5455 #ifdef DEBUG_CYCLE_COUNT
5456   emit_readword((int)&last_count,ECX);
5457   emit_add(HOST_CCREG,ECX,HOST_CCREG);
5458   emit_readword((int)&next_interupt,ECX);
5459   emit_writeword(HOST_CCREG,(int)&Count);
5460   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
5461   emit_writeword(ECX,(int)&last_count);
5462 #endif
5463 //DEBUG <
5464   emit_storereg(CCREG,HOST_CCREG);
5465   emit_call((int)get_addr);
5466   emit_loadreg(CCREG,HOST_CCREG);
5467   emit_addimm(ESP,4,ESP);
5468   emit_jmpreg(EAX);*/
5469   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5470   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5471   #endif
5472 }
5473
5474 void cjump_assemble(int i,struct regstat *i_regs)
5475 {
5476   signed char *i_regmap=i_regs->regmap;
5477   int cc;
5478   int match;
5479   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5480   assem_debug("match=%d\n",match);
5481   int s1h,s1l,s2h,s2l;
5482   int prev_cop1_usable=cop1_usable;
5483   int unconditional=0,nop=0;
5484   int only32=0;
5485   int invert=0;
5486   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5487   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5488   if(!match) invert=1;
5489   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5490   if(i>(ba[i]-start)>>2) invert=1;
5491   #endif
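       // With 'invert' set the condition is reversed: the inline fall-through
       // becomes the taken path (register writeback + jump to target) and the
       // conditional jump skips it. Used when the target's register mapping
       // doesn't match, or as a Cortex-A8 predictor workaround.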
5492   
5493   if(ooo[i]) {
5494     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5495     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5496     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5497     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5498   }
5499   else {
5500     s1l=get_reg(i_regmap,rs1[i]);
5501     s1h=get_reg(i_regmap,rs1[i]|64);
5502     s2l=get_reg(i_regmap,rs2[i]);
5503     s2h=get_reg(i_regmap,rs2[i]|64);
5504   }
5505   if(rs1[i]==0&&rs2[i]==0)
5506   {
5507     if(opcode[i]&1) nop=1;
5508     else unconditional=1;
5509     //assert(opcode[i]!=5);
5510     //assert(opcode[i]!=7);
5511     //assert(opcode[i]!=0x15);
5512     //assert(opcode[i]!=0x17);
5513   }
5514   else if(rs1[i]==0)
5515   {
5516     s1l=s2l;s1h=s2h;
5517     s2l=s2h=-1;
5518     only32=(regs[i].was32>>rs2[i])&1;
5519   }
5520   else if(rs2[i]==0)
5521   {
5522     s2l=s2h=-1;
5523     only32=(regs[i].was32>>rs1[i])&1;
5524   }
5525   else {
5526     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5527   }
5528
5529   if(ooo[i]) {
5530     // Out of order execution (delay slot first)
5531     //printf("OOOE\n");
5532     address_generation(i+1,i_regs,regs[i].regmap_entry);
5533     ds_assemble(i+1,i_regs);
5534     int adj;
5535     uint64_t bc_unneeded=branch_regs[i].u;
5536     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5537     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5538     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5539     bc_unneeded|=1;
5540     bc_unneeded_upper|=1;
5541     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5542                   bc_unneeded,bc_unneeded_upper);
5543     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5544     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5545     cc=get_reg(branch_regs[i].regmap,CCREG);
5546     assert(cc==HOST_CCREG);
5547     if(unconditional) 
5548       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5549     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5550     //assem_debug("cycle count (adj)\n");
5551     if(unconditional) {
5552       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5553       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5554         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5555         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5556         if(internal)
5557           assem_debug("branch: internal\n");
5558         else
5559           assem_debug("branch: external\n");
5560         if(internal&&is_ds[(ba[i]-start)>>2]) {
5561           ds_assemble_entry(i);
5562         }
5563         else {
5564           add_to_linker((int)out,ba[i],internal);
5565           emit_jmp(0);
5566         }
5567         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5568         if(((u_int)out)&7) emit_addnop(0);
5569         #endif
5570       }
5571     }
5572     else if(nop) {
5573       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5574       int jaddr=(int)out;
5575       emit_jns(0);
5576       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5577     }
5578     else {
5579       int taken=0,nottaken=0,nottaken1=0;
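           // nottaken1 collects jumps that decide "not taken" from the high
           // word alone when the operands are 64-bit.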
5580       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5581       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5582       if(!only32)
5583       {
5584         assert(s1h>=0);
5585         if(opcode[i]==4) // BEQ
5586         {
5587           if(s2h>=0) emit_cmp(s1h,s2h);
5588           else emit_test(s1h,s1h);
5589           nottaken1=(int)out;
5590           emit_jne(1);
5591         }
5592         if(opcode[i]==5) // BNE
5593         {
5594           if(s2h>=0) emit_cmp(s1h,s2h);
5595           else emit_test(s1h,s1h);
5596           if(invert) taken=(int)out;
5597           else add_to_linker((int)out,ba[i],internal);
5598           emit_jne(0);
5599         }
5600         if(opcode[i]==6) // BLEZ
5601         {
5602           emit_test(s1h,s1h);
5603           if(invert) taken=(int)out;
5604           else add_to_linker((int)out,ba[i],internal);
5605           emit_js(0);
5606           nottaken1=(int)out;
5607           emit_jne(1);
5608         }
5609         if(opcode[i]==7) // BGTZ
5610         {
5611           emit_test(s1h,s1h);
5612           nottaken1=(int)out;
5613           emit_js(1);
5614           if(invert) taken=(int)out;
5615           else add_to_linker((int)out,ba[i],internal);
5616           emit_jne(0);
5617         }
5618       } // if(!only32)
5619           
5620       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5621       assert(s1l>=0);
5622       if(opcode[i]==4) // BEQ
5623       {
5624         if(s2l>=0) emit_cmp(s1l,s2l);
5625         else emit_test(s1l,s1l);
5626         if(invert){
5627           nottaken=(int)out;
5628           emit_jne(1);
5629         }else{
5630           add_to_linker((int)out,ba[i],internal);
5631           emit_jeq(0);
5632         }
5633       }
5634       if(opcode[i]==5) // BNE
5635       {
5636         if(s2l>=0) emit_cmp(s1l,s2l);
5637         else emit_test(s1l,s1l);
5638         if(invert){
5639           nottaken=(int)out;
5640           emit_jeq(1);
5641         }else{
5642           add_to_linker((int)out,ba[i],internal);
5643           emit_jne(0);
5644         }
5645       }
5646       if(opcode[i]==6) // BLEZ
5647       {
5648         emit_cmpimm(s1l,1);
5649         if(invert){
5650           nottaken=(int)out;
5651           emit_jge(1);
5652         }else{
5653           add_to_linker((int)out,ba[i],internal);
5654           emit_jl(0);
5655         }
5656       }
5657       if(opcode[i]==7) // BGTZ
5658       {
5659         emit_cmpimm(s1l,1);
5660         if(invert){
5661           nottaken=(int)out;
5662           emit_jl(1);
5663         }else{
5664           add_to_linker((int)out,ba[i],internal);
5665           emit_jge(0);
5666         }
5667       }
5668       if(invert) {
5669         if(taken) set_jump_target(taken,(int)out);
5670         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5671         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5672           if(adj) {
5673             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5674             add_to_linker((int)out,ba[i],internal);
5675           }else{
5676             emit_addnop(13);
5677             add_to_linker((int)out,ba[i],internal*2);
5678           }
5679           emit_jmp(0);
5680         }else
5681         #endif
5682         {
5683           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5684           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5685           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5686           if(internal)
5687             assem_debug("branch: internal\n");
5688           else
5689             assem_debug("branch: external\n");
5690           if(internal&&is_ds[(ba[i]-start)>>2]) {
5691             ds_assemble_entry(i);
5692           }
5693           else {
5694             add_to_linker((int)out,ba[i],internal);
5695             emit_jmp(0);
5696           }
5697         }
5698         set_jump_target(nottaken,(int)out);
5699       }
5700
5701       if(nottaken1) set_jump_target(nottaken1,(int)out);
5702       if(adj) {
5703         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5704       }
5705     } // (!unconditional)
5706   } // if(ooo)
5707   else
5708   {
5709     // In-order execution (branch first)
5710     //if(likely[i]) printf("IOL\n");
5711     //else
5712     //printf("IOE\n");
5713     int taken=0,nottaken=0,nottaken1=0;
5714     if(!unconditional&&!nop) {
5715       if(!only32)
5716       {
5717         assert(s1h>=0);
5718         if((opcode[i]&0x2f)==4) // BEQ
5719         {
5720           if(s2h>=0) emit_cmp(s1h,s2h);
5721           else emit_test(s1h,s1h);
5722           nottaken1=(int)out;
5723           emit_jne(2);
5724         }
5725         if((opcode[i]&0x2f)==5) // BNE
5726         {
5727           if(s2h>=0) emit_cmp(s1h,s2h);
5728           else emit_test(s1h,s1h);
5729           taken=(int)out;
5730           emit_jne(1);
5731         }
5732         if((opcode[i]&0x2f)==6) // BLEZ
5733         {
5734           emit_test(s1h,s1h);
5735           taken=(int)out;
5736           emit_js(1);
5737           nottaken1=(int)out;
5738           emit_jne(2);
5739         }
5740         if((opcode[i]&0x2f)==7) // BGTZ
5741         {
5742           emit_test(s1h,s1h);
5743           nottaken1=(int)out;
5744           emit_js(2);
5745           taken=(int)out;
5746           emit_jne(1);
5747         }
5748       } // if(!only32)
5749           
5750       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5751       assert(s1l>=0);
5752       if((opcode[i]&0x2f)==4) // BEQ
5753       {
5754         if(s2l>=0) emit_cmp(s1l,s2l);
5755         else emit_test(s1l,s1l);
5756         nottaken=(int)out;
5757         emit_jne(2);
5758       }
5759       if((opcode[i]&0x2f)==5) // BNE
5760       {
5761         if(s2l>=0) emit_cmp(s1l,s2l);
5762         else emit_test(s1l,s1l);
5763         nottaken=(int)out;
5764         emit_jeq(2);
5765       }
5766       if((opcode[i]&0x2f)==6) // BLEZ
5767       {
5768         emit_cmpimm(s1l,1);
5769         nottaken=(int)out;
5770         emit_jge(2);
5771       }
5772       if((opcode[i]&0x2f)==7) // BGTZ
5773       {
5774         emit_cmpimm(s1l,1);
5775         nottaken=(int)out;
5776         emit_jl(2);
5777       }
5778     } // if(!unconditional)
5779     int adj;
5780     uint64_t ds_unneeded=branch_regs[i].u;
5781     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5782     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5783     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5784     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5785     ds_unneeded|=1;
5786     ds_unneeded_upper|=1;
5787     // branch taken
5788     if(!nop) {
5789       if(taken) set_jump_target(taken,(int)out);
5790       assem_debug("1:\n");
5791       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5792                     ds_unneeded,ds_unneeded_upper);
5793       // load regs
5794       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5795       address_generation(i+1,&branch_regs[i],0);
5796       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5797       ds_assemble(i+1,&branch_regs[i]);
5798       cc=get_reg(branch_regs[i].regmap,CCREG);
5799       if(cc==-1) {
5800         emit_loadreg(CCREG,cc=HOST_CCREG);
5801         // CHECK: Is the following instruction (fall thru) allocated ok?
5802       }
5803       assert(cc==HOST_CCREG);
5804       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5805       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5806       assem_debug("cycle count (adj)\n");
5807       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5808       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5809       if(internal)
5810         assem_debug("branch: internal\n");
5811       else
5812         assem_debug("branch: external\n");
5813       if(internal&&is_ds[(ba[i]-start)>>2]) {
5814         ds_assemble_entry(i);
5815       }
5816       else {
5817         add_to_linker((int)out,ba[i],internal);
5818         emit_jmp(0);
5819       }
5820     }
5821     // branch not taken
5822     cop1_usable=prev_cop1_usable;
5823     if(!unconditional) {
5824       if(nottaken1) set_jump_target(nottaken1,(int)out);
5825       set_jump_target(nottaken,(int)out);
5826       assem_debug("2:\n");
5827       if(!likely[i]) {
5828         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5829                       ds_unneeded,ds_unneeded_upper);
5830         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5831         address_generation(i+1,&branch_regs[i],0);
5832         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5833         ds_assemble(i+1,&branch_regs[i]);
5834       }
5835       cc=get_reg(branch_regs[i].regmap,CCREG);
5836       if(cc==-1&&!likely[i]) {
5837         // Cycle count isn't in a register, temporarily load it then write it out
5838         emit_loadreg(CCREG,HOST_CCREG);
5839         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5840         int jaddr=(int)out;
5841         emit_jns(0);
5842         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5843         emit_storereg(CCREG,HOST_CCREG);
5844       }
5845       else{
5846         cc=get_reg(i_regmap,CCREG);
5847         assert(cc==HOST_CCREG);
5848         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5849         int jaddr=(int)out;
5850         emit_jns(0);
5851         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5852       }
5853     }
5854   }
5855 }
5856
5857 void sjump_assemble(int i,struct regstat *i_regs)
5858 {
5859   signed char *i_regmap=i_regs->regmap;
5860   int cc;
5861   int match;
5862   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5863   assem_debug("smatch=%d\n",match);
5864   int s1h,s1l;
5865   int prev_cop1_usable=cop1_usable;
5866   int unconditional=0,nevertaken=0;
5867   int only32=0;
5868   int invert=0;
5869   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5870   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5871   if(!match) invert=1;
5872   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5873   if(i>(ba[i]-start)>>2) invert=1;
5874   #endif
5875
5876   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5877   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5878
5879   if(ooo[i]) {
5880     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5881     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5882   }
5883   else {
5884     s1l=get_reg(i_regmap,rs1[i]);
5885     s1h=get_reg(i_regmap,rs1[i]|64);
5886   }
5887   if(rs1[i]==0)
5888   {
5889     if(opcode2[i]&1) unconditional=1;
5890     else nevertaken=1;
5891     // These are never taken (r0 is never less than zero)
5892     //assert(opcode2[i]!=0);
5893     //assert(opcode2[i]!=2);
5894     //assert(opcode2[i]!=0x10);
5895     //assert(opcode2[i]!=0x12);
5896   }
5897   else {
5898     only32=(regs[i].was32>>rs1[i])&1;
5899   }
5900
5901   if(ooo[i]) {
5902     // Out of order execution (delay slot first)
5903     //printf("OOOE\n");
5904     address_generation(i+1,i_regs,regs[i].regmap_entry);
5905     ds_assemble(i+1,i_regs);
5906     int adj;
5907     uint64_t bc_unneeded=branch_regs[i].u;
5908     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5909     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5910     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5911     bc_unneeded|=1;
5912     bc_unneeded_upper|=1;
5913     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5914                   bc_unneeded,bc_unneeded_upper);
5915     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5916     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
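         // Link variants (BLTZAL/BGEZAL) write the return address to $ra
         // whether or not the branch is taken.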
5917     if(rt1[i]==31) {
5918       int rt,return_address;
5919       rt=get_reg(branch_regs[i].regmap,31);
5920       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5921       if(rt>=0) {
5922         // Save the PC even if the branch is not taken
5923         return_address=start+i*4+8;
5924         emit_movimm(return_address,rt); // PC into link register
5925         #ifdef IMM_PREFETCH
5926         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5927         #endif
5928       }
5929     }
5930     cc=get_reg(branch_regs[i].regmap,CCREG);
5931     assert(cc==HOST_CCREG);
5932     if(unconditional) 
5933       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5934     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5935     assem_debug("cycle count (adj)\n");
5936     if(unconditional) {
5937       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5938       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5939         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5940         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5941         if(internal)
5942           assem_debug("branch: internal\n");
5943         else
5944           assem_debug("branch: external\n");
5945         if(internal&&is_ds[(ba[i]-start)>>2]) {
5946           ds_assemble_entry(i);
5947         }
5948         else {
5949           add_to_linker((int)out,ba[i],internal);
5950           emit_jmp(0);
5951         }
5952         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5953         if(((u_int)out)&7) emit_addnop(0);
5954         #endif
5955       }
5956     }
5957     else if(nevertaken) {
5958       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5959       int jaddr=(int)out;
5960       emit_jns(0);
5961       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5962     }
5963     else {
5964       int nottaken=0;
5965       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5966       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5967       if(!only32)
5968       {
5969         assert(s1h>=0);
5970         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5971         {
5972           emit_test(s1h,s1h);
5973           if(invert){
5974             nottaken=(int)out;
5975             emit_jns(1);
5976           }else{
5977             add_to_linker((int)out,ba[i],internal);
5978             emit_js(0);
5979           }
5980         }
5981         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5982         {
5983           emit_test(s1h,s1h);
5984           if(invert){
5985             nottaken=(int)out;
5986             emit_js(1);
5987           }else{
5988             add_to_linker((int)out,ba[i],internal);
5989             emit_jns(0);
5990           }
5991         }
5992       } // if(!only32)
5993       else
5994       {
5995         assert(s1l>=0);
5996         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5997         {
5998           emit_test(s1l,s1l);
5999           if(invert){
6000             nottaken=(int)out;
6001             emit_jns(1);
6002           }else{
6003             add_to_linker((int)out,ba[i],internal);
6004             emit_js(0);
6005           }
6006         }
6007         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
6008         {
6009           emit_test(s1l,s1l);
6010           if(invert){
6011             nottaken=(int)out;
6012             emit_js(1);
6013           }else{
6014             add_to_linker((int)out,ba[i],internal);
6015             emit_jns(0);
6016           }
6017         }
6018       } // if(!only32)
6019           
6020       if(invert) {
6021         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6022         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
6023           if(adj) {
6024             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
6025             add_to_linker((int)out,ba[i],internal);
6026           }else{
6027             emit_addnop(13);
6028             add_to_linker((int)out,ba[i],internal*2);
6029           }
6030           emit_jmp(0);
6031         }else
6032         #endif
6033         {
6034           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
6035           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6036           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6037           if(internal)
6038             assem_debug("branch: internal\n");
6039           else
6040             assem_debug("branch: external\n");
6041           if(internal&&is_ds[(ba[i]-start)>>2]) {
6042             ds_assemble_entry(i);
6043           }
6044           else {
6045             add_to_linker((int)out,ba[i],internal);
6046             emit_jmp(0);
6047           }
6048         }
6049         set_jump_target(nottaken,(int)out);
6050       }
6051
6052       if(adj) {
6053         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
6054       }
6055     } // (!unconditional)
6056   } // if(ooo)
6057   else
6058   {
6059     // In-order execution (branch first)
6060     //printf("IOE\n");
6061     int nottaken=0;
6062     if(rt1[i]==31) {
6063       int rt,return_address;
6064       rt=get_reg(branch_regs[i].regmap,31);
6065       if(rt>=0) {
6066         // Save the PC even if the branch is not taken
6067         return_address=start+i*4+8;
6068         emit_movimm(return_address,rt); // PC into link register
6069         #ifdef IMM_PREFETCH
6070         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
6071         #endif
6072       }
6073     }
6074     if(!unconditional) {
6075       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6076       if(!only32)
6077       {
6078         assert(s1h>=0);
6079         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
6080         {
6081           emit_test(s1h,s1h);
6082           nottaken=(int)out;
6083           emit_jns(1);
6084         }
6085         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6086         {
6087           emit_test(s1h,s1h);
6088           nottaken=(int)out;
6089           emit_js(1);
6090         }
6091       } // if(!only32)
6092       else
6093       {
6094         assert(s1l>=0);
6095         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
6096         {
6097           emit_test(s1l,s1l);
6098           nottaken=(int)out;
6099           emit_jns(1);
6100         }
6101         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6102         {
6103           emit_test(s1l,s1l);
6104           nottaken=(int)out;
6105           emit_js(1);
6106         }
6107       }
6108     } // if(!unconditional)
6109     int adj;
6110     uint64_t ds_unneeded=branch_regs[i].u;
6111     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6112     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6113     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6114     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6115     ds_unneeded|=1;
6116     ds_unneeded_upper|=1;
6117     // branch taken
6118     if(!nevertaken) {
6119       //assem_debug("1:\n");
6120       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6121                     ds_unneeded,ds_unneeded_upper);
6122       // load regs
6123       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6124       address_generation(i+1,&branch_regs[i],0);
6125       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6126       ds_assemble(i+1,&branch_regs[i]);
6127       cc=get_reg(branch_regs[i].regmap,CCREG);
6128       if(cc==-1) {
6129         emit_loadreg(CCREG,cc=HOST_CCREG);
6130         // CHECK: Is the following instruction (fall thru) allocated ok?
6131       }
6132       assert(cc==HOST_CCREG);
6133       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6134       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6135       assem_debug("cycle count (adj)\n");
6136       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6137       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6138       if(internal)
6139         assem_debug("branch: internal\n");
6140       else
6141         assem_debug("branch: external\n");
6142       if(internal&&is_ds[(ba[i]-start)>>2]) {
6143         ds_assemble_entry(i);
6144       }
6145       else {
6146         add_to_linker((int)out,ba[i],internal);
6147         emit_jmp(0);
6148       }
6149     }
6150     // branch not taken
6151     cop1_usable=prev_cop1_usable;
6152     if(!unconditional) {
6153       set_jump_target(nottaken,(int)out);
6154       assem_debug("1:\n");
6155       if(!likely[i]) {
6156         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6157                       ds_unneeded,ds_unneeded_upper);
6158         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6159         address_generation(i+1,&branch_regs[i],0);
6160         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6161         ds_assemble(i+1,&branch_regs[i]);
6162       }
6163       cc=get_reg(branch_regs[i].regmap,CCREG);
6164       if(cc==-1&&!likely[i]) {
6165         // Cycle count isn't in a register, temporarily load it then write it out
6166         emit_loadreg(CCREG,HOST_CCREG);
6167         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
6168         int jaddr=(int)out;
6169         emit_jns(0);
6170         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6171         emit_storereg(CCREG,HOST_CCREG);
6172       }
6173       else{
6174         cc=get_reg(i_regmap,CCREG);
6175         assert(cc==HOST_CCREG);
6176         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
6177         int jaddr=(int)out;
6178         emit_jns(0);
6179         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6180       }
6181     }
6182   }
6183 }
6184
6185 void fjump_assemble(int i,struct regstat *i_regs)
6186 {
6187   signed char *i_regmap=i_regs->regmap;
6188   int cc;
6189   int match;
6190   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6191   assem_debug("fmatch=%d\n",match);
6192   int fs,cs;
6193   int eaddr;
6194   int invert=0;
6195   int internal=internal_branch(branch_regs[i].is32,ba[i]);
6196   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
6197   if(!match) invert=1;
6198   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6199   if(i>(ba[i]-start)>>2) invert=1;
6200   #endif
6201
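       // FSREG caches the FPU control/status word; BC1T/BC1F test its
       // condition bit (0x800000, bit 23 of FCR31).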
6202   if(ooo[i]) {
6203     fs=get_reg(branch_regs[i].regmap,FSREG);
6204     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
6205   }
6206   else {
6207     fs=get_reg(i_regmap,FSREG);
6208   }
6209
6210   // Check cop1 unusable
6211   if(!cop1_usable) {
6212     cs=get_reg(i_regmap,CSREG);
6213     assert(cs>=0);
6214     emit_testimm(cs,0x20000000);
6215     eaddr=(int)out;
6216     emit_jeq(0);
6217     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
6218     cop1_usable=1;
6219   }
6220
6221   if(ooo[i]) {
6222     // Out of order execution (delay slot first)
6223     //printf("OOOE\n");
6224     ds_assemble(i+1,i_regs);
6225     int adj;
6226     uint64_t bc_unneeded=branch_regs[i].u;
6227     uint64_t bc_unneeded_upper=branch_regs[i].uu;
6228     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6229     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
6230     bc_unneeded|=1;
6231     bc_unneeded_upper|=1;
6232     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6233                   bc_unneeded,bc_unneeded_upper);
6234     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
6235     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6236     cc=get_reg(branch_regs[i].regmap,CCREG);
6237     assert(cc==HOST_CCREG);
6238     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
6239     assem_debug("cycle count (adj)\n");
6240     if(1) {
6241       int nottaken=0;
6242       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6243       if(1) {
6244         assert(fs>=0);
6245         emit_testimm(fs,0x800000);
6246         if(source[i]&0x10000) // BC1T
6247         {
6248           if(invert){
6249             nottaken=(int)out;
6250             emit_jeq(1);
6251           }else{
6252             add_to_linker((int)out,ba[i],internal);
6253             emit_jne(0);
6254           }
6255         }
6256         else // BC1F
6257         {
6258           if(invert){
6259             nottaken=(int)out;
6260             emit_jne(1);
6261           }else{
6262             add_to_linker((int)out,ba[i],internal);
6263             emit_jeq(0);
6264           }
6265         }
6266       } // if(!only32)
6267           
6268       if(invert) {
6269         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
6270         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6271         else if(match) emit_addnop(13);
6272         #endif
6273         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6274         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6275         if(internal)
6276           assem_debug("branch: internal\n");
6277         else
6278           assem_debug("branch: external\n");
6279         if(internal&&is_ds[(ba[i]-start)>>2]) {
6280           ds_assemble_entry(i);
6281         }
6282         else {
6283           add_to_linker((int)out,ba[i],internal);
6284           emit_jmp(0);
6285         }
6286         set_jump_target(nottaken,(int)out);
6287       }
6288
6289       if(adj) {
6290         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
6291       }
6292     } // (!unconditional)
6293   } // if(ooo)
6294   else
6295   {
6296     // In-order execution (branch first)
6297     //printf("IOE\n");
6298     int nottaken=0;
6299     if(1) {
6300       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6301       if(1) {
6302         assert(fs>=0);
6303         emit_testimm(fs,0x800000);
6304         if(source[i]&0x10000) // BC1T
6305         {
6306           nottaken=(int)out;
6307           emit_jeq(1);
6308         }
6309         else // BC1F
6310         {
6311           nottaken=(int)out;
6312           emit_jne(1);
6313         }
6314       }
6315     } // if(!unconditional)
6316     int adj;
6317     uint64_t ds_unneeded=branch_regs[i].u;
6318     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6319     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6320     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6321     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6322     ds_unneeded|=1;
6323     ds_unneeded_upper|=1;
6324     // branch taken
6325     //assem_debug("1:\n");
6326     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6327                   ds_unneeded,ds_unneeded_upper);
6328     // load regs
6329     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6330     address_generation(i+1,&branch_regs[i],0);
6331     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6332     ds_assemble(i+1,&branch_regs[i]);
6333     cc=get_reg(branch_regs[i].regmap,CCREG);
6334     if(cc==-1) {
6335       emit_loadreg(CCREG,cc=HOST_CCREG);
6336       // CHECK: Is the following instruction (fall thru) allocated ok?
6337     }
6338     assert(cc==HOST_CCREG);
6339     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6340     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6341     assem_debug("cycle count (adj)\n");
6342     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6343     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6344     if(internal)
6345       assem_debug("branch: internal\n");
6346     else
6347       assem_debug("branch: external\n");
6348     if(internal&&is_ds[(ba[i]-start)>>2]) {
6349       ds_assemble_entry(i);
6350     }
6351     else {
6352       add_to_linker((int)out,ba[i],internal);
6353       emit_jmp(0);
6354     }
6355
6356     // branch not taken
6357     if(1) { // <- FIXME (don't need this)
6358       set_jump_target(nottaken,(int)out);
6359       assem_debug("1:\n");
6360       if(!likely[i]) {
6361         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6362                       ds_unneeded,ds_unneeded_upper);
6363         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6364         address_generation(i+1,&branch_regs[i],0);
6365         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6366         ds_assemble(i+1,&branch_regs[i]);
6367       }
6368       cc=get_reg(branch_regs[i].regmap,CCREG);
6369       if(cc==-1&&!likely[i]) {
6370         // Cycle count isn't in a register, temporarily load it then write it out
6371         emit_loadreg(CCREG,HOST_CCREG);
6372         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
6373         int jaddr=(int)out;
6374         emit_jns(0);
6375         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6376         emit_storereg(CCREG,HOST_CCREG);
6377       }
6378       else{
6379         cc=get_reg(i_regmap,CCREG);
6380         assert(cc==HOST_CCREG);
6381         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
6382         int jaddr=(int)out;
6383         emit_jns(0);
6384         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6385       }
6386     }
6387   }
6388 }
6389
6390 static void pagespan_assemble(int i,struct regstat *i_regs)
6391 {
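       // Branch at the end of the page: its delay slot lies on the next page,
       // so store the branch target in BTREG and continue at the delay-slot
       // entry of the next page (through a stub if it isn't compiled yet).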
6392   int s1l=get_reg(i_regs->regmap,rs1[i]);
6393   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
6394   int s2l=get_reg(i_regs->regmap,rs2[i]);
6395   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
6396   void *nt_branch=NULL;
6397   int taken=0;
6398   int nottaken=0;
6399   int unconditional=0;
6400   if(rs1[i]==0)
6401   {
6402     s1l=s2l;s1h=s2h;
6403     s2l=s2h=-1;
6404   }
6405   else if(rs2[i]==0)
6406   {
6407     s2l=s2h=-1;
6408   }
6409   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
6410     s1h=s2h=-1;
6411   }
6412   int hr=0;
6413   int addr,alt,ntaddr;
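       // Use BTREG itself for the computed target when it's free; otherwise
       // pick scratch host registers much like do_ccstub does.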
6414   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
6415   else {
6416     while(hr<HOST_REGS)
6417     {
6418       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
6419          (i_regs->regmap[hr]&63)!=rs1[i] &&
6420          (i_regs->regmap[hr]&63)!=rs2[i] )
6421       {
6422         addr=hr++;break;
6423       }
6424       hr++;
6425     }
6426   }
6427   while(hr<HOST_REGS)
6428   {
6429     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6430        (i_regs->regmap[hr]&63)!=rs1[i] &&
6431        (i_regs->regmap[hr]&63)!=rs2[i] )
6432     {
6433       alt=hr++;break;
6434     }
6435     hr++;
6436   }
6437   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
6438   {
6439     while(hr<HOST_REGS)
6440     {
6441       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6442          (i_regs->regmap[hr]&63)!=rs1[i] &&
6443          (i_regs->regmap[hr]&63)!=rs2[i] )
6444       {
6445         ntaddr=hr;break;
6446       }
6447       hr++;
6448     }
6449   }
6450   assert(hr<HOST_REGS);
6451   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6452     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6453   }
6454   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
6455   if(opcode[i]==2) // J
6456   {
6457     unconditional=1;
6458   }
6459   if(opcode[i]==3) // JAL
6460   {
6461     // TODO: mini_ht
6462     int rt=get_reg(i_regs->regmap,31);
6463     emit_movimm(start+i*4+8,rt);
6464     unconditional=1;
6465   }
6466   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6467   {
6468     emit_mov(s1l,addr);
6469     if(opcode2[i]==9) // JALR
6470     {
6471       int rt=get_reg(i_regs->regmap,rt1[i]);
6472       emit_movimm(start+i*4+8,rt);
6473     }
6474   }
6475   if((opcode[i]&0x3f)==4) // BEQ
6476   {
6477     if(rs1[i]==rs2[i])
6478     {
6479       unconditional=1;
6480     }
6481     else
6482     #ifdef HAVE_CMOV_IMM
6483     if(s1h<0) {
6484       if(s2l>=0) emit_cmp(s1l,s2l);
6485       else emit_test(s1l,s1l);
6486       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6487     }
6488     else
6489     #endif
6490     {
6491       assert(s1l>=0);
6492       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6493       if(s1h>=0) {
6494         if(s2h>=0) emit_cmp(s1h,s2h);
6495         else emit_test(s1h,s1h);
6496         emit_cmovne_reg(alt,addr);
6497       }
6498       if(s2l>=0) emit_cmp(s1l,s2l);
6499       else emit_test(s1l,s1l);
6500       emit_cmovne_reg(alt,addr);
6501     }
6502   }
6503   if((opcode[i]&0x3f)==5) // BNE
6504   {
6505     #ifdef HAVE_CMOV_IMM
6506     if(s1h<0) {
6507       if(s2l>=0) emit_cmp(s1l,s2l);
6508       else emit_test(s1l,s1l);
6509       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6510     }
6511     else
6512     #endif
6513     {
6514       assert(s1l>=0);
6515       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6516       if(s1h>=0) {
6517         if(s2h>=0) emit_cmp(s1h,s2h);
6518         else emit_test(s1h,s1h);
6519         emit_cmovne_reg(alt,addr);
6520       }
6521       if(s2l>=0) emit_cmp(s1l,s2l);
6522       else emit_test(s1l,s1l);
6523       emit_cmovne_reg(alt,addr);
6524     }
6525   }
6526   if((opcode[i]&0x3f)==0x14) // BEQL
6527   {
6528     if(s1h>=0) {
6529       if(s2h>=0) emit_cmp(s1h,s2h);
6530       else emit_test(s1h,s1h);
6531       nottaken=(int)out;
6532       emit_jne(0);
6533     }
6534     if(s2l>=0) emit_cmp(s1l,s2l);
6535     else emit_test(s1l,s1l);
6536     if(nottaken) set_jump_target(nottaken,(int)out);
6537     nottaken=(int)out;
6538     emit_jne(0);
6539   }
6540   if((opcode[i]&0x3f)==0x15) // BNEL
6541   {
6542     if(s1h>=0) {
6543       if(s2h>=0) emit_cmp(s1h,s2h);
6544       else emit_test(s1h,s1h);
6545       taken=(int)out;
6546       emit_jne(0);
6547     }
6548     if(s2l>=0) emit_cmp(s1l,s2l);
6549     else emit_test(s1l,s1l);
6550     nottaken=(int)out;
6551     emit_jeq(0);
6552     if(taken) set_jump_target(taken,(int)out);
6553   }
6554   if((opcode[i]&0x3f)==6) // BLEZ
6555   {
6556     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6557     emit_cmpimm(s1l,1);
6558     if(s1h>=0) emit_mov(addr,ntaddr);
6559     emit_cmovl_reg(alt,addr);
6560     if(s1h>=0) {
6561       emit_test(s1h,s1h);
6562       emit_cmovne_reg(ntaddr,addr);
6563       emit_cmovs_reg(alt,addr);
6564     }
6565   }
6566   if((opcode[i]&0x3f)==7) // BGTZ
6567   {
6568     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6569     emit_cmpimm(s1l,1);
6570     if(s1h>=0) emit_mov(addr,alt);
6571     emit_cmovl_reg(ntaddr,addr);
6572     if(s1h>=0) {
6573       emit_test(s1h,s1h);
6574       emit_cmovne_reg(alt,addr);
6575       emit_cmovs_reg(ntaddr,addr);
6576     }
6577   }
6578   if((opcode[i]&0x3f)==0x16) // BLEZL
6579   {
6580     assert((opcode[i]&0x3f)!=0x16);
6581   }
6582   if((opcode[i]&0x3f)==0x17) // BGTZL
6583   {
6584     assert((opcode[i]&0x3f)!=0x17);
6585   }
6586   assert(opcode[i]!=1); // BLTZ/BGEZ
6587
6588   //FIXME: Check CSREG
6589   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6590     if((source[i]&0x30000)==0) // BC1F
6591     {
6592       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6593       emit_testimm(s1l,0x800000);
6594       emit_cmovne_reg(alt,addr);
6595     }
6596     if((source[i]&0x30000)==0x10000) // BC1T
6597     {
6598       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6599       emit_testimm(s1l,0x800000);
6600       emit_cmovne_reg(alt,addr);
6601     }
6602     if((source[i]&0x30000)==0x20000) // BC1FL
6603     {
6604       emit_testimm(s1l,0x800000);
6605       nottaken=(int)out;
6606       emit_jne(0);
6607     }
6608     if((source[i]&0x30000)==0x30000) // BC1TL
6609     {
6610       emit_testimm(s1l,0x800000);
6611       nottaken=(int)out;
6612       emit_jeq(0);
6613     }
6614   }
6615
6616   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6617   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6618   if(likely[i]||unconditional)
6619   {
6620     emit_movimm(ba[i],HOST_BTREG);
6621   }
6622   else if(addr!=HOST_BTREG)
6623   {
6624     emit_mov(addr,HOST_BTREG);
6625   }
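  // Emit a placeholder jump followed by an external-jump stub: if the target
  // is already compiled, patch the jump straight to it and record the link,
  // otherwise route it through the stub so the target gets compiled on first use.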
6626   void *branch_addr=out;
6627   emit_jmp(0);
6628   int target_addr=start+i*4+5; // delay-slot address +1: matches the special entry pagespan_ds registers for the next block
6629   void *stub=out;
6630   void *compiled_target_addr=check_addr(target_addr);
6631   emit_extjump_ds((int)branch_addr,target_addr);
6632   if(compiled_target_addr) {
6633     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6634     add_link(target_addr,stub);
6635   }
6636   else set_jump_target((int)branch_addr,(int)stub);
6637   if(likely[i]) {
6638     // Not-taken path
6639     set_jump_target((int)nottaken,(int)out);
6640     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6641     void *branch_addr=out;
6642     emit_jmp(0);
6643     int target_addr=start+i*4+8;
6644     void *stub=out;
6645     void *compiled_target_addr=check_addr(target_addr);
6646     emit_extjump_ds((int)branch_addr,target_addr);
6647     if(compiled_target_addr) {
6648       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6649       add_link(target_addr,stub);
6650     }
6651     else set_jump_target((int)branch_addr,(int)stub);
6652   }
6653 }
6654
6655 // Assemble the delay slot for the above
6656 static void pagespan_ds()
6657 {
6658   assem_debug("initial delay slot:\n");
6659   u_int vaddr=start+1; // odd address marks the delay-slot entry, keeping it distinct from the block's normal entry at 'start'
6660   u_int page=get_page(vaddr);
6661   u_int vpage=get_vpage(vaddr);
6662   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6663   do_dirty_stub_ds();
6664   ll_add(jump_in+page,vaddr,(void *)out);
6665   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6666   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6667     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6668   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6669     emit_writeword(HOST_BTREG,(int)&branch_target);
6670   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6671   address_generation(0,&regs[0],regs[0].regmap_entry);
6672   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6673     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6674   cop1_usable=0;
6675   is_delayslot=0;
6676   switch(itype[0]) {
6677     case ALU:
6678       alu_assemble(0,&regs[0]);break;
6679     case IMM16:
6680       imm16_assemble(0,&regs[0]);break;
6681     case SHIFT:
6682       shift_assemble(0,&regs[0]);break;
6683     case SHIFTIMM:
6684       shiftimm_assemble(0,&regs[0]);break;
6685     case LOAD:
6686       load_assemble(0,&regs[0]);break;
6687     case LOADLR:
6688       loadlr_assemble(0,&regs[0]);break;
6689     case STORE:
6690       store_assemble(0,&regs[0]);break;
6691     case STORELR:
6692       storelr_assemble(0,&regs[0]);break;
6693     case COP0:
6694       cop0_assemble(0,&regs[0]);break;
6695     case COP1:
6696       cop1_assemble(0,&regs[0]);break;
6697     case C1LS:
6698       c1ls_assemble(0,&regs[0]);break;
6699     case COP2:
6700       cop2_assemble(0,&regs[0]);break;
6701     case C2LS:
6702       c2ls_assemble(0,&regs[0]);break;
6703     case C2OP:
6704       c2op_assemble(0,&regs[0]);break;
6705     case FCONV:
6706       fconv_assemble(0,&regs[0]);break;
6707     case FLOAT:
6708       float_assemble(0,&regs[0]);break;
6709     case FCOMP:
6710       fcomp_assemble(0,&regs[0]);break;
6711     case MULTDIV:
6712       multdiv_assemble(0,&regs[0]);break;
6713     case MOV:
6714       mov_assemble(0,&regs[0]);break;
6715     case SYSCALL:
6716     case HLECALL:
6717     case INTCALL:
6718     case SPAN:
6719     case UJUMP:
6720     case RJUMP:
6721     case CJUMP:
6722     case SJUMP:
6723     case FJUMP:
6724       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
6725   }
6726   int btaddr=get_reg(regs[0].regmap,BTREG);
6727   if(btaddr<0) {
6728     btaddr=get_reg(regs[0].regmap,-1);
6729     emit_readword((int)&branch_target,btaddr);
6730   }
6731   assert(btaddr!=HOST_CCREG);
6732   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6733 #ifdef HOST_IMM8
6734   emit_movimm(start+4,HOST_TEMPREG);
6735   emit_cmp(btaddr,HOST_TEMPREG);
6736 #else
6737   emit_cmpimm(btaddr,start+4);
6738 #endif
6739   int branch=(int)out;
6740   emit_jeq(0);
6741   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6742   emit_jmp(jump_vaddr_reg[btaddr]);
6743   set_jump_target(branch,(int)out);
6744   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6745   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6746 }
6747
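/* A minimal illustrative sketch (not called anywhere) of the per-instruction
   update that unneeded_registers() below applies while scanning backwards:
   a set bit means "this MIPS register's value is not needed past this point".
   The function name and parameters are hypothetical, for illustration only. */
static inline uint64_t unneeded_step_sketch(uint64_t u, int rt, int rs)
{
  u|=1LL<<rt;    // a register written here is dead before this instruction
  u&=~(1LL<<rs); // a register read here must be live before this instruction
  return u|1;    // r0 is always unneeded
}
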
6748 // Basic liveness analysis for MIPS registers
6749 void unneeded_registers(int istart,int iend,int r)
6750 {
6751   int i;
6752   uint64_t u,uu,gte_u,b,bu,gte_bu;
6753   uint64_t temp_u,temp_uu,temp_gte_u=0;
6754   uint64_t tdep;
6755   uint64_t gte_u_unknown=0;
6756   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
6757     gte_u_unknown=~0ll;
6758   if(iend==slen-1) {
6759     u=1;uu=1;
6760     gte_u=gte_u_unknown;
6761   }else{
6762     u=unneeded_reg[iend+1];
6763     uu=unneeded_reg_upper[iend+1];
6764     u=1;uu=1; // overrides the values loaded above; only r0 is treated as unneeded at the end of the range
6765     gte_u=gte_unneeded[iend+1];
6766   }
6767
6768   for (i=iend;i>=istart;i--)
6769   {
6770     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6771     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6772     {
6773       // If subroutine call, flag return address as a possible branch target
6774       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6775       
6776       if(ba[i]<start || ba[i]>=(start+slen*4))
6777       {
6778         // Branch out of this block, flush all regs
6779         u=1;
6780         uu=1;
6781         gte_u=gte_u_unknown;
6782         /* Hexagon hack 
6783         if(itype[i]==UJUMP&&rt1[i]==31)
6784         {
6785           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6786         }
6787         if(itype[i]==RJUMP&&rs1[i]==31)
6788         {
6789           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6790         }
6791         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6792           if(itype[i]==UJUMP&&rt1[i]==31)
6793           {
6794             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6795             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6796           }
6797           if(itype[i]==RJUMP&&rs1[i]==31)
6798           {
6799             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6800             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6801           }
6802         }*/
6803         branch_unneeded_reg[i]=u;
6804         branch_unneeded_reg_upper[i]=uu;
6805         // Merge in delay slot
6806         tdep=(~uu>>rt1[i+1])&1;
6807         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6808         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6809         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6810         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6811         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6812         u|=1;uu|=1;
6813         gte_u|=gte_rt[i+1];
6814         gte_u&=~gte_rs[i+1];
6815         // If branch is "likely" (and conditional)
6816         // then we skip the delay slot on the fall-thru path
6817         if(likely[i]) {
6818           if(i<slen-1) {
6819             u&=unneeded_reg[i+2];
6820             uu&=unneeded_reg_upper[i+2];
6821             gte_u&=gte_unneeded[i+2];
6822           }
6823           else
6824           {
6825             u=1;
6826             uu=1;
6827             gte_u=gte_u_unknown;
6828           }
6829         }
6830       }
6831       else
6832       {
6833         // Internal branch, flag target
6834         bt[(ba[i]-start)>>2]=1;
6835         if(ba[i]<=start+i*4) {
6836           // Backward branch
6837           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6838           {
6839             // Unconditional branch
6840             temp_u=1;temp_uu=1;
6841             temp_gte_u=0;
6842           } else {
6843             // Conditional branch (not taken case)
6844             temp_u=unneeded_reg[i+2];
6845             temp_uu=unneeded_reg_upper[i+2];
6846             temp_gte_u&=gte_unneeded[i+2];
6847           }
6848           // Merge in delay slot
6849           tdep=(~temp_uu>>rt1[i+1])&1;
6850           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6851           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6852           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6853           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6854           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6855           temp_u|=1;temp_uu|=1;
6856           temp_gte_u|=gte_rt[i+1];
6857           temp_gte_u&=~gte_rs[i+1];
6858           // If branch is "likely" (and conditional)
6859           // then we skip the delay slot on the fall-thru path
6860           if(likely[i]) {
6861             if(i<slen-1) {
6862               temp_u&=unneeded_reg[i+2];
6863               temp_uu&=unneeded_reg_upper[i+2];
6864               temp_gte_u&=gte_unneeded[i+2];
6865             }
6866             else
6867             {
6868               temp_u=1;
6869               temp_uu=1;
6870               temp_gte_u=gte_u_unknown;
6871             }
6872           }
6873           tdep=(~temp_uu>>rt1[i])&1;
6874           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6875           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6876           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6877           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6878           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6879           temp_u|=1;temp_uu|=1;
6880           temp_gte_u|=gte_rt[i];
6881           temp_gte_u&=~gte_rs[i];
6882           unneeded_reg[i]=temp_u;
6883           unneeded_reg_upper[i]=temp_uu;
6884           gte_unneeded[i]=temp_gte_u;
6885           // Only go three levels deep.  This recursion can take an
6886           // excessive amount of time if there are a lot of nested loops.
6887           if(r<2) {
6888             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6889           }else{
6890             unneeded_reg[(ba[i]-start)>>2]=1;
6891             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6892             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
6893           }
6894         } /*else*/ if(1) {
6895           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6896           {
6897             // Unconditional branch
6898             u=unneeded_reg[(ba[i]-start)>>2];
6899             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6900             gte_u=gte_unneeded[(ba[i]-start)>>2];
6901             branch_unneeded_reg[i]=u;
6902             branch_unneeded_reg_upper[i]=uu;
6903         //u=1;
6904         //uu=1;
6905         //branch_unneeded_reg[i]=u;
6906         //branch_unneeded_reg_upper[i]=uu;
6907             // Merge in delay slot
6908             tdep=(~uu>>rt1[i+1])&1;
6909             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6910             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6911             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6912             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6913             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6914             u|=1;uu|=1;
6915             gte_u|=gte_rt[i+1];
6916             gte_u&=~gte_rs[i+1];
6917           } else {
6918             // Conditional branch
6919             b=unneeded_reg[(ba[i]-start)>>2];
6920             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6921             gte_bu=gte_unneeded[(ba[i]-start)>>2];
6922             branch_unneeded_reg[i]=b;
6923             branch_unneeded_reg_upper[i]=bu;
6924         //b=1;
6925         //bu=1;
6926         //branch_unneeded_reg[i]=b;
6927         //branch_unneeded_reg_upper[i]=bu;
6928             // Branch delay slot
6929             tdep=(~uu>>rt1[i+1])&1;
6930             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6931             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6932             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6933             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6934             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6935             b|=1;bu|=1;
6936             gte_bu|=gte_rt[i+1];
6937             gte_bu&=~gte_rs[i+1];
6938             // If branch is "likely" then we skip the
6939             // delay slot on the fall-thru path
6940             if(likely[i]) {
6941               u=b;
6942               uu=bu;
6943               gte_u=gte_bu;
6944               if(i<slen-1) {
6945                 u&=unneeded_reg[i+2];
6946                 uu&=unneeded_reg_upper[i+2];
6947                 gte_u&=gte_unneeded[i+2];
6948         //u=1;
6949         //uu=1;
6950               }
6951             } else {
6952               u&=b;
6953               uu&=bu;
6954               gte_u&=gte_bu;
6955         //u=1;
6956         //uu=1;
6957             }
6958             if(i<slen-1) {
6959               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6960               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6961         //branch_unneeded_reg[i]=1;
6962         //branch_unneeded_reg_upper[i]=1;
6963             } else {
6964               branch_unneeded_reg[i]=1;
6965               branch_unneeded_reg_upper[i]=1;
6966             }
6967           }
6968         }
6969       }
6970     }
6971     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6972     {
6973       // SYSCALL instruction (software interrupt)
6974       u=1;
6975       uu=1;
6976     }
6977     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6978     {
6979       // ERET instruction (return from interrupt)
6980       u=1;
6981       uu=1;
6982     }
6983     //u=uu=1; // DEBUG
6984     tdep=(~uu>>rt1[i])&1;
6985     // Written registers are unneeded
6986     u|=1LL<<rt1[i];
6987     u|=1LL<<rt2[i];
6988     uu|=1LL<<rt1[i];
6989     uu|=1LL<<rt2[i];
6990     gte_u|=gte_rt[i];
6991     // Accessed registers are needed
6992     u&=~(1LL<<rs1[i]);
6993     u&=~(1LL<<rs2[i]);
6994     uu&=~(1LL<<us1[i]);
6995     uu&=~(1LL<<us2[i]);
6996     gte_u&=~gte_rs[i];
6997     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
6998       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
6999     // Source-target dependencies
7000     uu&=~(tdep<<dep1[i]);
7001     uu&=~(tdep<<dep2[i]);
7002     // R0 is always unneeded
7003     u|=1;uu|=1;
7004     // Save it
7005     unneeded_reg[i]=u;
7006     unneeded_reg_upper[i]=uu;
7007     gte_unneeded[i]=gte_u;
7008     /*
7009     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
7010     printf("U:");
7011     int r;
7012     for(r=1;r<=CCREG;r++) {
7013       if((unneeded_reg[i]>>r)&1) {
7014         if(r==HIREG) printf(" HI");
7015         else if(r==LOREG) printf(" LO");
7016         else printf(" r%d",r);
7017       }
7018     }
7019     printf(" UU:");
7020     for(r=1;r<=CCREG;r++) {
7021       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
7022         if(r==HIREG) printf(" HI");
7023         else if(r==LOREG) printf(" LO");
7024         else printf(" r%d",r);
7025       }
7026     }
7027     printf("\n");*/
7028   }
7029 #ifdef FORCE32
7030   for (i=iend;i>=istart;i--)
7031   {
7032     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
7033   }
7034 #endif
7035 }
7036
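/* Illustrative sketch (not called anywhere) of how provisional_32bit() below
   propagates 32-bit-ness through a move-like or ORI/XORI-style operation:
   the destination is 32-bit exactly when the source is.  The function name
   and parameters are hypothetical, for illustration only. */
static inline uint64_t is32_copy_sketch(uint64_t is32, int rt, int s1)
{
  uint64_t sr=(is32>>s1)&1LL; // 1 if the source currently holds a 32-bit value
  is32&=~(1LL<<rt);           // clear the destination's status bit...
  is32|=sr<<rt;               // ...and copy the source's status into it
  return is32|1;              // r0 always counts as 32-bit
}
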
7037 // Identify registers which are likely to contain 32-bit values
7038 // This is used to predict whether any branches will jump to a
7039 // location with 64-bit values in registers.
7040 static void provisional_32bit()
7041 {
7042   int i,j;
7043   uint64_t is32=1;
7044   uint64_t lastbranch=1;
7045   
7046   for(i=0;i<slen;i++)
7047   {
7048     if(i>0) {
7049       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
7050         if(i>1) is32=lastbranch;
7051         else is32=1;
7052       }
7053     }
7054     if(i>1)
7055     {
7056       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
7057         if(likely[i-2]) {
7058           if(i>2) is32=lastbranch;
7059           else is32=1;
7060         }
7061       }
7062       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
7063       {
7064         if(rs1[i-2]==0||rs2[i-2]==0)
7065         {
7066           if(rs1[i-2]) {
7067             is32|=1LL<<rs1[i-2];
7068           }
7069           if(rs2[i-2]) {
7070             is32|=1LL<<rs2[i-2];
7071           }
7072         }
7073       }
7074     }
7075     // If something jumps here with 64-bit values
7076     // then promote those registers to 64 bits
7077     if(bt[i])
7078     {
7079       uint64_t temp_is32=is32;
7080       for(j=i-1;j>=0;j--)
7081       {
7082         if(ba[j]==start+i*4) 
7083           //temp_is32&=branch_regs[j].is32;
7084           temp_is32&=p32[j];
7085       }
7086       for(j=i;j<slen;j++)
7087       {
7088         if(ba[j]==start+i*4) 
7089           temp_is32=1;
7090       }
7091       is32=temp_is32;
7092     }
7093     int type=itype[i];
7094     int op=opcode[i];
7095     int op2=opcode2[i];
7096     int rt=rt1[i];
7097     int s1=rs1[i];
7098     int s2=rs2[i];
7099     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7100       // Branches don't write registers, consider the delay slot instead.
7101       type=itype[i+1];
7102       op=opcode[i+1];
7103       op2=opcode2[i+1];
7104       rt=rt1[i+1];
7105       s1=rs1[i+1];
7106       s2=rs2[i+1];
7107       lastbranch=is32;
7108     }
7109     switch(type) {
7110       case LOAD:
7111         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
7112            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
7113           is32&=~(1LL<<rt);
7114         else
7115           is32|=1LL<<rt;
7116         break;
7117       case STORE:
7118       case STORELR:
7119         break;
7120       case LOADLR:
7121         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
7122         if(op==0x22) is32|=1LL<<rt; // LWL
7123         break;
7124       case IMM16:
7125         if (op==0x08||op==0x09|| // ADDI/ADDIU
7126             op==0x0a||op==0x0b|| // SLTI/SLTIU
7127             op==0x0c|| // ANDI
7128             op==0x0f)  // LUI
7129         {
7130           is32|=1LL<<rt;
7131         }
7132         if(op==0x18||op==0x19) { // DADDI/DADDIU
7133           is32&=~(1LL<<rt);
7134           //if(imm[i]==0)
7135           //  is32|=((is32>>s1)&1LL)<<rt;
7136         }
7137         if(op==0x0d||op==0x0e) { // ORI/XORI
7138           uint64_t sr=((is32>>s1)&1LL);
7139           is32&=~(1LL<<rt);
7140           is32|=sr<<rt;
7141         }
7142         break;
7143       case UJUMP:
7144         break;
7145       case RJUMP:
7146         break;
7147       case CJUMP:
7148         break;
7149       case SJUMP:
7150         break;
7151       case FJUMP:
7152         break;
7153       case ALU:
7154         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
7155           is32|=1LL<<rt;
7156         }
7157         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7158           is32|=1LL<<rt;
7159         }
7160         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7161           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
7162           is32&=~(1LL<<rt);
7163           is32|=sr<<rt;
7164         }
7165         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
7166           if(s1==0&&s2==0) {
7167             is32|=1LL<<rt;
7168           }
7169           else if(s2==0) {
7170             uint64_t sr=((is32>>s1)&1LL);
7171             is32&=~(1LL<<rt);
7172             is32|=sr<<rt;
7173           }
7174           else if(s1==0) {
7175             uint64_t sr=((is32>>s2)&1LL);
7176             is32&=~(1LL<<rt);
7177             is32|=sr<<rt;
7178           }
7179           else {
7180             is32&=~(1LL<<rt);
7181           }
7182         }
7183         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
7184           if(s1==0&&s2==0) {
7185             is32|=1LL<<rt;
7186           }
7187           else if(s2==0) {
7188             uint64_t sr=((is32>>s1)&1LL);
7189             is32&=~(1LL<<rt);
7190             is32|=sr<<rt;
7191           }
7192           else {
7193             is32&=~(1LL<<rt);
7194           }
7195         }
7196         break;
7197       case MULTDIV:
7198         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7199           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
7200         }
7201         else {
7202           is32|=(1LL<<HIREG)|(1LL<<LOREG);
7203         }
7204         break;
7205       case MOV:
7206         {
7207           uint64_t sr=((is32>>s1)&1LL);
7208           is32&=~(1LL<<rt);
7209           is32|=sr<<rt;
7210         }
7211         break;
7212       case SHIFT:
7213         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
7214         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
7215         break;
7216       case SHIFTIMM:
7217         is32|=1LL<<rt;
7218         // DSLL/DSRL/DSRA/DSLL32/DSRL32 but not DSRA32 have 64-bit result
7219         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
7220         break;
7221       case COP0:
7222         if(op2==0) is32|=1LL<<rt; // MFC0
7223         break;
7224       case COP1:
7225       case COP2:
7226         if(op2==0) is32|=1LL<<rt; // MFC1/MFC2
7227         if(op2==1) is32&=~(1LL<<rt); // DMFC1/DMFC2
7228         if(op2==2) is32|=1LL<<rt; // CFC1/CFC2
7229         break;
7230       case C1LS:
7231       case C2LS:
7232         break;
7233       case FLOAT:
7234       case FCONV:
7235         break;
7236       case FCOMP:
7237         break;
7238       case C2OP:
7239       case SYSCALL:
7240       case HLECALL:
7241         break;
7242       default:
7243         break;
7244     }
7245     is32|=1;
7246     p32[i]=is32;
7247
7248     if(i>0)
7249     {
7250       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
7251       {
7252         if(rt1[i-1]==31) // JAL/JALR
7253         {
7254           // Subroutine call will return here, don't alloc any registers
7255           is32=1;
7256         }
7257         else if(i+1<slen)
7258         {
7259           // Internal branch will jump here, match registers to caller
7260           is32=0x3FFFFFFFFLL;
7261         }
7262       }
7263     }
7264   }
7265 }
7266
7267 // Identify registers which may be assumed to contain 32-bit values
7268 // and where optimizations will rely on this.
7269 // This is used to determine whether backward branches can safely
7270 // jump to a location with 64-bit values in registers.
7271 static void provisional_r32()
7272 {
7273   u_int r32=0;
7274   int i;
7275   
7276   for (i=slen-1;i>=0;i--)
7277   {
7278     int hr;
7279     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7280     {
7281       if(ba[i]<start || ba[i]>=(start+slen*4))
7282       {
7283         // Branch out of this block, don't need anything
7284         r32=0;
7285       }
7286       else
7287       {
7288         // Internal branch
7289         // Need whatever matches the target
7290         // (and doesn't get overwritten by the delay slot instruction)
7291         r32=0;
7292         int t=(ba[i]-start)>>2;
7293         if(ba[i]>start+i*4) {
7294           // Forward branch
7295           //if(!(requires_32bit[t]&~regs[i].was32))
7296           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7297           if(!(pr32[t]&~regs[i].was32))
7298             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7299         }else{
7300           // Backward branch
7301           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
7302             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7303         }
7304       }
7305       // Conditional branch may need registers for following instructions
7306       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7307       {
7308         if(i<slen-2) {
7309           //r32|=requires_32bit[i+2];
7310           r32|=pr32[i+2];
7311           r32&=regs[i].was32;
7312           // Mark this address as a branch target since it may be called
7313           // upon return from interrupt
7314           //bt[i+2]=1;
7315         }
7316       }
7317       // Merge in delay slot
7318       if(!likely[i]) {
7319         // These are overwritten unless the branch is "likely"
7320         // and the delay slot is nullified if not taken
7321         r32&=~(1LL<<rt1[i+1]);
7322         r32&=~(1LL<<rt2[i+1]);
7323       }
7324       // Assume these are needed (delay slot)
7325       if(us1[i+1]>0)
7326       {
7327         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
7328       }
7329       if(us2[i+1]>0)
7330       {
7331         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
7332       }
7333       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
7334       {
7335         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
7336       }
7337       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
7338       {
7339         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
7340       }
7341     }
7342     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7343     {
7344       // SYSCALL instruction (software interrupt)
7345       r32=0;
7346     }
7347     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7348     {
7349       // ERET instruction (return from interrupt)
7350       r32=0;
7351     }
7352     // Check 32 bits
7353     r32&=~(1LL<<rt1[i]);
7354     r32&=~(1LL<<rt2[i]);
7355     if(us1[i]>0)
7356     {
7357       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
7358     }
7359     if(us2[i]>0)
7360     {
7361       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
7362     }
7363     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
7364     {
7365       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
7366     }
7367     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
7368     {
7369       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
7370     }
7371     //requires_32bit[i]=r32;
7372     pr32[i]=r32;
7373     
7374     // Dirty registers which are 32-bit, require 32-bit input
7375     // as they will be written as 32-bit values
7376     for(hr=0;hr<HOST_REGS;hr++)
7377     {
7378       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
7379         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
7380           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
7381             pr32[i]|=1LL<<regs[i].regmap_entry[hr];
7382           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
7383         }
7384       }
7385     }
7386   }
7387 }
7388
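/* Illustrative sketch (not called anywhere) of the per-host-register test that
   clean_registers() below repeats in many variations: a host register becomes
   "will dirty" when the MIPS register it maps (low 6 bits of the regmap entry)
   is written by the instruction, and an unmapped entry can never be dirty.
   Names are hypothetical, for illustration only. */
static inline u_int will_dirty_sketch(u_int will_dirty, signed char map, int hr, int rt)
{
  if((map&63)==rt) will_dirty|=1<<hr; // maps a register this instruction writes
  if(map<=0) will_dirty&=~(1<<hr);    // unmapped (or r0-mapped) entries stay clean
  return will_dirty;
}
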
7389 // Write back dirty registers as soon as we will no longer modify them,
7390 // so that we don't end up with lots of writes at the branches.
7391 void clean_registers(int istart,int iend,int wr)
7392 {
7393   int i;
7394   int r;
7395   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
7396   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
7397   if(iend==slen-1) {
7398     will_dirty_i=will_dirty_next=0;
7399     wont_dirty_i=wont_dirty_next=0;
7400   }else{
7401     will_dirty_i=will_dirty_next=will_dirty[iend+1];
7402     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
7403   }
7404   for (i=iend;i>=istart;i--)
7405   {
7406     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7407     {
7408       if(ba[i]<start || ba[i]>=(start+slen*4))
7409       {
7410         // Branch out of this block, flush all regs
7411         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7412         {
7413           // Unconditional branch
7414           will_dirty_i=0;
7415           wont_dirty_i=0;
7416           // Merge in delay slot (will dirty)
7417           for(r=0;r<HOST_REGS;r++) {
7418             if(r!=EXCLUDE_REG) {
7419               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7420               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7421               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7422               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7423               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7424               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7425               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7426               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7427               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7428               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7429               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7430               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7431               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7432               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7433             }
7434           }
7435         }
7436         else
7437         {
7438           // Conditional branch
7439           will_dirty_i=0;
7440           wont_dirty_i=wont_dirty_next;
7441           // Merge in delay slot (will dirty)
7442           for(r=0;r<HOST_REGS;r++) {
7443             if(r!=EXCLUDE_REG) {
7444               if(!likely[i]) {
7445                 // Might not dirty if likely branch is not taken
7446                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7447                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7448                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7449                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7450                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7451                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
7452                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7453                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7454                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7455                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7456                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7457                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7458                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7459                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7460               }
7461             }
7462           }
7463         }
7464         // Merge in delay slot (wont dirty)
7465         for(r=0;r<HOST_REGS;r++) {
7466           if(r!=EXCLUDE_REG) {
7467             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7468             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7469             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7470             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7471             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7472             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7473             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7474             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7475             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7476             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7477           }
7478         }
7479         if(wr) {
7480           #ifndef DESTRUCTIVE_WRITEBACK
7481           branch_regs[i].dirty&=wont_dirty_i;
7482           #endif
7483           branch_regs[i].dirty|=will_dirty_i;
7484         }
7485       }
7486       else
7487       {
7488         // Internal branch
7489         if(ba[i]<=start+i*4) {
7490           // Backward branch
7491           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7492           {
7493             // Unconditional branch
7494             temp_will_dirty=0;
7495             temp_wont_dirty=0;
7496             // Merge in delay slot (will dirty)
7497             for(r=0;r<HOST_REGS;r++) {
7498               if(r!=EXCLUDE_REG) {
7499                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7500                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7501                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7502                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7503                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7504                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7505                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7506                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7507                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7508                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7509                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7510                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7511                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7512                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7513               }
7514             }
7515           } else {
7516             // Conditional branch (not taken case)
7517             temp_will_dirty=will_dirty_next;
7518             temp_wont_dirty=wont_dirty_next;
7519             // Merge in delay slot (will dirty)
7520             for(r=0;r<HOST_REGS;r++) {
7521               if(r!=EXCLUDE_REG) {
7522                 if(!likely[i]) {
7523                   // Will not dirty if likely branch is not taken
7524                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7525                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7526                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7527                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7528                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7529                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7530                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7531                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7532                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7533                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7534                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7535                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7536                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7537                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7538                 }
7539               }
7540             }
7541           }
7542           // Merge in delay slot (wont dirty)
7543           for(r=0;r<HOST_REGS;r++) {
7544             if(r!=EXCLUDE_REG) {
7545               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7546               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7547               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7548               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7549               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7550               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7551               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7552               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7553               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7554               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7555             }
7556           }
7557           // Deal with changed mappings
7558           if(i<iend) {
7559             for(r=0;r<HOST_REGS;r++) {
7560               if(r!=EXCLUDE_REG) {
7561                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7562                   temp_will_dirty&=~(1<<r);
7563                   temp_wont_dirty&=~(1<<r);
7564                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7565                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7566                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7567                   } else {
7568                     temp_will_dirty|=1<<r;
7569                     temp_wont_dirty|=1<<r;
7570                   }
7571                 }
7572               }
7573             }
7574           }
7575           if(wr) {
7576             will_dirty[i]=temp_will_dirty;
7577             wont_dirty[i]=temp_wont_dirty;
7578             clean_registers((ba[i]-start)>>2,i-1,0);
7579           }else{
7580             // Limit recursion.  It can take an excessive amount
7581             // of time if there are a lot of nested loops.
7582             will_dirty[(ba[i]-start)>>2]=0;
7583             wont_dirty[(ba[i]-start)>>2]=-1;
7584           }
7585         }
7586         /*else*/ if(1)
7587         {
7588           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7589           {
7590             // Unconditional branch
7591             will_dirty_i=0;
7592             wont_dirty_i=0;
7593           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7594             for(r=0;r<HOST_REGS;r++) {
7595               if(r!=EXCLUDE_REG) {
7596                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7597                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7598                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7599                 }
7600                 if(branch_regs[i].regmap[r]>=0) {
7601                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7602                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7603                 }
7604               }
7605             }
7606           //}
7607             // Merge in delay slot
7608             for(r=0;r<HOST_REGS;r++) {
7609               if(r!=EXCLUDE_REG) {
7610                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7611                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7612                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7613                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7614                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7615                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7616                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7617                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7618                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7619                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7620                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7621                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7622                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7623                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7624               }
7625             }
7626           } else {
7627             // Conditional branch
7628             will_dirty_i=will_dirty_next;
7629             wont_dirty_i=wont_dirty_next;
7630           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7631             for(r=0;r<HOST_REGS;r++) {
7632               if(r!=EXCLUDE_REG) {
7633                 signed char target_reg=branch_regs[i].regmap[r];
7634                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7635                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7636                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7637                 }
7638                 else if(target_reg>=0) {
7639                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7640                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7641                 }
7642                 // Treat delay slot as part of branch too
7643                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7644                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7645                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7646                 }
7647                 else
7648                 {
7649                   will_dirty[i+1]&=~(1<<r);
7650                 }*/
7651               }
7652             }
7653           //}
7654             // Merge in delay slot
7655             for(r=0;r<HOST_REGS;r++) {
7656               if(r!=EXCLUDE_REG) {
7657                 if(!likely[i]) {
7658                   // Might not dirty if likely branch is not taken
7659                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7660                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7661                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7662                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7663                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7664                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7665                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7666                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7667                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7668                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7669                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7670                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7671                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7672                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7673                 }
7674               }
7675             }
7676           }
7677           // Merge in delay slot (won't dirty)
7678           for(r=0;r<HOST_REGS;r++) {
7679             if(r!=EXCLUDE_REG) {
7680               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7681               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7682               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7683               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7684               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7685               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7686               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7687               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7688               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7689               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7690             }
7691           }
7692           if(wr) {
7693             #ifndef DESTRUCTIVE_WRITEBACK
7694             branch_regs[i].dirty&=wont_dirty_i;
7695             #endif
7696             branch_regs[i].dirty|=will_dirty_i;
7697           }
7698         }
7699       }
7700     }
7701     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7702     {
7703       // SYSCALL instruction (software interrupt)
7704       will_dirty_i=0;
7705       wont_dirty_i=0;
7706     }
7707     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7708     {
7709       // ERET instruction (return from interrupt)
7710       will_dirty_i=0;
7711       wont_dirty_i=0;
7712     }
7713     will_dirty_next=will_dirty_i;
7714     wont_dirty_next=wont_dirty_i;
7715     for(r=0;r<HOST_REGS;r++) {
7716       if(r!=EXCLUDE_REG) {
7717         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7718         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7719         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7720         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7721         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7722         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7723         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7724         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7725         if(i>istart) {
7726           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7727           {
7728             // Don't store a register immediately after writing it,
7729             // as doing so may prevent dual-issue.
7730             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7731             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7732           }
7733         }
7734       }
7735     }
7736     // Save it
7737     will_dirty[i]=will_dirty_i;
7738     wont_dirty[i]=wont_dirty_i;
7739     // Mark registers that won't be dirtied as not dirty
7740     if(wr) {
7741       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7742       for(r=0;r<HOST_REGS;r++) {
7743         if((will_dirty_i>>r)&1) {
7744           printf(" r%d",r);
7745         }
7746       }
7747       printf("\n");*/
7748
7749       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7750         regs[i].dirty|=will_dirty_i;
7751         #ifndef DESTRUCTIVE_WRITEBACK
7752         regs[i].dirty&=wont_dirty_i;
7753         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7754         {
7755           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7756             for(r=0;r<HOST_REGS;r++) {
7757               if(r!=EXCLUDE_REG) {
7758                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7759                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7760                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);/*assert(!((wont_dirty_i>>r)&1));*/}
7761               }
7762             }
7763           }
7764         }
7765         else
7766         {
7767           if(i<iend) {
7768             for(r=0;r<HOST_REGS;r++) {
7769               if(r!=EXCLUDE_REG) {
7770                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7771                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7772                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);/*assert(!((wont_dirty_i>>r)&1));*/}
7773               }
7774             }
7775           }
7776         }
7777         #endif
7778       //}
7779     }
7780     // Deal with changed mappings
7781     temp_will_dirty=will_dirty_i;
7782     temp_wont_dirty=wont_dirty_i;
7783     for(r=0;r<HOST_REGS;r++) {
7784       if(r!=EXCLUDE_REG) {
7785         int nr;
7786         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7787           if(wr) {
7788             #ifndef DESTRUCTIVE_WRITEBACK
7789             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7790             #endif
7791             regs[i].wasdirty|=will_dirty_i&(1<<r);
7792           }
7793         }
7794         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7795           // Register moved to a different register
7796           will_dirty_i&=~(1<<r);
7797           wont_dirty_i&=~(1<<r);
7798           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7799           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7800           if(wr) {
7801             #ifndef DESTRUCTIVE_WRITEBACK
7802             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7803             #endif
7804             regs[i].wasdirty|=will_dirty_i&(1<<r);
7805           }
7806         }
7807         else {
7808           will_dirty_i&=~(1<<r);
7809           wont_dirty_i&=~(1<<r);
7810           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7811             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7812             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7813           } else {
7814             wont_dirty_i|=1<<r;
7815             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);/*assert(!((will_dirty>>r)&1));*/
7816           }
7817         }
7818       }
7819     }
7820   }
7821 }
7822
7823 #ifdef DISASM
7824   /* disassembly */
7825 void disassemble_inst(int i)
7826 {
7827     if (bt[i]) printf("*"); else printf(" ");
7828     switch(itype[i]) {
7829       case UJUMP:
7830         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7831       case CJUMP:
7832         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7833       case SJUMP:
7834         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7835       case FJUMP:
7836         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7837       case RJUMP:
7838         if (opcode[i]==0x9&&rt1[i]!=31)
7839           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
7840         else
7841           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7842         break;
7843       case SPAN:
7844         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7845       case IMM16:
7846         if(opcode[i]==0xf) //LUI
7847           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7848         else
7849           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7850         break;
7851       case LOAD:
7852       case LOADLR:
7853         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7854         break;
7855       case STORE:
7856       case STORELR:
7857         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7858         break;
7859       case ALU:
7860       case SHIFT:
7861         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7862         break;
7863       case MULTDIV:
7864         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7865         break;
7866       case SHIFTIMM:
7867         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7868         break;
7869       case MOV:
7870         if((opcode2[i]&0x1d)==0x10)
7871           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7872         else if((opcode2[i]&0x1d)==0x11)
7873           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7874         else
7875           printf (" %x: %s\n",start+i*4,insn[i]);
7876         break;
7877       case COP0:
7878         if(opcode2[i]==0)
7879           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7880         else if(opcode2[i]==4)
7881           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7882         else printf (" %x: %s\n",start+i*4,insn[i]);
7883         break;
7884       case COP1:
7885         if(opcode2[i]<3)
7886           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7887         else if(opcode2[i]>3)
7888           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7889         else printf (" %x: %s\n",start+i*4,insn[i]);
7890         break;
7891       case COP2:
7892         if(opcode2[i]<3)
7893           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7894         else if(opcode2[i]>3)
7895           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7896         else printf (" %x: %s\n",start+i*4,insn[i]);
7897         break;
7898       case C1LS:
7899         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7900         break;
7901       case C2LS:
7902         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7903         break;
7904       case INTCALL:
7905         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7906         break;
7907       default:
7908         //printf (" %s %8x\n",insn[i],source[i]);
7909         printf (" %x: %s\n",start+i*4,insn[i]);
7910     }
7911 }
7912 #else
7913 static void disassemble_inst(int i) {}
7914 #endif // DISASM
7915
7916 #define DRC_TEST_VAL 0x74657374 // ASCII "test"
7917
7918 static int new_dynarec_test(void)
7919 {
7920   int (*testfunc)(void) = (void *)out;
7921   int ret;
7922   emit_movimm(DRC_TEST_VAL,0); // test
7923   emit_jmpreg(14);
7924   literal_pool(0);
7925 #ifdef __arm__
7926   __clear_cache((void *)testfunc, out);
7927 #endif
7928   SysPrintf("testing if we can run recompiled code..\n");
7929   ret = testfunc();
7930   if (ret == DRC_TEST_VAL)
7931     SysPrintf("test passed.\n");
7932   else
7933     SysPrintf("test failed: %08x\n", ret);
7934   out=(u_char *)BASE_ADDR;
7935   return ret == DRC_TEST_VAL;
7936 }
7937
7938 // clear the state completely, instead of just marking
7939 // things invalid like invalidate_all_pages() does
7940 void new_dynarec_clear_full()
7941 {
7942   int n;
7943   out=(u_char *)BASE_ADDR;
7944   memset(invalid_code,1,sizeof(invalid_code));
7945   memset(hash_table,0xff,sizeof(hash_table));
7946   memset(mini_ht,-1,sizeof(mini_ht));
7947   memset(restore_candidate,0,sizeof(restore_candidate));
7948   memset(shadow,0,sizeof(shadow));
7949   copy=shadow;
7950   expirep=16384; // Expiry pointer, +2 blocks
7951   pending_exception=0;
7952   literalcount=0;
7953   stop_after_jal=0;
7954   inv_code_start=inv_code_end=~0;
7955   // TLB
7956 #ifndef DISABLE_TLB
7957   using_tlb=0;
7958   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7959     memory_map[n]=-1;
7960   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7961     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7962   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7963     memory_map[n]=-1;
7964 #endif
7965   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7966   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7967   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7968 }
7969
7970 void new_dynarec_init()
7971 {
7972   SysPrintf("Init new dynarec\n");
7973   out=(u_char *)BASE_ADDR;
7974 #if BASE_ADDR_FIXED
7975   if (mmap (out, 1<<TARGET_SIZE_2,
7976             PROT_READ | PROT_WRITE | PROT_EXEC,
7977             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7978             -1, 0) == MAP_FAILED) {
7979     SysPrintf("mmap() failed: %s\n", strerror(errno));
7980   }
7981 #else
7982   // not all systems allow execute in data segment by default
7983   if (mprotect(out, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
7984     SysPrintf("mprotect() failed: %s\n", strerror(errno));
7985 #endif
7986 #ifdef MUPEN64
7987   rdword=&readmem_dword;
7988   fake_pc.f.r.rs=&readmem_dword;
7989   fake_pc.f.r.rt=&readmem_dword;
7990   fake_pc.f.r.rd=&readmem_dword;
7991 #endif
7992   int n;
7993   cycle_multiplier=200;
7994   new_dynarec_clear_full();
7995 #ifdef HOST_IMM8
7996   // Copy this into local area so we don't have to put it in every literal pool
7997   invc_ptr=invalid_code;
7998 #endif
7999 #ifdef MUPEN64
8000   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
8001     writemem[n] = write_nomem_new;
8002     writememb[n] = write_nomemb_new;
8003     writememh[n] = write_nomemh_new;
8004 #ifndef FORCE32
8005     writememd[n] = write_nomemd_new;
8006 #endif
8007     readmem[n] = read_nomem_new;
8008     readmemb[n] = read_nomemb_new;
8009     readmemh[n] = read_nomemh_new;
8010 #ifndef FORCE32
8011     readmemd[n] = read_nomemd_new;
8012 #endif
8013   }
8014   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
8015     writemem[n] = write_rdram_new;
8016     writememb[n] = write_rdramb_new;
8017     writememh[n] = write_rdramh_new;
8018 #ifndef FORCE32
8019     writememd[n] = write_rdramd_new;
8020 #endif
8021   }
8022   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
8023     writemem[n] = write_nomem_new;
8024     writememb[n] = write_nomemb_new;
8025     writememh[n] = write_nomemh_new;
8026 #ifndef FORCE32
8027     writememd[n] = write_nomemd_new;
8028 #endif
8029     readmem[n] = read_nomem_new;
8030     readmemb[n] = read_nomemb_new;
8031     readmemh[n] = read_nomemh_new;
8032 #ifndef FORCE32
8033     readmemd[n] = read_nomemd_new;
8034 #endif
8035   }
8036 #endif
8037   tlb_hacks();
8038   arch_init();
8039   new_dynarec_test();
8040 #ifndef RAM_FIXED
8041   ram_offset=(u_int)rdram-0x80000000;
8042 #endif
8043   if (ram_offset!=0)
8044     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
8045 }
8046
8047 void new_dynarec_cleanup()
8048 {
8049   int n;
8050   #if BASE_ADDR_FIXED
8051   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {SysPrintf("munmap() failed\n");}
8052   #endif
8053   for(n=0;n<4096;n++) ll_clear(jump_in+n);
8054   for(n=0;n<4096;n++) ll_clear(jump_out+n);
8055   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
8056   #ifdef ROM_COPY
8057   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
8058   #endif
8059 }
8060
8061 static u_int *get_source_start(u_int addr, u_int *limit)
8062 {
8063   if (addr < 0x00200000 ||
8064     (0xa0000000 <= addr && addr < 0xa0200000)) {
8065     // used for BIOS calls mostly?
8066     *limit = (addr&0xa0000000)|0x00200000;
8067     return (u_int *)((u_int)rdram + (addr&0x1fffff));
8068   }
8069   else if (!Config.HLE && (
8070     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
8071     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
8072     // BIOS
8073     *limit = (addr & 0xfff00000) | 0x80000;
8074     return (u_int *)((u_int)psxR + (addr&0x7ffff));
8075   }
8076   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
8077     *limit = (addr & 0x80600000) + 0x00200000;
8078     return (u_int *)((u_int)rdram + (addr&0x1fffff));
8079   }
  return NULL; // not a recompilable address; callers check for NULL
8080 }
8081
8082 static u_int scan_for_ret(u_int addr)
8083 {
8084   u_int limit = 0;
8085   u_int *mem;
8086
8087   mem = get_source_start(addr, &limit);
8088   if (mem == NULL)
8089     return addr;
8090
8091   if (limit > addr + 0x1000)
8092     limit = addr + 0x1000;
8093   for (; addr < limit; addr += 4, mem++) {
8094     if (*mem == 0x03e00008) // jr $ra
8095       return addr + 8;
8096   }
  return addr; // no "jr $ra" within the window - treat the scan limit as the end
8097 }
8098
8099 struct savestate_block {
8100   uint32_t addr;
8101   uint32_t regflags;
8102 };
8103
8104 static int addr_cmp(const void *p1_, const void *p2_)
8105 {
8106   const struct savestate_block *p1 = p1_, *p2 = p2_;
8107   return (p1->addr > p2->addr) - (p1->addr < p2->addr);
8108 }
8109
8110 int new_dynarec_save_blocks(void *save, int size)
8111 {
8112   struct savestate_block *blocks = save;
8113   int maxcount = size / sizeof(blocks[0]);
8114   struct savestate_block tmp_blocks[1024];
8115   struct ll_entry *head;
8116   int p, s, d, o, bcnt;
8117   u_int addr;
8118
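  // Output format (as far as the code below goes): a packed array of
  // struct savestate_block { addr, regflags }, sorted by addr within each page
  // and de-duplicated so that entry points falling inside an earlier block
  // (up to its "jr $ra") are dropped.  Returns the number of bytes written.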
8119   o = 0;
8120   for (p = 0; p < sizeof(jump_in) / sizeof(jump_in[0]); p++) {
8121     bcnt = 0;
8122     for (head = jump_in[p]; head != NULL; head = head->next) {
8123       tmp_blocks[bcnt].addr = head->vaddr;
8124       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
8125       bcnt++;
8126     }
8127     if (bcnt < 1)
8128       continue;
8129     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
8130
8131     addr = tmp_blocks[0].addr;
8132     for (s = d = 0; s < bcnt; s++) {
8133       if (tmp_blocks[s].addr < addr)
8134         continue;
8135       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
8136         tmp_blocks[d++] = tmp_blocks[s];
8137       addr = scan_for_ret(tmp_blocks[s].addr);
8138     }
8139
8140     if (o + d > maxcount)
8141       d = maxcount - o;
8142     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
8143     o += d;
8144   }
8145
8146   return o * sizeof(blocks[0]);
8147 }
8148
8149 void new_dynarec_load_blocks(const void *save, int size)
8150 {
8151   const struct savestate_block *blocks = save;
8152   int count = size / sizeof(blocks[0]);
8153   u_int regs_save[32];
8154   uint32_t f;
8155   int i, b;
8156
8157   get_addr(psxRegs.pc);
8158
8159   // change GPRs for speculation to at least partially work..
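  // Registers flagged in regflags were (most likely) scratchpad pointers when
  // the block was first compiled (see state_rflags in new_recompile_block), so
  // they are temporarily pointed at 0x1f800000 below; everything else gets a
  // plain RAM address so the speculative recompile sees a similar state.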
8160   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
8161   for (i = 1; i < 32; i++)
8162     psxRegs.GPR.r[i] = 0x80000000;
8163
8164   for (b = 0; b < count; b++) {
8165     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
8166       if (f & 1)
8167         psxRegs.GPR.r[i] = 0x1f800000;
8168     }
8169
8170     get_addr(blocks[b].addr);
8171
8172     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
8173       if (f & 1)
8174         psxRegs.GPR.r[i] = 0x80000000;
8175     }
8176   }
8177
8178   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
8179 }
8180
8181 int new_recompile_block(int addr)
8182 {
8183   u_int pagelimit = 0;
8184   u_int state_rflags = 0;
8185   int i;
8186
8187   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
8188   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
8189   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
8190   //if(debug) 
8191   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
8192   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
8193   /*if(Count>=312978186) {
8194     rlist();
8195   }*/
8196   //rlist();
8197
8198   // this is just for speculation
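  // (0x1f800000 is the scratchpad base; remembering which GPRs point there
  // lets new_dynarec_load_blocks() reproduce a similar register state when
  // precompiling blocks out of a savestate)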
8199   for (i = 1; i < 32; i++) {
8200     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
8201       state_rflags |= 1 << i;
8202   }
8203
8204   start = (u_int)addr&~3;
8205   //assert(((u_int)addr&1)==0);
8206   new_dynarec_did_compile=1;
8207   if (Config.HLE && start == 0x80001000) // hlecall
8208   {
8209     // XXX: is this enough? Maybe check hleSoftCall?
8210     u_int beginning=(u_int)out;
8211     u_int page=get_page(start);
8212     invalid_code[start>>12]=0;
8213     emit_movimm(start,0);
8214     emit_writeword(0,(int)&pcaddr);
8215     emit_jmp((int)new_dyna_leave);
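    // The stub emitted above only stores the block address to pcaddr and exits
    // through new_dyna_leave; presumably the C side then dispatches the HLE
    // BIOS call before execution resumes.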
8216     literal_pool(0);
8217 #ifdef __arm__
8218     __clear_cache((void *)beginning,out);
8219 #endif
8220     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
8221     return 0;
8222   }
8223
8224   source = get_source_start(start, &pagelimit);
8225   if (source == NULL) {
8226     SysPrintf("Compile at bogus memory address: %08x\n", addr);
8227     exit(1);
8228   }
8229
8230   /* Pass 1: disassemble */
8231   /* Pass 2: register dependencies, branch targets */
8232   /* Pass 3: register allocation */
8233   /* Pass 4: branch dependencies */
8234   /* Pass 5: pre-alloc */
8235   /* Pass 6: optimize clean/dirty state */
8236   /* Pass 7: flag 32-bit registers */
8237   /* Pass 8: assembly */
8238   /* Pass 9: linker */
8239   /* Pass 10: garbage collection / free memory */
8240
8241   int j;
8242   int done=0;
8243   unsigned int type,op,op2;
8244
8245   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
8246   
8247   /* Pass 1 disassembly */
8248
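  // Field layout of the MIPS words decoded below (standard MIPS I encodings):
  //   R-type: op[31:26] rs[25:21] rt[20:16] rd[15:11] sa[10:6] funct[5:0]
  //   I-type: op[31:26] rs[25:21] rt[20:16] imm[15:0]
  //   J-type: op[31:26] target[25:0]
  // e.g. 0x03e00008: op=0x00, rs=31, rd=0, funct=0x08  ->  "JR $ra"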
8249   for(i=0;!done;i++) {
8250     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
8251     minimum_free_regs[i]=0;
8252     opcode[i]=op=source[i]>>26;
8253     switch(op)
8254     {
8255       case 0x00: strcpy(insn[i],"special"); type=NI;
8256         op2=source[i]&0x3f;
8257         switch(op2)
8258         {
8259           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
8260           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
8261           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
8262           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
8263           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
8264           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
8265           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
8266           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
8267           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
8268           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
8269           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
8270           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
8271           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
8272           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
8273           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
8274           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
8275           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
8276           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
8277           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
8278           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
8279           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
8280           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
8281           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
8282           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
8283           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
8284           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
8285           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
8286           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
8287           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
8288           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
8289           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
8290           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
8291           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
8292           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
8293           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
8294 #ifndef FORCE32
8295           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
8296           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
8297           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
8298           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
8299           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
8300           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
8301           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
8302           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
8303           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
8304           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
8305           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
8306           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
8307           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
8308           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
8309           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
8310           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
8311           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
8312 #endif
8313         }
8314         break;
8315       case 0x01: strcpy(insn[i],"regimm"); type=NI;
8316         op2=(source[i]>>16)&0x1f;
8317         switch(op2)
8318         {
8319           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
8320           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
8321           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
8322           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
8323           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
8324           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
8325           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
8326           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
8327           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
8328           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
8329           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
8330           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
8331           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
8332           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
8333         }
8334         break;
8335       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
8336       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
8337       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
8338       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
8339       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
8340       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
8341       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
8342       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
8343       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
8344       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
8345       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
8346       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
8347       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
8348       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
8349       case 0x10: strcpy(insn[i],"cop0"); type=NI;
8350         op2=(source[i]>>21)&0x1f;
8351         switch(op2)
8352         {
8353           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
8354           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
8355           case 0x10: strcpy(insn[i],"tlb"); type=NI;
8356           switch(source[i]&0x3f)
8357           {
8358             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
8359             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
8360             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
8361             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
8362 #ifdef PCSX
8363             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
8364 #else
8365             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
8366 #endif
8367           }
8368         }
8369         break;
8370       case 0x11: strcpy(insn[i],"cop1"); type=NI;
8371         op2=(source[i]>>21)&0x1f;
8372         switch(op2)
8373         {
8374           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
8375           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
8376           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
8377           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
8378           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
8379           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
8380           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
8381           switch((source[i]>>16)&0x3)
8382           {
8383             case 0x00: strcpy(insn[i],"BC1F"); break;
8384             case 0x01: strcpy(insn[i],"BC1T"); break;
8385             case 0x02: strcpy(insn[i],"BC1FL"); break;
8386             case 0x03: strcpy(insn[i],"BC1TL"); break;
8387           }
8388           break;
8389           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
8390           switch(source[i]&0x3f)
8391           {
8392             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
8393             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
8394             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
8395             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
8396             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
8397             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
8398             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
8399             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
8400             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
8401             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
8402             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
8403             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
8404             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
8405             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
8406             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
8407             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
8408             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
8409             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
8410             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
8411             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
8412             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
8413             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
8414             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
8415             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
8416             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
8417             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
8418             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
8419             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
8420             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
8421             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
8422             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
8423             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
8424             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
8425             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
8426             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
8427           }
8428           break;
8429           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
8430           switch(source[i]&0x3f)
8431           {
8432             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
8433             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
8434             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
8435             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
8436             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
8437             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
8438             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
8439             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
8440             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
8441             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
8442             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
8443             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
8444             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
8445             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
8446             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
8447             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
8448             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
8449             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
8450             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
8451             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
8452             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
8453             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
8454             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
8455             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
8456             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
8457             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
8458             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
8459             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
8460             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
8461             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
8462             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
8463             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
8464             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
8465             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
8466             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
8467           }
8468           break;
8469           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
8470           switch(source[i]&0x3f)
8471           {
8472             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
8473             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
8474           }
8475           break;
8476           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
8477           switch(source[i]&0x3f)
8478           {
8479             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
8480             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
8481           }
8482           break;
8483         }
8484         break;
8485 #ifndef FORCE32
8486       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
8487       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
8488       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
8489       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
8490       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
8491       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
8492       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
8493       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
8494 #endif
8495       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
8496       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
8497       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
8498       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
8499       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
8500       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
8501       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
8502 #ifndef FORCE32
8503       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
8504 #endif
8505       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
8506       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
8507       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
8508       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
8509 #ifndef FORCE32
8510       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
8511       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
8512 #endif
8513       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
8514       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
8515       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
8516       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
8517 #ifndef FORCE32
8518       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
8519       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
8520       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
8521 #endif
8522       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
8523       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
8524 #ifndef FORCE32
8525       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
8526       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
8527       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
8528 #endif
8529 #ifdef PCSX
8530       case 0x12: strcpy(insn[i],"COP2"); type=NI;
8531         op2=(source[i]>>21)&0x1f;
8532         //if (op2 & 0x10) {
8533         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
8534           if (gte_handlers[source[i]&0x3f]!=NULL) {
8535             if (gte_regnames[source[i]&0x3f]!=NULL)
8536               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
8537             else
8538               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
8539             type=C2OP;
8540           }
8541         }
8542         else switch(op2)
8543         {
8544           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
8545           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
8546           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
8547           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
8548         }
8549         break;
8550       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
8551       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
8552       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
8553 #endif
8554       default: strcpy(insn[i],"???"); type=NI;
8555         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
8556         break;
8557     }
8558     itype[i]=type;
8559     opcode2[i]=op2;
8560     /* Get registers/immediates */
8561     lt1[i]=0;
8562     us1[i]=0;
8563     us2[i]=0;
8564     dep1[i]=0;
8565     dep2[i]=0;
8566     gte_rs[i]=gte_rt[i]=0;
8567     switch(type) {
8568       case LOAD:
8569         rs1[i]=(source[i]>>21)&0x1f;
8570         rs2[i]=0;
8571         rt1[i]=(source[i]>>16)&0x1f;
8572         rt2[i]=0;
8573         imm[i]=(short)source[i];
8574         break;
8575       case STORE:
8576       case STORELR:
8577         rs1[i]=(source[i]>>21)&0x1f;
8578         rs2[i]=(source[i]>>16)&0x1f;
8579         rt1[i]=0;
8580         rt2[i]=0;
8581         imm[i]=(short)source[i];
8582         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
8583         break;
8584       case LOADLR:
8585         // LWL/LWR only load part of the register,
8586         // therefore the target register must be treated as a source too
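        // Rough sketch of why: "lwl rt, off(rs)" replaces only some bytes of
        // rt with bytes from memory, conceptually
        //   rt = (loaded_bytes & mask) | (old_rt & ~mask);
        // so the old value of rt is an input and must stay allocated.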
8587         rs1[i]=(source[i]>>21)&0x1f;
8588         rs2[i]=(source[i]>>16)&0x1f;
8589         rt1[i]=(source[i]>>16)&0x1f;
8590         rt2[i]=0;
8591         imm[i]=(short)source[i];
8592         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
8593         if(op==0x26) dep1[i]=rt1[i]; // LWR
8594         break;
8595       case IMM16:
8596         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
8597         else rs1[i]=(source[i]>>21)&0x1f;
8598         rs2[i]=0;
8599         rt1[i]=(source[i]>>16)&0x1f;
8600         rt2[i]=0;
8601         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
8602           imm[i]=(unsigned short)source[i];
8603         }else{
8604           imm[i]=(short)source[i];
8605         }
8606         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
8607         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
8608         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
8609         break;
8610       case UJUMP:
8611         rs1[i]=0;
8612         rs2[i]=0;
8613         rt1[i]=0;
8614         rt2[i]=0;
8615         // The JAL instruction writes to r31.
8616         if (op&1) {
8617           rt1[i]=31;
8618         }
8619         rs2[i]=CCREG;
8620         break;
8621       case RJUMP:
8622         rs1[i]=(source[i]>>21)&0x1f;
8623         rs2[i]=0;
8624         rt1[i]=0;
8625         rt2[i]=0;
8626         // The JALR instruction writes to rd.
8627         if (op2&1) {
8628           rt1[i]=(source[i]>>11)&0x1f;
8629         }
8630         rs2[i]=CCREG;
8631         break;
8632       case CJUMP:
8633         rs1[i]=(source[i]>>21)&0x1f;
8634         rs2[i]=(source[i]>>16)&0x1f;
8635         rt1[i]=0;
8636         rt2[i]=0;
8637         if(op&2) { // BGTZ/BLEZ
8638           rs2[i]=0;
8639         }
8640         us1[i]=rs1[i];
8641         us2[i]=rs2[i];
8642         likely[i]=op>>4;
8643         break;
8644       case SJUMP:
8645         rs1[i]=(source[i]>>21)&0x1f;
8646         rs2[i]=CCREG;
8647         rt1[i]=0;
8648         rt2[i]=0;
8649         us1[i]=rs1[i];
8650         if(op2&0x10) { // BxxAL
8651           rt1[i]=31;
8652           // NOTE: If the branch is not taken, r31 is still overwritten
8653         }
8654         likely[i]=(op2&2)>>1;
8655         break;
8656       case FJUMP:
8657         rs1[i]=FSREG;
8658         rs2[i]=CSREG;
8659         rt1[i]=0;
8660         rt2[i]=0;
8661         likely[i]=((source[i])>>17)&1;
8662         break;
8663       case ALU:
8664         rs1[i]=(source[i]>>21)&0x1f; // source
8665         rs2[i]=(source[i]>>16)&0x1f; // second source operand (subtrahend for SUB/SUBU)
8666         rt1[i]=(source[i]>>11)&0x1f; // destination
8667         rt2[i]=0;
8668         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
8669           us1[i]=rs1[i];us2[i]=rs2[i];
8670         }
8671         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
8672           dep1[i]=rs1[i];dep2[i]=rs2[i];
8673         }
8674         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
8675           dep1[i]=rs1[i];dep2[i]=rs2[i];
8676         }
8677         break;
8678       case MULTDIV:
8679         rs1[i]=(source[i]>>21)&0x1f; // source
8680         rs2[i]=(source[i]>>16)&0x1f; // second operand (divisor for DIV/DIVU)
8681         rt1[i]=HIREG;
8682         rt2[i]=LOREG;
8683         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
8684           us1[i]=rs1[i];us2[i]=rs2[i];
8685         }
8686         break;
8687       case MOV:
8688         rs1[i]=0;
8689         rs2[i]=0;
8690         rt1[i]=0;
8691         rt2[i]=0;
8692         if(op2==0x10) rs1[i]=HIREG; // MFHI
8693         if(op2==0x11) rt1[i]=HIREG; // MTHI
8694         if(op2==0x12) rs1[i]=LOREG; // MFLO
8695         if(op2==0x13) rt1[i]=LOREG; // MTLO
8696         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
8697         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
8698         dep1[i]=rs1[i];
8699         break;
8700       case SHIFT:
8701         rs1[i]=(source[i]>>16)&0x1f; // target of shift
8702         rs2[i]=(source[i]>>21)&0x1f; // shift amount
8703         rt1[i]=(source[i]>>11)&0x1f; // destination
8704         rt2[i]=0;
8705         // DSLLV/DSRLV/DSRAV are 64-bit
8706         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
8707         break;
8708       case SHIFTIMM:
8709         rs1[i]=(source[i]>>16)&0x1f;
8710         rs2[i]=0;
8711         rt1[i]=(source[i]>>11)&0x1f;
8712         rt2[i]=0;
8713         imm[i]=(source[i]>>6)&0x1f;
8714         // DSxx32 instructions
8715         if(op2>=0x3c) imm[i]|=0x20;
8716         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
8717         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
8718         break;
8719       case COP0:
8720         rs1[i]=0;
8721         rs2[i]=0;
8722         rt1[i]=0;
8723         rt2[i]=0;
8724         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
8725         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
8726         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8727         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8728         break;
8729       case COP1:
8730         rs1[i]=0;
8731         rs2[i]=0;
8732         rt1[i]=0;
8733         rt2[i]=0;
8734         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8735         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8736         if(op2==5) us1[i]=rs1[i]; // DMTC1
8737         rs2[i]=CSREG;
8738         break;
8739       case COP2:
8740         rs1[i]=0;
8741         rs2[i]=0;
8742         rt1[i]=0;
8743         rt2[i]=0;
8744         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
8745         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
8746         rs2[i]=CSREG;
8747         int gr=(source[i]>>11)&0x1F;
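        // gte_rs/gte_rt are 64-bit masks over the GTE register file: bits 0..31
        // cover the data regs (MFC2/MTC2), bits 32..62 the control regs
        // (CFC2/CTC2), and bit 63 is used below as the flag register.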
8748         switch(op2)
8749         {
8750           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
8751           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
8752           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
8753           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
8754         }
8755         break;
8756       case C1LS:
8757         rs1[i]=(source[i]>>21)&0x1F;
8758         rs2[i]=CSREG;
8759         rt1[i]=0;
8760         rt2[i]=0;
8761         imm[i]=(short)source[i];
8762         break;
8763       case C2LS:
8764         rs1[i]=(source[i]>>21)&0x1F;
8765         rs2[i]=0;
8766         rt1[i]=0;
8767         rt2[i]=0;
8768         imm[i]=(short)source[i];
8769         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
8770         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
8771         break;
8772       case C2OP:
8773         rs1[i]=0;
8774         rs2[i]=0;
8775         rt1[i]=0;
8776         rt2[i]=0;
8777         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
8778         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
8779         gte_rt[i]|=1ll<<63; // every op changes flags
8780         if((source[i]&0x3f)==GTE_MVMVA) {
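          // MVMVA only reads the vector selected by bits 16:15 (my reading:
          // v = 0..2 picks VXYn/VZn = data regs 2v..2v+1, v = 3 picks IR1..IR3
          // = data regs 9..11), so the read mask 0xe3f (regs 0..5 and 9..11)
          // is narrowed to just that vector.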
8781           int v = (source[i] >> 15) & 3;
8782           gte_rs[i]&=~0xe3fll;
8783           if(v==3) gte_rs[i]|=0xe00ll;
8784           else gte_rs[i]|=3ll<<(v*2);
8785         }
8786         break;
8787       case FLOAT:
8788       case FCONV:
8789         rs1[i]=0;
8790         rs2[i]=CSREG;
8791         rt1[i]=0;
8792         rt2[i]=0;
8793         break;
8794       case FCOMP:
8795         rs1[i]=FSREG;
8796         rs2[i]=CSREG;
8797         rt1[i]=FSREG;
8798         rt2[i]=0;
8799         break;
8800       case SYSCALL:
8801       case HLECALL:
8802       case INTCALL:
8803         rs1[i]=CCREG;
8804         rs2[i]=0;
8805         rt1[i]=0;
8806         rt2[i]=0;
8807         break;
8808       default:
8809         rs1[i]=0;
8810         rs2[i]=0;
8811         rt1[i]=0;
8812         rt2[i]=0;
8813     }
8814     /* Calculate branch target addresses */
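    // J/JAL: target = top nibble of the delay-slot address | 26-bit index << 2.
    // Conditional branches: target = PC+4 + sign-extended imm16 * 4; e.g. a BEQ
    // at 0x80010000 with imm16 = 0x0003 resolves to 0x80010004 + 12 = 0x80010010.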
8815     if(type==UJUMP)
8816       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8817     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8818       ba[i]=start+i*4+8; // Ignore never taken branch
8819     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8820       ba[i]=start+i*4+8; // Ignore never taken branch
8821     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8822       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8823     else ba[i]=-1;
8824 #ifdef PCSX
8825     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
8826       int do_in_intrp=0;
8827       // branch in delay slot?
8828       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
8829         // don't handle first branch and call interpreter if it's hit
8830         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
8831         do_in_intrp=1;
8832       }
8833       // basic load delay detection
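      // e.g. a branch with "lw rt, ..." in its delay slot: on a real R3000 the
      // instruction at the branch target still sees the old rt for one cycle
      // (load delay).  If the target reads rt, that behaviour is hard to
      // reproduce here, so the block is handed to the interpreter instead.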
8834       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
8835         int t=(ba[i-1]-start)/4;
8836         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
8837           // jump target wants DS result - potential load delay effect
8838           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
8839           do_in_intrp=1;
8840           bt[t+1]=1; // expected return from interpreter
8841         }
8842         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
8843               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
8844           // v0 overwrite like this is a sign of trouble, bail out
8845           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
8846           do_in_intrp=1;
8847         }
8848       }
8849       if(do_in_intrp) {
8850         rs1[i-1]=CCREG;
8851         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
8852         ba[i-1]=-1;
8853         itype[i-1]=INTCALL;
8854         done=2;
8855         i--; // don't compile the DS
8856       }
8857     }
8858 #endif
8859     /* Is this the end of the block? */
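    // note: (source[i-1]>>16)==0x1000 matches "beq $zero,$zero,..." - the
    // unconditional branch idiom - which ends a block just like J/JR do.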
8860     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8861       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
8862         done=2;
8863       }
8864       else {
8865         if(stop_after_jal) done=1;
8866         // Stop on BREAK
8867         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8868       }
8869       // Don't recompile stuff that's already compiled
8870       if(check_addr(start+i*4+4)) done=1;
8871       // Don't get too close to the limit
8872       if(i>MAXBLOCK/2) done=1;
8873     }
8874     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
8875     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
8876     if(done==2) {
8877       // Does the block continue due to a branch?
8878       for(j=i-1;j>=0;j--)
8879       {
8880         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
8881         if(ba[j]==start+i*4+4) done=j=0;
8882         if(ba[j]==start+i*4+8) done=j=0;
8883       }
8884     }
8885     //assert(i<MAXBLOCK-1);
8886     if(start+i*4==pagelimit-4) done=1;
8887     assert(start+i*4<pagelimit);
8888     if (i==MAXBLOCK-1) done=1;
8889     // Stop if we're compiling junk
8890     if(itype[i]==NI&&opcode[i]==0x11) {
8891       done=stop_after_jal=1;
8892       SysPrintf("Disabled speculative precompilation\n");
8893     }
8894   }
8895   slen=i;
8896   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8897     if(start+i*4==pagelimit) {
8898       itype[i-1]=SPAN;
8899     }
8900   }
8901   assert(slen>0);
8902
8903   /* Pass 2 - Register dependencies and branch targets */
8904
8905   unneeded_registers(0,slen-1,0);
8906   
8907   /* Pass 3 - Register allocation */
8908
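  // Best-effort summary of the conventions used below:
  //   regmap[hr]  MIPS register cached in host register hr, -1 = free;
  //               entries >= 64 hold the upper 32 bits of register (r & 63)
  //   dirty       per-host-reg bit: cached value not yet written back
  //   is32        per-MIPS-reg bit: upper half is just the sign extension
  //   u / uu      per-MIPS-reg bit: (upper half of the) value is dead after
  //               this instruction, so it need not be written back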
8909   struct regstat current; // Current register allocations/status
8910   current.is32=1;
8911   current.dirty=0;
8912   current.u=unneeded_reg[0];
8913   current.uu=unneeded_reg_upper[0];
8914   clear_all_regs(current.regmap);
8915   alloc_reg(&current,0,CCREG);
8916   dirty_reg(&current,CCREG);
8917   current.isconst=0;
8918   current.wasconst=0;
8919   current.waswritten=0;
8920   int ds=0;
8921   int cc=0;
8922   int hr=-1;
8923
8924 #ifndef FORCE32
8925   provisional_32bit();
8926 #endif
8927   if((u_int)addr&1) {
8928     // First instruction is delay slot
8929     cc=-1;
8930     bt[1]=1;
8931     ds=1;
8932     unneeded_reg[0]=1;
8933     unneeded_reg_upper[0]=1;
8934     current.regmap[HOST_BTREG]=BTREG;
8935   }
8936   
8937   for(i=0;i<slen;i++)
8938   {
8939     if(bt[i])
8940     {
8941       int hr;
8942       for(hr=0;hr<HOST_REGS;hr++)
8943       {
8944         // Is this really necessary?
8945         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8946       }
8947       current.isconst=0;
8948       current.waswritten=0;
8949     }
8950     if(i>1)
8951     {
8952       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8953       {
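        // Falling through here means the BNE two instructions back was not
        // taken; if it compared against $zero the other operand must have been
        // zero, hence known 32-bit (only matters for the 64-bit build -
        // FORCE32 forces is32 below anyway).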
8954         if(rs1[i-2]==0||rs2[i-2]==0)
8955         {
8956           if(rs1[i-2]) {
8957             current.is32|=1LL<<rs1[i-2];
8958             int hr=get_reg(current.regmap,rs1[i-2]|64);
8959             if(hr>=0) current.regmap[hr]=-1;
8960           }
8961           if(rs2[i-2]) {
8962             current.is32|=1LL<<rs2[i-2];
8963             int hr=get_reg(current.regmap,rs2[i-2]|64);
8964             if(hr>=0) current.regmap[hr]=-1;
8965           }
8966         }
8967       }
8968     }
8969 #ifndef FORCE32
8970     // If something jumps here with 64-bit values
8971     // then promote those registers to 64 bits
8972     if(bt[i])
8973     {
8974       uint64_t temp_is32=current.is32;
8975       for(j=i-1;j>=0;j--)
8976       {
8977         if(ba[j]==start+i*4) 
8978           temp_is32&=branch_regs[j].is32;
8979       }
8980       for(j=i;j<slen;j++)
8981       {
8982         if(ba[j]==start+i*4) 
8983           //temp_is32=1;
8984           temp_is32&=p32[j];
8985       }
8986       if(temp_is32!=current.is32) {
8987         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8988         #ifndef DESTRUCTIVE_WRITEBACK
8989         if(ds)
8990         #endif
8991         for(hr=0;hr<HOST_REGS;hr++)
8992         {
8993           int r=current.regmap[hr];
8994           if(r>0&&r<64)
8995           {
8996             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8997               temp_is32|=1LL<<r;
8998               //printf("restore %d\n",r);
8999             }
9000           }
9001         }
9002         current.is32=temp_is32;
9003       }
9004     }
9005 #else
9006     current.is32=-1LL;
9007 #endif
9008
9009     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
9010     regs[i].wasconst=current.isconst;
9011     regs[i].was32=current.is32;
9012     regs[i].wasdirty=current.dirty;
9013     regs[i].loadedconst=0;
9014     #if defined(DESTRUCTIVE_WRITEBACK) && !defined(FORCE32)
9015     // To change a dirty register from 32 to 64 bits, we must write
9016     // it out during the previous cycle (for branches, 2 cycles)
9017     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
9018     {
9019       uint64_t temp_is32=current.is32;
9020       for(j=i-1;j>=0;j--)
9021       {
9022         if(ba[j]==start+i*4+4) 
9023           temp_is32&=branch_regs[j].is32;
9024       }
9025       for(j=i;j<slen;j++)
9026       {
9027         if(ba[j]==start+i*4+4) 
9028           //temp_is32=1;
9029           temp_is32&=p32[j];
9030       }
9031       if(temp_is32!=current.is32) {
9032         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
9033         for(hr=0;hr<HOST_REGS;hr++)
9034         {
9035           int r=current.regmap[hr];
9036           if(r>0)
9037           {
9038             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
9039               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
9040               {
9041                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
9042                 {
9043                   //printf("dump %d/r%d\n",hr,r);
9044                   current.regmap[hr]=-1;
9045                   if(get_reg(current.regmap,r|64)>=0) 
9046                     current.regmap[get_reg(current.regmap,r|64)]=-1;
9047                 }
9048               }
9049             }
9050           }
9051         }
9052       }
9053     }
9054     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
9055     {
9056       uint64_t temp_is32=current.is32;
9057       for(j=i-1;j>=0;j--)
9058       {
9059         if(ba[j]==start+i*4+8) 
9060           temp_is32&=branch_regs[j].is32;
9061       }
9062       for(j=i;j<slen;j++)
9063       {
9064         if(ba[j]==start+i*4+8) 
9065           //temp_is32=1;
9066           temp_is32&=p32[j];
9067       }
9068       if(temp_is32!=current.is32) {
9069         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
9070         for(hr=0;hr<HOST_REGS;hr++)
9071         {
9072           int r=current.regmap[hr];
9073           if(r>0)
9074           {
9075             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
9076               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
9077               {
9078                 //printf("dump %d/r%d\n",hr,r);
9079                 current.regmap[hr]=-1;
9080                 if(get_reg(current.regmap,r|64)>=0) 
9081                   current.regmap[get_reg(current.regmap,r|64)]=-1;
9082               }
9083             }
9084           }
9085         }
9086       }
9087     }
9088     #endif
9089     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9090       if(i+1<slen) {
9091         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9092         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9093         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9094         current.u|=1;
9095         current.uu|=1;
9096       } else {
9097         current.u=1;
9098         current.uu=1;
9099       }
9100     } else {
9101       if(i+1<slen) {
9102         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
9103         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9104         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9105         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
9106         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9107         current.u|=1;
9108         current.uu|=1;
9109       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
9110     }
9111     is_ds[i]=ds;
9112     if(ds) {
9113       ds=0; // Skip delay slot, already allocated as part of branch
9114       // ...but we need to alloc it in case something jumps here
9115       if(i+1<slen) {
9116         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
9117         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
9118       }else{
9119         current.u=branch_unneeded_reg[i-1];
9120         current.uu=branch_unneeded_reg_upper[i-1];
9121       }
9122       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
9123       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9124       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9125       current.u|=1;
9126       current.uu|=1;
9127       struct regstat temp;
9128       memcpy(&temp,&current,sizeof(current));
9129       temp.wasdirty=temp.dirty;
9130       temp.was32=temp.is32;
9131       // TODO: Take into account unconditional branches, as below
9132       delayslot_alloc(&temp,i);
9133       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
9134       regs[i].wasdirty=temp.wasdirty;
9135       regs[i].was32=temp.was32;
9136       regs[i].dirty=temp.dirty;
9137       regs[i].is32=temp.is32;
9138       regs[i].isconst=0;
9139       regs[i].wasconst=0;
9140       current.isconst=0;
9141       // Create entry (branch target) regmap
9142       for(hr=0;hr<HOST_REGS;hr++)
9143       {
9144         int r=temp.regmap[hr];
9145         if(r>=0) {
9146           if(r!=regmap_pre[i][hr]) {
9147             regs[i].regmap_entry[hr]=-1;
9148           }
9149           else
9150           {
9151             if(r<64){
9152               if((current.u>>r)&1) {
9153                 regs[i].regmap_entry[hr]=-1;
9154                 regs[i].regmap[hr]=-1;
9155                 //Don't clear regs in the delay slot as the branch might need them
9156                 //current.regmap[hr]=-1;
9157               }else
9158                 regs[i].regmap_entry[hr]=r;
9159             }
9160             else {
9161               if((current.uu>>(r&63))&1) {
9162                 regs[i].regmap_entry[hr]=-1;
9163                 regs[i].regmap[hr]=-1;
9164                 //Don't clear regs in the delay slot as the branch might need them
9165                 //current.regmap[hr]=-1;
9166               }else
9167                 regs[i].regmap_entry[hr]=r;
9168             }
9169           }
9170         } else {
9171           // First instruction expects CCREG to be allocated
9172           if(i==0&&hr==HOST_CCREG) 
9173             regs[i].regmap_entry[hr]=CCREG;
9174           else
9175             regs[i].regmap_entry[hr]=-1;
9176         }
9177       }
9178     }
9179     else { // Not delay slot
9180       switch(itype[i]) {
9181         case UJUMP:
9182           //current.isconst=0; // DEBUG
9183           //current.wasconst=0; // DEBUG
9184           //regs[i].wasconst=0; // DEBUG
9185           clear_const(&current,rt1[i]);
9186           alloc_cc(&current,i);
9187           dirty_reg(&current,CCREG);
9188           if (rt1[i]==31) {
9189             alloc_reg(&current,i,31);
9190             dirty_reg(&current,31);
9191             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
9192             //assert(rt1[i+1]!=rt1[i]);
9193             #ifdef REG_PREFETCH
9194             alloc_reg(&current,i,PTEMP);
9195             #endif
9196             //current.is32|=1LL<<rt1[i];
9197           }
9198           ooo[i]=1;
9199           delayslot_alloc(&current,i+1);
9200           //current.isconst=0; // DEBUG
9201           ds=1;
9202           //printf("i=%d, isconst=%x\n",i,current.isconst);
9203           break;
9204         case RJUMP:
9205           //current.isconst=0;
9206           //current.wasconst=0;
9207           //regs[i].wasconst=0;
9208           clear_const(&current,rs1[i]);
9209           clear_const(&current,rt1[i]);
9210           alloc_cc(&current,i);
9211           dirty_reg(&current,CCREG);
9212           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
9213             alloc_reg(&current,i,rs1[i]);
9214             if (rt1[i]!=0) {
9215               alloc_reg(&current,i,rt1[i]);
9216               dirty_reg(&current,rt1[i]);
9217               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
9218               assert(rt1[i+1]!=rt1[i]);
9219               #ifdef REG_PREFETCH
9220               alloc_reg(&current,i,PTEMP);
9221               #endif
9222             }
9223             #ifdef USE_MINI_HT
9224             if(rs1[i]==31) { // JALR
9225               alloc_reg(&current,i,RHASH);
9226               #ifndef HOST_IMM_ADDR32
9227               alloc_reg(&current,i,RHTBL);
9228               #endif
9229             }
9230             #endif
9231             delayslot_alloc(&current,i+1);
9232           } else {
9233             // The delay slot overwrites our source register,
9234             // allocate a temporary register to hold the old value.
9235             current.isconst=0;
9236             current.wasconst=0;
9237             regs[i].wasconst=0;
9238             delayslot_alloc(&current,i+1);
9239             current.isconst=0;
9240             alloc_reg(&current,i,RTEMP);
9241           }
9242           //current.isconst=0; // DEBUG
9243           ooo[i]=1;
9244           ds=1;
9245           break;
9246         case CJUMP:
9247           //current.isconst=0;
9248           //current.wasconst=0;
9249           //regs[i].wasconst=0;
9250           clear_const(&current,rs1[i]);
9251           clear_const(&current,rs2[i]);
9252           if((opcode[i]&0x3E)==4) // BEQ/BNE
9253           {
9254             alloc_cc(&current,i);
9255             dirty_reg(&current,CCREG);
9256             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9257             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
9258             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9259             {
9260               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9261               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
9262             }
9263             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
9264                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
9265               // The delay slot overwrites one of our conditions.
9266               // Allocate the branch condition registers instead.
9267               current.isconst=0;
9268               current.wasconst=0;
9269               regs[i].wasconst=0;
9270               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9271               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
9272               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9273               {
9274                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9275                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
9276               }
9277             }
9278             else
9279             {
9280               ooo[i]=1;
9281               delayslot_alloc(&current,i+1);
9282             }
9283           }
9284           else
9285           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
9286           {
9287             alloc_cc(&current,i);
9288             dirty_reg(&current,CCREG);
9289             alloc_reg(&current,i,rs1[i]);
9290             if(!(current.is32>>rs1[i]&1))
9291             {
9292               alloc_reg64(&current,i,rs1[i]);
9293             }
9294             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
9295               // The delay slot overwrites one of our conditions.
9296               // Allocate the branch condition registers instead.
9297               current.isconst=0;
9298               current.wasconst=0;
9299               regs[i].wasconst=0;
9300               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9301               if(!((current.is32>>rs1[i])&1))
9302               {
9303                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9304               }
9305             }
9306             else
9307             {
9308               ooo[i]=1;
9309               delayslot_alloc(&current,i+1);
9310             }
9311           }
9312           else
9313           // Don't alloc the delay slot yet because we might not execute it
9314           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
9315           {
9316             current.isconst=0;
9317             current.wasconst=0;
9318             regs[i].wasconst=0;
9319             alloc_cc(&current,i);
9320             dirty_reg(&current,CCREG);
9321             alloc_reg(&current,i,rs1[i]);
9322             alloc_reg(&current,i,rs2[i]);
9323             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9324             {
9325               alloc_reg64(&current,i,rs1[i]);
9326               alloc_reg64(&current,i,rs2[i]);
9327             }
9328           }
9329           else
9330           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
9331           {
9332             current.isconst=0;
9333             current.wasconst=0;
9334             regs[i].wasconst=0;
9335             alloc_cc(&current,i);
9336             dirty_reg(&current,CCREG);
9337             alloc_reg(&current,i,rs1[i]);
9338             if(!(current.is32>>rs1[i]&1))
9339             {
9340               alloc_reg64(&current,i,rs1[i]);
9341             }
9342           }
9343           ds=1;
9344           //current.isconst=0;
9345           break;
9346         case SJUMP:
9347           //current.isconst=0;
9348           //current.wasconst=0;
9349           //regs[i].wasconst=0;
9350           clear_const(&current,rs1[i]);
9351           clear_const(&current,rt1[i]);
9352           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
9353           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
9354           {
9355             alloc_cc(&current,i);
9356             dirty_reg(&current,CCREG);
9357             alloc_reg(&current,i,rs1[i]);
9358             if(!(current.is32>>rs1[i]&1))
9359             {
9360               alloc_reg64(&current,i,rs1[i]);
9361             }
9362             if (rt1[i]==31) { // BLTZAL/BGEZAL
9363               alloc_reg(&current,i,31);
9364               dirty_reg(&current,31);
9365               //#ifdef REG_PREFETCH
9366               //alloc_reg(&current,i,PTEMP);
9367               //#endif
9368               //current.is32|=1LL<<rt1[i];
9369             }
9370             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
9371                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
9372               // Allocate the branch condition registers instead.
9373               current.isconst=0;
9374               current.wasconst=0;
9375               regs[i].wasconst=0;
9376               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9377               if(!((current.is32>>rs1[i])&1))
9378               {
9379                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9380               }
9381             }
9382             else
9383             {
9384               ooo[i]=1;
9385               delayslot_alloc(&current,i+1);
9386             }
9387           }
9388           else
9389           // Don't alloc the delay slot yet because we might not execute it
9390           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
9391           {
9392             current.isconst=0;
9393             current.wasconst=0;
9394             regs[i].wasconst=0;
9395             alloc_cc(&current,i);
9396             dirty_reg(&current,CCREG);
9397             alloc_reg(&current,i,rs1[i]);
9398             if(!(current.is32>>rs1[i]&1))
9399             {
9400               alloc_reg64(&current,i,rs1[i]);
9401             }
9402           }
9403           ds=1;
9404           //current.isconst=0;
9405           break;
9406         case FJUMP:
9407           current.isconst=0;
9408           current.wasconst=0;
9409           regs[i].wasconst=0;
9410           if(likely[i]==0) // BC1F/BC1T
9411           {
9412             // TODO: Theoretically we can run out of registers here on x86.
9413             // The delay slot can allocate up to six, and we need to check
9414             // CSREG before executing the delay slot.  Possibly we can drop
9415             // the cycle count and then reload it after checking that the
9416             // FPU is in a usable state, or don't do out-of-order execution.
9417             alloc_cc(&current,i);
9418             dirty_reg(&current,CCREG);
9419             alloc_reg(&current,i,FSREG);
9420             alloc_reg(&current,i,CSREG);
9421             if(itype[i+1]==FCOMP) {
9422               // The delay slot overwrites the branch condition.
9423               // Allocate the branch condition registers instead.
9424               alloc_cc(&current,i);
9425               dirty_reg(&current,CCREG);
9426               alloc_reg(&current,i,CSREG);
9427               alloc_reg(&current,i,FSREG);
9428             }
9429             else {
9430               ooo[i]=1;
9431               delayslot_alloc(&current,i+1);
9432               alloc_reg(&current,i+1,CSREG);
9433             }
9434           }
9435           else
9436           // Don't alloc the delay slot yet because we might not execute it
9437           if(likely[i]) // BC1FL/BC1TL
9438           {
9439             alloc_cc(&current,i);
9440             dirty_reg(&current,CCREG);
9441             alloc_reg(&current,i,CSREG);
9442             alloc_reg(&current,i,FSREG);
9443           }
9444           ds=1;
9445           current.isconst=0;
9446           break;
9447         case IMM16:
9448           imm16_alloc(&current,i);
9449           break;
9450         case LOAD:
9451         case LOADLR:
9452           load_alloc(&current,i);
9453           break;
9454         case STORE:
9455         case STORELR:
9456           store_alloc(&current,i);
9457           break;
9458         case ALU:
9459           alu_alloc(&current,i);
9460           break;
9461         case SHIFT:
9462           shift_alloc(&current,i);
9463           break;
9464         case MULTDIV:
9465           multdiv_alloc(&current,i);
9466           break;
9467         case SHIFTIMM:
9468           shiftimm_alloc(&current,i);
9469           break;
9470         case MOV:
9471           mov_alloc(&current,i);
9472           break;
9473         case COP0:
9474           cop0_alloc(&current,i);
9475           break;
9476         case COP1:
9477         case COP2:
9478           cop1_alloc(&current,i);
9479           break;
9480         case C1LS:
9481           c1ls_alloc(&current,i);
9482           break;
9483         case C2LS:
9484           c2ls_alloc(&current,i);
9485           break;
9486         case C2OP:
9487           c2op_alloc(&current,i);
9488           break;
9489         case FCONV:
9490           fconv_alloc(&current,i);
9491           break;
9492         case FLOAT:
9493           float_alloc(&current,i);
9494           break;
9495         case FCOMP:
9496           fcomp_alloc(&current,i);
9497           break;
9498         case SYSCALL:
9499         case HLECALL:
9500         case INTCALL:
9501           syscall_alloc(&current,i);
9502           break;
9503         case SPAN:
9504           pagespan_alloc(&current,i);
9505           break;
9506       }
9507       
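           // u marks MIPS registers whose value is no longer needed past this
           // point, uu marks registers whose upper 32 bits are no longer
           // needed; bit 0 ($zero) is always unneeded.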
9508       // Drop the upper half of registers that have become 32-bit
9509       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
9510       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9511         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9512         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9513         current.uu|=1;
9514       } else {
9515         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
9516         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9517         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9518         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9519         current.uu|=1;
9520       }
9521
9522       // Create entry (branch target) regmap
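           // regmap_entry is the mapping a branch into this instruction must
           // provide.  A register that is (re)loaded here gets -1 (the branch
           // need not supply it); one that merely moved between host registers
           // keeps its entry, and unneeded registers are dropped from the map.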
9523       for(hr=0;hr<HOST_REGS;hr++)
9524       {
9525         int r,or,er;
9526         r=current.regmap[hr];
9527         if(r>=0) {
9528           if(r!=regmap_pre[i][hr]) {
9529             // TODO: delay slot (?)
9530             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
9531             if(or<0||(r&63)>=TEMPREG){
9532               regs[i].regmap_entry[hr]=-1;
9533             }
9534             else
9535             {
9536               // Just move it to a different register
9537               regs[i].regmap_entry[hr]=r;
9538               // If it was dirty before, it's still dirty
9539               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
9540             }
9541           }
9542           else
9543           {
9544             // Unneeded
9545             if(r==0){
9546               regs[i].regmap_entry[hr]=0;
9547             }
9548             else
9549             if(r<64){
9550               if((current.u>>r)&1) {
9551                 regs[i].regmap_entry[hr]=-1;
9552                 //regs[i].regmap[hr]=-1;
9553                 current.regmap[hr]=-1;
9554               }else
9555                 regs[i].regmap_entry[hr]=r;
9556             }
9557             else {
9558               if((current.uu>>(r&63))&1) {
9559                 regs[i].regmap_entry[hr]=-1;
9560                 //regs[i].regmap[hr]=-1;
9561                 current.regmap[hr]=-1;
9562               }else
9563                 regs[i].regmap_entry[hr]=r;
9564             }
9565           }
9566         } else {
9567           // Branches expect CCREG to be allocated at the target
9568           if(regmap_pre[i][hr]==CCREG) 
9569             regs[i].regmap_entry[hr]=CCREG;
9570           else
9571             regs[i].regmap_entry[hr]=-1;
9572         }
9573       }
9574       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
9575     }
9576
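         // Track base registers of recent stores with small offsets in
         // waswritten, so that later stores through the same base can skip
         // the invalid-code (self-modifying code) check; overwriting the base
         // register or storing with a large offset clears the bit again.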
9577     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
9578       current.waswritten|=1<<rs1[i-1];
9579     current.waswritten&=~(1<<rt1[i]);
9580     current.waswritten&=~(1<<rt2[i]);
9581     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
9582       current.waswritten&=~(1<<rs1[i]);
9583
9584     /* Branch post-alloc */
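         // branch_regs[i-1] is the register state on the taken path of the
         // branch at i-1, i.e. after its delay slot has been allocated; its
         // regmap_entry is what the branch target will be entered with.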
9585     if(i>0)
9586     {
9587       current.was32=current.is32;
9588       current.wasdirty=current.dirty;
9589       switch(itype[i-1]) {
9590         case UJUMP:
9591           memcpy(&branch_regs[i-1],&current,sizeof(current));
9592           branch_regs[i-1].isconst=0;
9593           branch_regs[i-1].wasconst=0;
9594           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9595           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9596           alloc_cc(&branch_regs[i-1],i-1);
9597           dirty_reg(&branch_regs[i-1],CCREG);
9598           if(rt1[i-1]==31) { // JAL
9599             alloc_reg(&branch_regs[i-1],i-1,31);
9600             dirty_reg(&branch_regs[i-1],31);
9601             branch_regs[i-1].is32|=1LL<<31;
9602           }
9603           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9604           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9605           break;
9606         case RJUMP:
9607           memcpy(&branch_regs[i-1],&current,sizeof(current));
9608           branch_regs[i-1].isconst=0;
9609           branch_regs[i-1].wasconst=0;
9610           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9611           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9612           alloc_cc(&branch_regs[i-1],i-1);
9613           dirty_reg(&branch_regs[i-1],CCREG);
9614           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
9615           if(rt1[i-1]!=0) { // JALR
9616             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
9617             dirty_reg(&branch_regs[i-1],rt1[i-1]);
9618             branch_regs[i-1].is32|=1LL<<rt1[i-1];
9619           }
9620           #ifdef USE_MINI_HT
9621           if(rs1[i-1]==31) { // JALR
9622             alloc_reg(&branch_regs[i-1],i-1,RHASH);
9623             #ifndef HOST_IMM_ADDR32
9624             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
9625             #endif
9626           }
9627           #endif
9628           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9629           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9630           break;
9631         case CJUMP:
9632           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
9633           {
9634             alloc_cc(&current,i-1);
9635             dirty_reg(&current,CCREG);
9636             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
9637                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
9638               // The delay slot overwrote one of our conditions
9639               // Delay slot goes after the test (in order)
9640               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9641               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9642               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9643               current.u|=1;
9644               current.uu|=1;
9645               delayslot_alloc(&current,i);
9646               current.isconst=0;
9647             }
9648             else
9649             {
9650               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9651               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9652               // Alloc the branch condition registers
9653               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
9654               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
9655               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
9656               {
9657                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
9658                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
9659               }
9660             }
9661             memcpy(&branch_regs[i-1],&current,sizeof(current));
9662             branch_regs[i-1].isconst=0;
9663             branch_regs[i-1].wasconst=0;
9664             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9665             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9666           }
9667           else
9668           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
9669           {
9670             alloc_cc(&current,i-1);
9671             dirty_reg(&current,CCREG);
9672             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9673               // The delay slot overwrote the branch condition
9674               // Delay slot goes after the test (in order)
9675               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9676               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9677               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9678               current.u|=1;
9679               current.uu|=1;
9680               delayslot_alloc(&current,i);
9681               current.isconst=0;
9682             }
9683             else
9684             {
9685               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9686               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9687               // Alloc the branch condition register
9688               alloc_reg(&current,i-1,rs1[i-1]);
9689               if(!(current.is32>>rs1[i-1]&1))
9690               {
9691                 alloc_reg64(&current,i-1,rs1[i-1]);
9692               }
9693             }
9694             memcpy(&branch_regs[i-1],&current,sizeof(current));
9695             branch_regs[i-1].isconst=0;
9696             branch_regs[i-1].wasconst=0;
9697             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9698             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9699           }
9700           else
9701           // Alloc the delay slot in case the branch is taken
9702           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
9703           {
9704             memcpy(&branch_regs[i-1],&current,sizeof(current));
9705             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9706             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9707             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9708             alloc_cc(&branch_regs[i-1],i);
9709             dirty_reg(&branch_regs[i-1],CCREG);
9710             delayslot_alloc(&branch_regs[i-1],i);
9711             branch_regs[i-1].isconst=0;
9712             alloc_reg(&current,i,CCREG); // Not taken path
9713             dirty_reg(&current,CCREG);
9714             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9715           }
9716           else
9717           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
9718           {
9719             memcpy(&branch_regs[i-1],&current,sizeof(current));
9720             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9721             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9722             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9723             alloc_cc(&branch_regs[i-1],i);
9724             dirty_reg(&branch_regs[i-1],CCREG);
9725             delayslot_alloc(&branch_regs[i-1],i);
9726             branch_regs[i-1].isconst=0;
9727             alloc_reg(&current,i,CCREG); // Not taken path
9728             dirty_reg(&current,CCREG);
9729             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9730           }
9731           break;
9732         case SJUMP:
9733           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
9734           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
9735           {
9736             alloc_cc(&current,i-1);
9737             dirty_reg(&current,CCREG);
9738             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9739               // The delay slot overwrote the branch condition
9740               // Delay slot goes after the test (in order)
9741               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9742               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9743               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9744               current.u|=1;
9745               current.uu|=1;
9746               delayslot_alloc(&current,i);
9747               current.isconst=0;
9748             }
9749             else
9750             {
9751               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9752               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9753               // Alloc the branch condition register
9754               alloc_reg(&current,i-1,rs1[i-1]);
9755               if(!(current.is32>>rs1[i-1]&1))
9756               {
9757                 alloc_reg64(&current,i-1,rs1[i-1]);
9758               }
9759             }
9760             memcpy(&branch_regs[i-1],&current,sizeof(current));
9761             branch_regs[i-1].isconst=0;
9762             branch_regs[i-1].wasconst=0;
9763             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9764             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9765           }
9766           else
9767           // Alloc the delay slot in case the branch is taken
9768           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
9769           {
9770             memcpy(&branch_regs[i-1],&current,sizeof(current));
9771             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9772             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9773             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9774             alloc_cc(&branch_regs[i-1],i);
9775             dirty_reg(&branch_regs[i-1],CCREG);
9776             delayslot_alloc(&branch_regs[i-1],i);
9777             branch_regs[i-1].isconst=0;
9778             alloc_reg(&current,i,CCREG); // Not taken path
9779             dirty_reg(&current,CCREG);
9780             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9781           }
9782           // FIXME: BLTZAL/BGEZAL
9783           if(opcode2[i-1]&0x10) { // BxxZAL
9784             alloc_reg(&branch_regs[i-1],i-1,31);
9785             dirty_reg(&branch_regs[i-1],31);
9786             branch_regs[i-1].is32|=1LL<<31;
9787           }
9788           break;
9789         case FJUMP:
9790           if(likely[i-1]==0) // BC1F/BC1T
9791           {
9792             alloc_cc(&current,i-1);
9793             dirty_reg(&current,CCREG);
9794             if(itype[i]==FCOMP) {
9795               // The delay slot overwrote the branch condition
9796               // Delay slot goes after the test (in order)
9797               delayslot_alloc(&current,i);
9798               current.isconst=0;
9799             }
9800             else
9801             {
9802               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9803               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9804               // Alloc the branch condition register
9805               alloc_reg(&current,i-1,FSREG);
9806             }
9807             memcpy(&branch_regs[i-1],&current,sizeof(current));
9808             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9809           }
9810           else // BC1FL/BC1TL
9811           {
9812             // Alloc the delay slot in case the branch is taken
9813             memcpy(&branch_regs[i-1],&current,sizeof(current));
9814             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9815             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9816             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9817             alloc_cc(&branch_regs[i-1],i);
9818             dirty_reg(&branch_regs[i-1],CCREG);
9819             delayslot_alloc(&branch_regs[i-1],i);
9820             branch_regs[i-1].isconst=0;
9821             alloc_reg(&current,i,CCREG); // Not taken path
9822             dirty_reg(&current,CCREG);
9823             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9824           }
9825           break;
9826       }
9827
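           // After an unconditional transfer (j/jr or "beq $0,$0"), nothing
           // falls through: a call returning here ($ra==31) starts with a
           // clean map, while an internal branch target keeps only the
           // registers that every branch jumping here agrees on.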
9828       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
9829       {
9830         if(rt1[i-1]==31) // JAL/JALR
9831         {
9832           // Subroutine call will return here, don't alloc any registers
9833           current.is32=1;
9834           current.dirty=0;
9835           clear_all_regs(current.regmap);
9836           alloc_reg(&current,i,CCREG);
9837           dirty_reg(&current,CCREG);
9838         }
9839         else if(i+1<slen)
9840         {
9841           // Internal branch will jump here, match registers to caller
9842           current.is32=0x3FFFFFFFFLL;
9843           current.dirty=0;
9844           clear_all_regs(current.regmap);
9845           alloc_reg(&current,i,CCREG);
9846           dirty_reg(&current,CCREG);
9847           for(j=i-1;j>=0;j--)
9848           {
9849             if(ba[j]==start+i*4+4) {
9850               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9851               current.is32=branch_regs[j].is32;
9852               current.dirty=branch_regs[j].dirty;
9853               break;
9854             }
9855           }
9856           while(j>=0) {
9857             if(ba[j]==start+i*4+4) {
9858               for(hr=0;hr<HOST_REGS;hr++) {
9859                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9860                   current.regmap[hr]=-1;
9861                 }
9862                 current.is32&=branch_regs[j].is32;
9863                 current.dirty&=branch_regs[j].dirty;
9864               }
9865             }
9866             j--;
9867           }
9868         }
9869       }
9870     }
9871
9872     // Count cycles in between branches
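         // ccadj[i] is the cycle count accumulated since the last branch; the
         // counter restarts after branches and syscalls.  PCSX builds also add
         // rough extra costs for long GTE operations and COP2 loads/stores.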
9873     ccadj[i]=cc;
9874     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
9875     {
9876       cc=0;
9877     }
9878 #if defined(PCSX) && !defined(DRC_DBG)
9879     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
9880     {
9881       // GTE runs in parallel until accessed, divide by 2 for a rough guess
9882       cc+=gte_cycletab[source[i]&0x3f]/2;
9883     }
9884     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // loads/stores cause weird timing issues
9885     {
9886       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
9887     }
9888     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
9889     {
9890       cc+=4;
9891     }
9892     else if(itype[i]==C2LS)
9893     {
9894       cc+=4;
9895     }
9896 #endif
9897     else
9898     {
9899       cc++;
9900     }
9901
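         // Snapshot the per-instruction state for everything except delay
         // slots, and drop const flags for host registers whose incoming
         // mapping changed.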
9902     flush_dirty_uppers(&current);
9903     if(!is_ds[i]) {
9904       regs[i].is32=current.is32;
9905       regs[i].dirty=current.dirty;
9906       regs[i].isconst=current.isconst;
9907       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
9908     }
9909     for(hr=0;hr<HOST_REGS;hr++) {
9910       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9911         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9912           regs[i].wasconst&=~(1<<hr);
9913         }
9914       }
9915     }
9916     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9917     regs[i].waswritten=current.waswritten;
9918   }
9919   
9920   /* Pass 4 - Cull unused host registers */
9921   
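       // nr is a bitmap of host registers whose contents are still needed at
       // instruction i, built walking backwards: branch targets, source
       // operands and special registers set bits, overwritten values clear
       // them.  Anything outside nr is freed further down.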
9922   uint64_t nr=0;
9923   
9924   for (i=slen-1;i>=0;i--)
9925   {
9926     int hr;
9927     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9928     {
9929       if(ba[i]<start || ba[i]>=(start+slen*4))
9930       {
9931         // Branch out of this block, don't need anything
9932         nr=0;
9933       }
9934       else
9935       {
9936         // Internal branch
9937         // Need whatever matches the target
9938         nr=0;
9939         int t=(ba[i]-start)>>2;
9940         for(hr=0;hr<HOST_REGS;hr++)
9941         {
9942           if(regs[i].regmap_entry[hr]>=0) {
9943             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9944           }
9945         }
9946       }
9947       // Conditional branch may need registers for following instructions
9948       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9949       {
9950         if(i<slen-2) {
9951           nr|=needed_reg[i+2];
9952           for(hr=0;hr<HOST_REGS;hr++)
9953           {
9954             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9955             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9956           }
9957         }
9958       }
9959       // Don't need stuff which is overwritten
9960       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9961       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9962       // Merge in delay slot
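           // The delay slot executes on both paths, so its sources (including
           // 64-bit upper halves and dependencies) keep their host registers;
           // its destinations only count as overwritten for non-likely
           // branches, since a nullified likely slot leaves them untouched.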
9963       for(hr=0;hr<HOST_REGS;hr++)
9964       {
9965         if(!likely[i]) {
9966           // These are overwritten unless the branch is "likely"
9967           // and the delay slot is nullified if not taken
9968           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9969           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9970         }
9971         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9972         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9973         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9974         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9975         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9976         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9977         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9978         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9979         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9980           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9981           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9982         }
9983         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9984           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9985           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9986         }
9987         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
9988           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9989           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9990         }
9991       }
9992     }
9993     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
9994     {
9995       // SYSCALL instruction (software interrupt)
9996       nr=0;
9997     }
9998     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9999     {
10000       // ERET instruction (return from interrupt)
10001       nr=0;
10002     }
10003     else // Non-branch
10004     {
10005       if(i<slen-1) {
10006         for(hr=0;hr<HOST_REGS;hr++) {
10007           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
10008           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
10009           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
10010           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
10011         }
10012       }
10013     }
10014     for(hr=0;hr<HOST_REGS;hr++)
10015     {
10016       // Overwritten registers are not needed
10017       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
10018       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
10019       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
10020       // Source registers are needed
10021       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10022       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10023       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
10024       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
10025       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10026       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10027       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
10028       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
10029       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
10030         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10031         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10032       }
10033       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
10034         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10035         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10036       }
10037       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
10038         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
10039         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
10040       }
10041       // Don't store a register immediately after writing it,
10042       // as that may prevent dual-issue.
10043       // But do store it if this is a branch target, otherwise we
10044       // might have to load the register before the branch.
10045       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
10046         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
10047            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
10048           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10049           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10050         }
10051         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
10052            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
10053           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10054           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10055         }
10056       }
10057     }
10058     // Cycle count is needed at branches.  Assume it is needed at the target too.
10059     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
10060       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
10061       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
10062     }
10063     // Save it
10064     needed_reg[i]=nr;
10065     
10066     // Deallocate unneeded registers
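       // Host registers outside nr can be freed, but not if they hold one of
       // this instruction's operands (or, across a branch, the delay slot's)
       // or a special temporary such as CCREG/PTEMP/RHASH; branches prune the
       // taken-path map in branch_regs the same way.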
10067     for(hr=0;hr<HOST_REGS;hr++)
10068     {
10069       if(!((nr>>hr)&1)) {
10070         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
10071         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
10072            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
10073            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
10074         {
10075           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10076           {
10077             if(likely[i]) {
10078               regs[i].regmap[hr]=-1;
10079               regs[i].isconst&=~(1<<hr);
10080               if(i<slen-2) {
10081                 regmap_pre[i+2][hr]=-1;
10082                 regs[i+2].wasconst&=~(1<<hr);
10083               }
10084             }
10085           }
10086         }
10087         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10088         {
10089           int d1=0,d2=0,map=0,temp=0;
10090           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
10091           {
10092             d1=dep1[i+1];
10093             d2=dep2[i+1];
10094           }
10095           if(using_tlb) {
10096             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
10097                itype[i+1]==STORE || itype[i+1]==STORELR ||
10098                itype[i+1]==C1LS || itype[i+1]==C2LS)
10099             map=TLREG;
10100           } else
10101           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
10102              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
10103             map=INVCP;
10104           }
10105           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
10106              itype[i+1]==C1LS || itype[i+1]==C2LS)
10107             temp=FTEMP;
10108           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
10109              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
10110              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
10111              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
10112              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
10113              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
10114              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
10115              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
10116              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
10117              regs[i].regmap[hr]!=map )
10118           {
10119             regs[i].regmap[hr]=-1;
10120             regs[i].isconst&=~(1<<hr);
10121             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
10122                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
10123                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
10124                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
10125                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
10126                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
10127                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
10128                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
10129                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
10130                branch_regs[i].regmap[hr]!=map)
10131             {
10132               branch_regs[i].regmap[hr]=-1;
10133               branch_regs[i].regmap_entry[hr]=-1;
10134               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10135               {
10136                 if(!likely[i]&&i<slen-2) {
10137                   regmap_pre[i+2][hr]=-1;
10138                   regs[i+2].wasconst&=~(1<<hr);
10139                 }
10140               }
10141             }
10142           }
10143         }
10144         else
10145         {
10146           // Non-branch
10147           if(i>0)
10148           {
10149             int d1=0,d2=0,map=-1,temp=-1;
10150             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
10151             {
10152               d1=dep1[i];
10153               d2=dep2[i];
10154             }
10155             if(using_tlb) {
10156               if(itype[i]==LOAD || itype[i]==LOADLR ||
10157                  itype[i]==STORE || itype[i]==STORELR ||
10158                  itype[i]==C1LS || itype[i]==C2LS)
10159               map=TLREG;
10160             } else if(itype[i]==STORE || itype[i]==STORELR ||
10161                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
10162               map=INVCP;
10163             }
10164             if(itype[i]==LOADLR || itype[i]==STORELR ||
10165                itype[i]==C1LS || itype[i]==C2LS)
10166               temp=FTEMP;
10167             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
10168                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
10169                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
10170                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
10171                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
10172                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
10173             {
10174               if(i<slen-1&&!is_ds[i]) {
10175                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
10176                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
10177                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
10178                 {
10179                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
10180                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
10181                 }
10182                 regmap_pre[i+1][hr]=-1;
10183                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
10184                 regs[i+1].wasconst&=~(1<<hr);
10185               }
10186               regs[i].regmap[hr]=-1;
10187               regs[i].isconst&=~(1<<hr);
10188             }
10189           }
10190         }
10191       }
10192     }
10193   }
10194   
10195   /* Pass 5 - Pre-allocate registers */
10196   
10197   // If a register is allocated during a loop, try to allocate it for the
10198   // entire loop, if possible.  This avoids loading/storing registers
10199   // inside of the loop.
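        // Mechanism: f_regmap proposes a host->MIPS assignment; for a backward
        // branch with a simple delay slot, the assignment is carried from the
        // branch target through the loop body whenever it stays conflict-free,
        // so the value remains register-resident around the loop.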
10200   
10201   signed char f_regmap[HOST_REGS];
10202   clear_all_regs(f_regmap);
10203   for(i=0;i<slen-1;i++)
10204   {
10205     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10206     {
10207       if(ba[i]>=start && ba[i]<(start+i*4)) 
10208       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
10209       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
10210       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
10211       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
10212       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
10213       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
10214       {
10215         int t=(ba[i]-start)>>2;
10216         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
10217         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
10218         for(hr=0;hr<HOST_REGS;hr++)
10219         {
10220           if(regs[i].regmap[hr]>64) {
10221             if(!((regs[i].dirty>>hr)&1))
10222               f_regmap[hr]=regs[i].regmap[hr];
10223             else f_regmap[hr]=-1;
10224           }
10225           else if(regs[i].regmap[hr]>=0) {
10226             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10227               // dealloc old register
10228               int n;
10229               for(n=0;n<HOST_REGS;n++)
10230               {
10231                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10232               }
10233               // and alloc new one
10234               f_regmap[hr]=regs[i].regmap[hr];
10235             }
10236           }
10237           if(branch_regs[i].regmap[hr]>64) {
10238             if(!((branch_regs[i].dirty>>hr)&1))
10239               f_regmap[hr]=branch_regs[i].regmap[hr];
10240             else f_regmap[hr]=-1;
10241           }
10242           else if(branch_regs[i].regmap[hr]>=0) {
10243             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
10244               // dealloc old register
10245               int n;
10246               for(n=0;n<HOST_REGS;n++)
10247               {
10248                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
10249               }
10250               // and alloc new one
10251               f_regmap[hr]=branch_regs[i].regmap[hr];
10252             }
10253           }
10254           if(ooo[i]) {
10255             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) 
10256               f_regmap[hr]=branch_regs[i].regmap[hr];
10257           }else{
10258             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) 
10259               f_regmap[hr]=branch_regs[i].regmap[hr];
10260           }
10261           // Avoid dirty->clean transition
10262           #ifdef DESTRUCTIVE_WRITEBACK
10263           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
10264           #endif
10265           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
10266           // case above, but it's always a good idea.  We can't hoist the
10267           // load if the register was already allocated, so there's no point
10268           // wasting time analyzing most of these cases.  It only "succeeds"
10269           // when the mapping was different and the load can be replaced with
10270           // a mov, which is of negligible benefit.  So such cases are
10271           // skipped below.
10272           if(f_regmap[hr]>0) {
10273             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
10274               int r=f_regmap[hr];
10275               for(j=t;j<=i;j++)
10276               {
10277                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
10278                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
10279                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
10280                 if(r>63) {
10281                   // NB This can exclude the case where the upper-half
10282                   // register is lower numbered than the lower-half
10283                   // register.  Not sure if it's worth fixing...
10284                   if(get_reg(regs[j].regmap,r&63)<0) break;
10285                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
10286                   if(regs[j].is32&(1LL<<(r&63))) break;
10287                 }
10288                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
10289                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
10290                   int k;
10291                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
10292                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
10293                     if(r>63) {
10294                       if(get_reg(regs[i].regmap,r&63)<0) break;
10295                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
10296                     }
10297                     k=i;
10298                     while(k>1&&regs[k-1].regmap[hr]==-1) {
10299                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10300                         //printf("no free regs for store %x\n",start+(k-1)*4);
10301                         break;
10302                       }
10303                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
10304                         //printf("no-match due to different register\n");
10305                         break;
10306                       }
10307                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
10308                         //printf("no-match due to branch\n");
10309                         break;
10310                       }
10311                       // call/ret fast path assumes no registers allocated
10312                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
10313                         break;
10314                       }
10315                       if(r>63) {
10316                         // NB This can exclude the case where the upper-half
10317                         // register is lower numbered than the lower-half
10318                         // register.  Not sure if it's worth fixing...
10319                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
10320                         if(regs[k-1].is32&(1LL<<(r&63))) break;
10321                       }
10322                       k--;
10323                     }
10324                     if(i<slen-1) {
10325                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
10326                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
10327                         //printf("bad match after branch\n");
10328                         break;
10329                       }
10330                     }
10331                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
10332                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
10333                       while(k<i) {
10334                         regs[k].regmap_entry[hr]=f_regmap[hr];
10335                         regs[k].regmap[hr]=f_regmap[hr];
10336                         regmap_pre[k+1][hr]=f_regmap[hr];
10337                         regs[k].wasdirty&=~(1<<hr);
10338                         regs[k].dirty&=~(1<<hr);
10339                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
10340                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
10341                         regs[k].wasconst&=~(1<<hr);
10342                         regs[k].isconst&=~(1<<hr);
10343                         k++;
10344                       }
10345                     }
10346                     else {
10347                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
10348                       break;
10349                     }
10350                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
10351                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
10352                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
10353                       regs[i].regmap_entry[hr]=f_regmap[hr];
10354                       regs[i].regmap[hr]=f_regmap[hr];
10355                       regs[i].wasdirty&=~(1<<hr);
10356                       regs[i].dirty&=~(1<<hr);
10357                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
10358                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
10359                       regs[i].wasconst&=~(1<<hr);
10360                       regs[i].isconst&=~(1<<hr);
10361                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
10362                       branch_regs[i].wasdirty&=~(1<<hr);
10363                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
10364                       branch_regs[i].regmap[hr]=f_regmap[hr];
10365                       branch_regs[i].dirty&=~(1<<hr);
10366                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
10367                       branch_regs[i].wasconst&=~(1<<hr);
10368                       branch_regs[i].isconst&=~(1<<hr);
10369                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
10370                         regmap_pre[i+2][hr]=f_regmap[hr];
10371                         regs[i+2].wasdirty&=~(1<<hr);
10372                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
10373                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
10374                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
10375                       }
10376                     }
10377                   }
10378                   for(k=t;k<j;k++) {
10379                     // Alloc register clean at beginning of loop,
10380                     // but may dirty it in pass 6
10381                     regs[k].regmap_entry[hr]=f_regmap[hr];
10382                     regs[k].regmap[hr]=f_regmap[hr];
10383                     regs[k].dirty&=~(1<<hr);
10384                     regs[k].wasconst&=~(1<<hr);
10385                     regs[k].isconst&=~(1<<hr);
10386                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
10387                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
10388                       branch_regs[k].regmap[hr]=f_regmap[hr];
10389                       branch_regs[k].dirty&=~(1<<hr);
10390                       branch_regs[k].wasconst&=~(1<<hr);
10391                       branch_regs[k].isconst&=~(1<<hr);
10392                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
10393                         regmap_pre[k+2][hr]=f_regmap[hr];
10394                         regs[k+2].wasdirty&=~(1<<hr);
10395                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
10396                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
10397                       }
10398                     }
10399                     else
10400                     {
10401                       regmap_pre[k+1][hr]=f_regmap[hr];
10402                       regs[k+1].wasdirty&=~(1<<hr);
10403                     }
10404                   }
10405                   if(regs[j].regmap[hr]==f_regmap[hr])
10406                     regs[j].regmap_entry[hr]=f_regmap[hr];
10407                   break;
10408                 }
10409                 if(j==i) break;
10410                 if(regs[j].regmap[hr]>=0)
10411                   break;
10412                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
10413                   //printf("no-match due to different register\n");
10414                   break;
10415                 }
10416                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
10417                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
10418                   break;
10419                 }
10420                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10421                 {
10422                   // Stop on unconditional branch
10423                   break;
10424                 }
10425                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
10426                 {
10427                   if(ooo[j]) {
10428                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) 
10429                       break;
10430                   }else{
10431                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) 
10432                       break;
10433                   }
10434                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
10435                     //printf("no-match due to different register (branch)\n");
10436                     break;
10437                   }
10438                 }
10439                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10440                   //printf("No free regs for store %x\n",start+j*4);
10441                   break;
10442                 }
10443                 if(f_regmap[hr]>=64) {
10444                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
10445                     break;
10446                   }
10447                   else
10448                   {
10449                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
10450                       break;
10451                     }
10452                   }
10453                 }
10454               }
10455             }
10456           }
10457         }
10458       }
10459     }else{
10460       // Non-branch or undetermined branch target
10461       for(hr=0;hr<HOST_REGS;hr++)
10462       {
10463         if(hr!=EXCLUDE_REG) {
10464           if(regs[i].regmap[hr]>64) {
10465             if(!((regs[i].dirty>>hr)&1))
10466               f_regmap[hr]=regs[i].regmap[hr];
10467           }
10468           else if(regs[i].regmap[hr]>=0) {
10469             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10470               // dealloc old register
10471               int n;
10472               for(n=0;n<HOST_REGS;n++)
10473               {
10474                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10475               }
10476               // and alloc new one
10477               f_regmap[hr]=regs[i].regmap[hr];
10478             }
10479           }
10480         }
10481       }
10482       // Try to restore cycle count at branch targets
10483       if(bt[i]) {
10484         for(j=i;j<slen-1;j++) {
10485           if(regs[j].regmap[HOST_CCREG]!=-1) break;
10486           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10487             //printf("no free regs for store %x\n",start+j*4);
10488             break;
10489           }
10490         }
10491         if(regs[j].regmap[HOST_CCREG]==CCREG) {
10492           int k=i;
10493           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
10494           while(k<j) {
10495             regs[k].regmap_entry[HOST_CCREG]=CCREG;
10496             regs[k].regmap[HOST_CCREG]=CCREG;
10497             regmap_pre[k+1][HOST_CCREG]=CCREG;
10498             regs[k+1].wasdirty|=1<<HOST_CCREG;
10499             regs[k].dirty|=1<<HOST_CCREG;
10500             regs[k].wasconst&=~(1<<HOST_CCREG);
10501             regs[k].isconst&=~(1<<HOST_CCREG);
10502             k++;
10503           }
10504           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
10505         }
10506         // Work backwards from the branch target
10507         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
10508         {
10509           //printf("Extend backwards\n");
10510           int k;
10511           k=i;
10512           while(regs[k-1].regmap[HOST_CCREG]==-1) {
10513             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10514               //printf("no free regs for store %x\n",start+(k-1)*4);
10515               break;
10516             }
10517             k--;
10518           }
10519           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
10520             //printf("Extend CC, %x ->\n",start+k*4);
10521             while(k<=i) {
10522               regs[k].regmap_entry[HOST_CCREG]=CCREG;
10523               regs[k].regmap[HOST_CCREG]=CCREG;
10524               regmap_pre[k+1][HOST_CCREG]=CCREG;
10525               regs[k+1].wasdirty|=1<<HOST_CCREG;
10526               regs[k].dirty|=1<<HOST_CCREG;
10527               regs[k].wasconst&=~(1<<HOST_CCREG);
10528               regs[k].isconst&=~(1<<HOST_CCREG);
10529               k++;
10530             }
10531           }
10532           else {
10533             //printf("Fail Extend CC, %x ->\n",start+k*4);
10534           }
10535         }
10536       }
10537       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
10538          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
10539          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
10540          itype[i]!=FCONV&&itype[i]!=FCOMP)
10541       {
10542         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
10543       }
10544     }
10545   }
10546   
10547   // Cache the memory offset or TLB map pointer if a register is available
10548   #ifndef HOST_IMM_ADDR32
10549   #ifndef RAM_OFFSET
10550   if(using_tlb)
10551   #endif
10552   {
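          // Score each host register over runs of loads/stores where it stays
          // free; the best-scoring one is preloaded with ROREG (RAM offset) or
          // MMREG (TLB map pointer) so address generation does not have to
          // reload it at every memory access.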
10553     int earliest_available[HOST_REGS];
10554     int loop_start[HOST_REGS];
10555     int score[HOST_REGS];
10556     int end[HOST_REGS];
10557     int reg=using_tlb?MMREG:ROREG;
10558
10559     // Init
10560     for(hr=0;hr<HOST_REGS;hr++) {
10561       score[hr]=0;earliest_available[hr]=0;
10562       loop_start[hr]=MAXBLOCK;
10563     }
10564     for(i=0;i<slen-1;i++)
10565     {
10566       // Can't do anything if no registers are available
10567       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
10568         for(hr=0;hr<HOST_REGS;hr++) {
10569           score[hr]=0;earliest_available[hr]=i+1;
10570           loop_start[hr]=MAXBLOCK;
10571         }
10572       }
10573       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10574         if(!ooo[i]) {
10575           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
10576             for(hr=0;hr<HOST_REGS;hr++) {
10577               score[hr]=0;earliest_available[hr]=i+1;
10578               loop_start[hr]=MAXBLOCK;
10579             }
10580           }
10581         }else{
10582           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
10583             for(hr=0;hr<HOST_REGS;hr++) {
10584               score[hr]=0;earliest_available[hr]=i+1;
10585               loop_start[hr]=MAXBLOCK;
10586             }
10587           }
10588         }
10589       }
10590       // Mark unavailable registers
10591       for(hr=0;hr<HOST_REGS;hr++) {
10592         if(regs[i].regmap[hr]>=0) {
10593           score[hr]=0;earliest_available[hr]=i+1;
10594           loop_start[hr]=MAXBLOCK;
10595         }
10596         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10597           if(branch_regs[i].regmap[hr]>=0) {
10598             score[hr]=0;earliest_available[hr]=i+2;
10599             loop_start[hr]=MAXBLOCK;
10600           }
10601         }
10602       }
10603       // No register allocations after unconditional jumps
10604       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10605       {
10606         for(hr=0;hr<HOST_REGS;hr++) {
10607           score[hr]=0;earliest_available[hr]=i+2;
10608           loop_start[hr]=MAXBLOCK;
10609         }
10610         i++; // Skip delay slot too
10611         //printf("skip delay slot: %x\n",start+i*4);
10612       }
10613       else
10614       // Possible match
10615       if(itype[i]==LOAD||itype[i]==LOADLR||
10616          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
10617         for(hr=0;hr<HOST_REGS;hr++) {
10618           if(hr!=EXCLUDE_REG) {
10619             end[hr]=i-1;
10620             for(j=i;j<slen-1;j++) {
10621               if(regs[j].regmap[hr]>=0) break;
10622               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10623                 if(branch_regs[j].regmap[hr]>=0) break;
10624                 if(ooo[j]) {
10625                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
10626                 }else{
10627                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
10628                 }
10629               }
10630               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
10631               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10632                 int t=(ba[j]-start)>>2;
10633                 if(t<j&&t>=earliest_available[hr]) {
10634                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
10635                     // Score a point for hoisting loop invariant
10636                     if(t<loop_start[hr]) loop_start[hr]=t;
10637                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
10638                     score[hr]++;
10639                     end[hr]=j;
10640                   }
10641                 }
10642                 else if(t<j) {
10643                   if(regs[t].regmap[hr]==reg) {
10644                     // Score a point if the branch target matches this register
10645                     score[hr]++;
10646                     end[hr]=j;
10647                   }
10648                 }
10649                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
10650                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
10651                   score[hr]++;
10652                   end[hr]=j;
10653                 }
10654               }
10655               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10656               {
10657                 // Stop on unconditional branch
10658                 break;
10659               }
10660               else
10661               if(itype[j]==LOAD||itype[j]==LOADLR||
10662                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
10663                 score[hr]++;
10664                 end[hr]=j;
10665               }
10666             }
10667           }
10668         }
10669         // Find highest score and allocate that register
10670         int maxscore=0;
10671         for(hr=0;hr<HOST_REGS;hr++) {
10672           if(hr!=EXCLUDE_REG) {
10673             if(score[hr]>score[maxscore]) {
10674               maxscore=hr;
10675               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
10676             }
10677           }
10678         }
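        // Commit the allocation only if it pays off more than once: map reg
        // into the winning host register over the loop_start..end span, clear
        // the dirty/const bits (the value is only cached there, never owned),
        // and patch the entry/pre maps around branches and delay slots so the
        // register is expected to be live on every path through the span.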
10679         if(score[maxscore]>1)
10680         {
10681           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
10682           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
10683             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
10684             assert(regs[j].regmap[maxscore]<0);
10685             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
10686             regs[j].regmap[maxscore]=reg;
10687             regs[j].dirty&=~(1<<maxscore);
10688             regs[j].wasconst&=~(1<<maxscore);
10689             regs[j].isconst&=~(1<<maxscore);
10690             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10691               branch_regs[j].regmap[maxscore]=reg;
10692               branch_regs[j].wasdirty&=~(1<<maxscore);
10693               branch_regs[j].dirty&=~(1<<maxscore);
10694               branch_regs[j].wasconst&=~(1<<maxscore);
10695               branch_regs[j].isconst&=~(1<<maxscore);
10696               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
10697                 regmap_pre[j+2][maxscore]=reg;
10698                 regs[j+2].wasdirty&=~(1<<maxscore);
10699               }
10700               // loop optimization (loop_preload)
10701               int t=(ba[j]-start)>>2;
10702               if(t==loop_start[maxscore]) {
10703                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
10704                   regs[t].regmap_entry[maxscore]=reg;
10705               }
10706             }
10707             else
10708             {
10709               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
10710                 regmap_pre[j+1][maxscore]=reg;
10711                 regs[j+1].wasdirty&=~(1<<maxscore);
10712               }
10713             }
10714           }
10715           i=j-1;
10716           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
10717           for(hr=0;hr<HOST_REGS;hr++) {
10718             score[hr]=0;earliest_available[hr]=i+1;
10719             loop_start[hr]=MAXBLOCK;
10720           }
10721         }
10722       }
10723     }
10724   }
10725   #endif
10726   
10727   // This allocates registers (if possible) one instruction prior
10728   // to use, which can avoid a load-use penalty on certain CPUs.
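  // If the next instruction's source already has a host register assigned
  // there and that register is unused here, copy the mapping back one slot
  // so the value is loaded an instruction early (together with its constant
  // status, so constant propagation is not lost).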
10729   for(i=0;i<slen-1;i++)
10730   {
10731     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
10732     {
10733       if(!bt[i+1])
10734       {
10735         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
10736            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
10737         {
10738           if(rs1[i+1]) {
10739             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
10740             {
10741               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10742               {
10743                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10744                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10745                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10746                 regs[i].isconst&=~(1<<hr);
10747                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10748                 constmap[i][hr]=constmap[i+1][hr];
10749                 regs[i+1].wasdirty&=~(1<<hr);
10750                 regs[i].dirty&=~(1<<hr);
10751               }
10752             }
10753           }
10754           if(rs2[i+1]) {
10755             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
10756             {
10757               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10758               {
10759                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10760                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10761                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10762                 regs[i].isconst&=~(1<<hr);
10763                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10764                 constmap[i][hr]=constmap[i+1][hr];
10765                 regs[i+1].wasdirty&=~(1<<hr);
10766                 regs[i].dirty&=~(1<<hr);
10767               }
10768             }
10769           }
10770           // Preload target address for load instruction (non-constant)
10771           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10772             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10773             {
10774               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10775               {
10776                 regs[i].regmap[hr]=rs1[i+1];
10777                 regmap_pre[i+1][hr]=rs1[i+1];
10778                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10779                 regs[i].isconst&=~(1<<hr);
10780                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10781                 constmap[i][hr]=constmap[i+1][hr];
10782                 regs[i+1].wasdirty&=~(1<<hr);
10783                 regs[i].dirty&=~(1<<hr);
10784               }
10785             }
10786           }
10787           // Load source into target register 
10788           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10789             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10790             {
10791               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10792               {
10793                 regs[i].regmap[hr]=rs1[i+1];
10794                 regmap_pre[i+1][hr]=rs1[i+1];
10795                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10796                 regs[i].isconst&=~(1<<hr);
10797                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10798                 constmap[i][hr]=constmap[i+1][hr];
10799                 regs[i+1].wasdirty&=~(1<<hr);
10800                 regs[i].dirty&=~(1<<hr);
10801               }
10802             }
10803           }
10804           // Preload map address
10805           #ifndef HOST_IMM_ADDR32
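          // When the next memory access uses a source register whose value
          // was constant, the translated address can be produced ahead of
          // time: TLREG appears to hold the memory-map lookup for it, and an
          // MGEN temporary (MGEN1 plus instruction parity) receives the early
          // result, moving TLREG aside to another register if necessary.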
10806           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
10807             hr=get_reg(regs[i+1].regmap,TLREG);
10808             if(hr>=0) {
10809               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
10810               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
10811                 int nr;
10812                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10813                 {
10814                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
10815                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
10816                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
10817                   regs[i].isconst&=~(1<<hr);
10818                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10819                   constmap[i][hr]=constmap[i+1][hr];
10820                   regs[i+1].wasdirty&=~(1<<hr);
10821                   regs[i].dirty&=~(1<<hr);
10822                 }
10823                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10824                 {
10825                   // move it to another register
10826                   regs[i+1].regmap[hr]=-1;
10827                   regmap_pre[i+2][hr]=-1;
10828                   regs[i+1].regmap[nr]=TLREG;
10829                   regmap_pre[i+2][nr]=TLREG;
10830                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
10831                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
10832                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
10833                   regs[i].isconst&=~(1<<nr);
10834                   regs[i+1].isconst&=~(1<<nr);
10835                   regs[i].dirty&=~(1<<nr);
10836                   regs[i+1].wasdirty&=~(1<<nr);
10837                   regs[i+1].dirty&=~(1<<nr);
10838                   regs[i+2].wasdirty&=~(1<<nr);
10839                 }
10840               }
10841             }
10842           }
10843           #endif
10844           // Address for store instruction (non-constant)
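          // If the store's base register has no host register yet, grab one
          // that is free in both slots, reserve it as an AGEN scratch
          // (AGEN1 plus instruction parity) for the store, and preload the
          // base value into it one instruction early.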
10845           if(itype[i+1]==STORE||itype[i+1]==STORELR
10846              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
10847             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10848               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
10849               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10850               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
10851               assert(hr>=0);
10852               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10853               {
10854                 regs[i].regmap[hr]=rs1[i+1];
10855                 regmap_pre[i+1][hr]=rs1[i+1];
10856                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10857                 regs[i].isconst&=~(1<<hr);
10858                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10859                 constmap[i][hr]=constmap[i+1][hr];
10860                 regs[i+1].wasdirty&=~(1<<hr);
10861                 regs[i].dirty&=~(1<<hr);
10862               }
10863             }
10864           }
10865           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
10866             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10867               int nr;
10868               hr=get_reg(regs[i+1].regmap,FTEMP);
10869               assert(hr>=0);
10870               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10871               {
10872                 regs[i].regmap[hr]=rs1[i+1];
10873                 regmap_pre[i+1][hr]=rs1[i+1];
10874                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10875                 regs[i].isconst&=~(1<<hr);
10876                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10877                 constmap[i][hr]=constmap[i+1][hr];
10878                 regs[i+1].wasdirty&=~(1<<hr);
10879                 regs[i].dirty&=~(1<<hr);
10880               }
10881               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10882               {
10883                 // move it to another register
10884                 regs[i+1].regmap[hr]=-1;
10885                 regmap_pre[i+2][hr]=-1;
10886                 regs[i+1].regmap[nr]=FTEMP;
10887                 regmap_pre[i+2][nr]=FTEMP;
10888                 regs[i].regmap[nr]=rs1[i+1];
10889                 regmap_pre[i+1][nr]=rs1[i+1];
10890                 regs[i+1].regmap_entry[nr]=rs1[i+1];
10891                 regs[i].isconst&=~(1<<nr);
10892                 regs[i+1].isconst&=~(1<<nr);
10893                 regs[i].dirty&=~(1<<nr);
10894                 regs[i+1].wasdirty&=~(1<<nr);
10895                 regs[i+1].dirty&=~(1<<nr);
10896                 regs[i+2].wasdirty&=~(1<<nr);
10897               }
10898             }
10899           }
10900           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
10901             if(itype[i+1]==LOAD) 
10902               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
10903             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
10904               hr=get_reg(regs[i+1].regmap,FTEMP);
10905             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
10906               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
10907               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10908             }
10909             if(hr>=0&&regs[i].regmap[hr]<0) {
10910               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
10911               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
10912                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
10913                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
10914                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
10915                 regs[i].isconst&=~(1<<hr);
10916                 regs[i+1].wasdirty&=~(1<<hr);
10917                 regs[i].dirty&=~(1<<hr);
10918               }
10919             }
10920           }
10921         }
10922       }
10923     }
10924   }
10925   
10926   /* Pass 6 - Optimize clean/dirty state */
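  // clean_registers() works out, per instruction, which cached registers
  // still match their in-memory copies and so can be treated as clean,
  // trimming the dirty bits so unchanged registers are not written back
  // repeatedly.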
10927   clean_registers(0,slen-1,1);
10928   
10929   /* Pass 7 - Identify 32-bit registers */
10930 #ifndef FORCE32
10931   provisional_r32();
10932
10933   u_int r32=0;
10934   
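  // Walk the block backwards accumulating requires_32bit[i]: the set of MIPS
  // registers whose 32-bit (sign-extended) status later code relies on.
  // Branches merge in the requirements of their targets minus whatever the
  // delay slot overwrites, while SYSCALL/HLECALL/INTCALL and ERET clear the
  // set since nothing is carried across them.  (Under FORCE32, i.e. PSX,
  // everything is 32-bit and only the branch-target marking below remains.)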
10935   for (i=slen-1;i>=0;i--)
10936   {
10937     int hr;
10938     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10939     {
10940       if(ba[i]<start || ba[i]>=(start+slen*4))
10941       {
10942         // Branch out of this block, don't need anything
10943         r32=0;
10944       }
10945       else
10946       {
10947         // Internal branch
10948         // Need whatever matches the target
10949         // (and doesn't get overwritten by the delay slot instruction)
10950         r32=0;
10951         int t=(ba[i]-start)>>2;
10952         if(ba[i]>start+i*4) {
10953           // Forward branch
10954           if(!(requires_32bit[t]&~regs[i].was32))
10955             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10956         }else{
10957           // Backward branch
10958           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
10959           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10960           if(!(pr32[t]&~regs[i].was32))
10961             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10962         }
10963       }
10964       // Conditional branch may need registers for following instructions
10965       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10966       {
10967         if(i<slen-2) {
10968           r32|=requires_32bit[i+2];
10969           r32&=regs[i].was32;
10970           // Mark this address as a branch target since it may be called
10971           // upon return from interrupt
10972           bt[i+2]=1;
10973         }
10974       }
10975       // Merge in delay slot
10976       if(!likely[i]) {
10977         // These are overwritten unless the branch is "likely"
10978         // and the delay slot is nullified if not taken
10979         r32&=~(1LL<<rt1[i+1]);
10980         r32&=~(1LL<<rt2[i+1]);
10981       }
10982       // Assume these are needed (delay slot)
10983       if(us1[i+1]>0)
10984       {
10985         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
10986       }
10987       if(us2[i+1]>0)
10988       {
10989         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
10990       }
10991       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
10992       {
10993         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
10994       }
10995       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
10996       {
10997         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
10998       }
10999     }
11000     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
11001     {
11002       // SYSCALL instruction (software interrupt)
11003       r32=0;
11004     }
11005     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
11006     {
11007       // ERET instruction (return from interrupt)
11008       r32=0;
11009     }
11010     // Check 32 bits
11011     r32&=~(1LL<<rt1[i]);
11012     r32&=~(1LL<<rt2[i]);
11013     if(us1[i]>0)
11014     {
11015       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
11016     }
11017     if(us2[i]>0)
11018     {
11019       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
11020     }
11021     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
11022     {
11023       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
11024     }
11025     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
11026     {
11027       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
11028     }
11029     requires_32bit[i]=r32;
11030     
11031     // Dirty registers which are 32-bit require 32-bit input,
11032     // as they will be written back as 32-bit values
11033     for(hr=0;hr<HOST_REGS;hr++)
11034     {
11035       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
11036         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
11037           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
11038           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
11039         }
11040       }
11041     }
11042     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
11043   }
11044 #else
11045   for (i=slen-1;i>=0;i--)
11046   {
11047     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
11048     {
11049       // Conditional branch
11050       if((source[i]>>16)!=0x1000&&i<slen-2) {
11051         // Mark this address as a branch target since it may be called
11052         // upon return from interrupt
11053         bt[i+2]=1;
11054       }
11055     }
11056   }
11057 #endif
11058
11059   if(itype[slen-1]==SPAN) {
11060     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
11061   }
11062
11063 #ifdef DISASM
11064   /* Debug/disassembly */
11065   for(i=0;i<slen;i++)
11066   {
11067     printf("U:");
11068     int r;
11069     for(r=1;r<=CCREG;r++) {
11070       if((unneeded_reg[i]>>r)&1) {
11071         if(r==HIREG) printf(" HI");
11072         else if(r==LOREG) printf(" LO");
11073         else printf(" r%d",r);
11074       }
11075     }
11076 #ifndef FORCE32
11077     printf(" UU:");
11078     for(r=1;r<=CCREG;r++) {
11079       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
11080         if(r==HIREG) printf(" HI");
11081         else if(r==LOREG) printf(" LO");
11082         else printf(" r%d",r);
11083       }
11084     }
11085     printf(" 32:");
11086     for(r=0;r<=CCREG;r++) {
11087       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
11088       if((regs[i].was32>>r)&1) {
11089         if(r==CCREG) printf(" CC");
11090         else if(r==HIREG) printf(" HI");
11091         else if(r==LOREG) printf(" LO");
11092         else printf(" r%d",r);
11093       }
11094     }
11095 #endif
11096     printf("\n");
11097     #if defined(__i386__) || defined(__x86_64__)
11098     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
11099     #endif
11100     #ifdef __arm__
11101     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
11102     #endif
11103     printf("needs: ");
11104     if(needed_reg[i]&1) printf("eax ");
11105     if((needed_reg[i]>>1)&1) printf("ecx ");
11106     if((needed_reg[i]>>2)&1) printf("edx ");
11107     if((needed_reg[i]>>3)&1) printf("ebx ");
11108     if((needed_reg[i]>>5)&1) printf("ebp ");
11109     if((needed_reg[i]>>6)&1) printf("esi ");
11110     if((needed_reg[i]>>7)&1) printf("edi ");
11111     printf("r:");
11112     for(r=0;r<=CCREG;r++) {
11113       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
11114       if((requires_32bit[i]>>r)&1) {
11115         if(r==CCREG) printf(" CC");
11116         else if(r==HIREG) printf(" HI");
11117         else if(r==LOREG) printf(" LO");
11118         else printf(" r%d",r);
11119       }
11120     }
11121     printf("\n");
11122     /*printf("pr:");
11123     for(r=0;r<=CCREG;r++) {
11124       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
11125       if((pr32[i]>>r)&1) {
11126         if(r==CCREG) printf(" CC");
11127         else if(r==HIREG) printf(" HI");
11128         else if(r==LOREG) printf(" LO");
11129         else printf(" r%d",r);
11130       }
11131     }
11132     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
11133     printf("\n");*/
11134     #if defined(__i386__) || defined(__x86_64__)
11135     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
11136     printf("dirty: ");
11137     if(regs[i].wasdirty&1) printf("eax ");
11138     if((regs[i].wasdirty>>1)&1) printf("ecx ");
11139     if((regs[i].wasdirty>>2)&1) printf("edx ");
11140     if((regs[i].wasdirty>>3)&1) printf("ebx ");
11141     if((regs[i].wasdirty>>5)&1) printf("ebp ");
11142     if((regs[i].wasdirty>>6)&1) printf("esi ");
11143     if((regs[i].wasdirty>>7)&1) printf("edi ");
11144     #endif
11145     #ifdef __arm__
11146     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
11147     printf("dirty: ");
11148     if(regs[i].wasdirty&1) printf("r0 ");
11149     if((regs[i].wasdirty>>1)&1) printf("r1 ");
11150     if((regs[i].wasdirty>>2)&1) printf("r2 ");
11151     if((regs[i].wasdirty>>3)&1) printf("r3 ");
11152     if((regs[i].wasdirty>>4)&1) printf("r4 ");
11153     if((regs[i].wasdirty>>5)&1) printf("r5 ");
11154     if((regs[i].wasdirty>>6)&1) printf("r6 ");
11155     if((regs[i].wasdirty>>7)&1) printf("r7 ");
11156     if((regs[i].wasdirty>>8)&1) printf("r8 ");
11157     if((regs[i].wasdirty>>9)&1) printf("r9 ");
11158     if((regs[i].wasdirty>>10)&1) printf("r10 ");
11159     if((regs[i].wasdirty>>12)&1) printf("r12 ");
11160     #endif
11161     printf("\n");
11162     disassemble_inst(i);
11163     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
11164     #if defined(__i386__) || defined(__x86_64__)
11165     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
11166     if(regs[i].dirty&1) printf("eax ");
11167     if((regs[i].dirty>>1)&1) printf("ecx ");
11168     if((regs[i].dirty>>2)&1) printf("edx ");
11169     if((regs[i].dirty>>3)&1) printf("ebx ");
11170     if((regs[i].dirty>>5)&1) printf("ebp ");
11171     if((regs[i].dirty>>6)&1) printf("esi ");
11172     if((regs[i].dirty>>7)&1) printf("edi ");
11173     #endif
11174     #ifdef __arm__
11175     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
11176     if(regs[i].dirty&1) printf("r0 ");
11177     if((regs[i].dirty>>1)&1) printf("r1 ");
11178     if((regs[i].dirty>>2)&1) printf("r2 ");
11179     if((regs[i].dirty>>3)&1) printf("r3 ");
11180     if((regs[i].dirty>>4)&1) printf("r4 ");
11181     if((regs[i].dirty>>5)&1) printf("r5 ");
11182     if((regs[i].dirty>>6)&1) printf("r6 ");
11183     if((regs[i].dirty>>7)&1) printf("r7 ");
11184     if((regs[i].dirty>>8)&1) printf("r8 ");
11185     if((regs[i].dirty>>9)&1) printf("r9 ");
11186     if((regs[i].dirty>>10)&1) printf("r10 ");
11187     if((regs[i].dirty>>12)&1) printf("r12 ");
11188     #endif
11189     printf("\n");
11190     if(regs[i].isconst) {
11191       printf("constants: ");
11192       #if defined(__i386__) || defined(__x86_64__)
11193       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
11194       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
11195       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
11196       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
11197       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
11198       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
11199       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
11200       #endif
11201       #ifdef __arm__
11202       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
11203       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
11204       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
11205       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
11206       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
11207       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
11208       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
11209       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
11210       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
11211       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
11212       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
11213       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
11214       #endif
11215       printf("\n");
11216     }
11217 #ifndef FORCE32
11218     printf(" 32:");
11219     for(r=0;r<=CCREG;r++) {
11220       if((regs[i].is32>>r)&1) {
11221         if(r==CCREG) printf(" CC");
11222         else if(r==HIREG) printf(" HI");
11223         else if(r==LOREG) printf(" LO");
11224         else printf(" r%d",r);
11225       }
11226     }
11227     printf("\n");
11228 #endif
11229     /*printf(" p32:");
11230     for(r=0;r<=CCREG;r++) {
11231       if((p32[i]>>r)&1) {
11232         if(r==CCREG) printf(" CC");
11233         else if(r==HIREG) printf(" HI");
11234         else if(r==LOREG) printf(" LO");
11235         else printf(" r%d",r);
11236       }
11237     }
11238     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
11239     else printf("\n");*/
11240     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
11241       #if defined(__i386__) || defined(__x86_64__)
11242       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
11243       if(branch_regs[i].dirty&1) printf("eax ");
11244       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
11245       if((branch_regs[i].dirty>>2)&1) printf("edx ");
11246       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
11247       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
11248       if((branch_regs[i].dirty>>6)&1) printf("esi ");
11249       if((branch_regs[i].dirty>>7)&1) printf("edi ");
11250       #endif
11251       #ifdef __arm__
11252       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
11253       if(branch_regs[i].dirty&1) printf("r0 ");
11254       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
11255       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
11256       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
11257       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
11258       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
11259       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
11260       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
11261       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
11262       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
11263       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
11264       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
11265       #endif
11266 #ifndef FORCE32
11267       printf(" 32:");
11268       for(r=0;r<=CCREG;r++) {
11269         if((branch_regs[i].is32>>r)&1) {
11270           if(r==CCREG) printf(" CC");
11271           else if(r==HIREG) printf(" HI");
11272           else if(r==LOREG) printf(" LO");
11273           else printf(" r%d",r);
11274         }
11275       }
11276       printf("\n");
11277 #endif
11278     }
11279   }
11280 #endif // DISASM
11281
11282   /* Pass 8 - Assembly */
11283   linkcount=0;stubcount=0;
11284   ds=0;is_delayslot=0;
11285   cop1_usable=0;
11286   uint64_t is32_pre=0;
11287   u_int dirty_pre=0;
11288   u_int beginning=(u_int)out;
11289   if((u_int)addr&1) {
11290     ds=1;
11291     pagespan_ds();
11292   }
11293   u_int instr_addr0_override=0;
11294
11295 #ifdef PCSX
11296   if (start == 0x80030000) {
11297     // nasty hack for fastbios thing
11298     // override block entry to this code
11299     instr_addr0_override=(u_int)out;
11300     emit_movimm(start,0);
11301     // abuse io address var as a flag that we
11302     // have already returned here once
11303     emit_readword((int)&address,1);
11304     emit_writeword(0,(int)&pcaddr);
11305     emit_writeword(0,(int)&address);
11306     emit_cmp(0,1);
11307     emit_jne((int)new_dyna_leave);
11308   }
11309 #endif
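  // Main assembly loop: for each non-delay-slot instruction, write back or
  // invalidate whatever the previous mapping left dirty, record the native
  // entry point in instr_addr[], load the entry registers, generate addresses
  // and constants, then dispatch on itype to the per-type assembler.  Branch
  // types set ds so the following delay slot is emitted by the branch
  // assembler rather than on its own; literal pools are flushed at
  // unconditional branches.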
11310   for(i=0;i<slen;i++)
11311   {
11312     //if(ds) printf("ds: ");
11313     disassemble_inst(i);
11314     if(ds) {
11315       ds=0; // Skip delay slot
11316       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
11317       instr_addr[i]=0;
11318     } else {
11319       speculate_register_values(i);
11320       #ifndef DESTRUCTIVE_WRITEBACK
11321       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11322       {
11323         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
11324               unneeded_reg[i],unneeded_reg_upper[i]);
11325         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
11326               unneeded_reg[i],unneeded_reg_upper[i]);
11327       }
11328       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
11329         is32_pre=branch_regs[i].is32;
11330         dirty_pre=branch_regs[i].dirty;
11331       }else{
11332         is32_pre=regs[i].is32;
11333         dirty_pre=regs[i].dirty;
11334       }
11335       #endif
11336       // write back
11337       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11338       {
11339         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
11340                       unneeded_reg[i],unneeded_reg_upper[i]);
11341         loop_preload(regmap_pre[i],regs[i].regmap_entry);
11342       }
11343       // branch target entry point
11344       instr_addr[i]=(u_int)out;
11345       assem_debug("<->\n");
11346       // load regs
11347       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
11348         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
11349       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
11350       address_generation(i,&regs[i],regs[i].regmap_entry);
11351       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
11352       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
11353       {
11354         // Load the delay slot registers if necessary
11355         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
11356           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11357         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
11358           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11359         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
11360           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11361       }
11362       else if(i+1<slen)
11363       {
11364         // Preload registers for following instruction
11365         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
11366           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
11367             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11368         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
11369           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
11370             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11371       }
11372       // TODO: if(is_ooo(i)) address_generation(i+1);
11373       if(itype[i]==CJUMP||itype[i]==FJUMP)
11374         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
11375       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
11376         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11377       if(bt[i]) cop1_usable=0;
11378       // assemble
11379       switch(itype[i]) {
11380         case ALU:
11381           alu_assemble(i,&regs[i]);break;
11382         case IMM16:
11383           imm16_assemble(i,&regs[i]);break;
11384         case SHIFT:
11385           shift_assemble(i,&regs[i]);break;
11386         case SHIFTIMM:
11387           shiftimm_assemble(i,&regs[i]);break;
11388         case LOAD:
11389           load_assemble(i,&regs[i]);break;
11390         case LOADLR:
11391           loadlr_assemble(i,&regs[i]);break;
11392         case STORE:
11393           store_assemble(i,&regs[i]);break;
11394         case STORELR:
11395           storelr_assemble(i,&regs[i]);break;
11396         case COP0:
11397           cop0_assemble(i,&regs[i]);break;
11398         case COP1:
11399           cop1_assemble(i,&regs[i]);break;
11400         case C1LS:
11401           c1ls_assemble(i,&regs[i]);break;
11402         case COP2:
11403           cop2_assemble(i,&regs[i]);break;
11404         case C2LS:
11405           c2ls_assemble(i,&regs[i]);break;
11406         case C2OP:
11407           c2op_assemble(i,&regs[i]);break;
11408         case FCONV:
11409           fconv_assemble(i,&regs[i]);break;
11410         case FLOAT:
11411           float_assemble(i,&regs[i]);break;
11412         case FCOMP:
11413           fcomp_assemble(i,&regs[i]);break;
11414         case MULTDIV:
11415           multdiv_assemble(i,&regs[i]);break;
11416         case MOV:
11417           mov_assemble(i,&regs[i]);break;
11418         case SYSCALL:
11419           syscall_assemble(i,&regs[i]);break;
11420         case HLECALL:
11421           hlecall_assemble(i,&regs[i]);break;
11422         case INTCALL:
11423           intcall_assemble(i,&regs[i]);break;
11424         case UJUMP:
11425           ujump_assemble(i,&regs[i]);ds=1;break;
11426         case RJUMP:
11427           rjump_assemble(i,&regs[i]);ds=1;break;
11428         case CJUMP:
11429           cjump_assemble(i,&regs[i]);ds=1;break;
11430         case SJUMP:
11431           sjump_assemble(i,&regs[i]);ds=1;break;
11432         case FJUMP:
11433           fjump_assemble(i,&regs[i]);ds=1;break;
11434         case SPAN:
11435           pagespan_assemble(i,&regs[i]);break;
11436       }
11437       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
11438         literal_pool(1024);
11439       else
11440         literal_pool_jumpover(256);
11441     }
11442   }
11443   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
11444   // If the block did not end with an unconditional branch,
11445   // add a jump to the next instruction.
11446   if(i>1) {
11447     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
11448       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11449       assert(i==slen);
11450       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
11451         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11452         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11453           emit_loadreg(CCREG,HOST_CCREG);
11454         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
11455       }
11456       else if(!likely[i-2])
11457       {
11458         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
11459         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
11460       }
11461       else
11462       {
11463         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
11464         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
11465       }
11466       add_to_linker((int)out,start+i*4,0);
11467       emit_jmp(0);
11468     }
11469   }
11470   else
11471   {
11472     assert(i>0);
11473     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11474     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11475     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11476       emit_loadreg(CCREG,HOST_CCREG);
11477     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
11478     add_to_linker((int)out,start+i*4,0);
11479     emit_jmp(0);
11480   }
11481
11482   // TODO: delay slot stubs?
11483   // Stubs
11484   for(i=0;i<stubcount;i++)
11485   {
11486     switch(stubs[i][0])
11487     {
11488       case LOADB_STUB:
11489       case LOADH_STUB:
11490       case LOADW_STUB:
11491       case LOADD_STUB:
11492       case LOADBU_STUB:
11493       case LOADHU_STUB:
11494         do_readstub(i);break;
11495       case STOREB_STUB:
11496       case STOREH_STUB:
11497       case STOREW_STUB:
11498       case STORED_STUB:
11499         do_writestub(i);break;
11500       case CC_STUB:
11501         do_ccstub(i);break;
11502       case INVCODE_STUB:
11503         do_invstub(i);break;
11504       case FP_STUB:
11505         do_cop1stub(i);break;
11506       case STORELR_STUB:
11507         do_unalignedwritestub(i);break;
11508     }
11509   }
11510
11511   if (instr_addr0_override)
11512     instr_addr[0] = instr_addr0_override;
11513
11514   /* Pass 9 - Linker */
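  // Resolve the branches recorded by add_to_linker() during assembly.
  // External targets are patched straight to an existing block when
  // check_addr() finds one (keeping the extjump stub around for later
  // relinking), otherwise they jump to the freshly emitted extjump stub;
  // internal targets simply get the native address of the target instruction.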
11515   for(i=0;i<linkcount;i++)
11516   {
11517     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
11518     literal_pool(64);
11519     if(!link_addr[i][2])
11520     {
11521       void *stub=out;
11522       void *addr=check_addr(link_addr[i][1]);
11523       emit_extjump(link_addr[i][0],link_addr[i][1]);
11524       if(addr) {
11525         set_jump_target(link_addr[i][0],(int)addr);
11526         add_link(link_addr[i][1],stub);
11527       }
11528       else set_jump_target(link_addr[i][0],(int)stub);
11529     }
11530     else
11531     {
11532       // Internal branch
11533       int target=(link_addr[i][1]-start)>>2;
11534       assert(target>=0&&target<slen);
11535       assert(instr_addr[target]);
11536       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11537       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
11538       //#else
11539       set_jump_target(link_addr[i][0],instr_addr[target]);
11540       //#endif
11541     }
11542   }
11543   // External Branch Targets (jump_in)
11544   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
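  // Register an entry point for the block start and for every branch target:
  // jump_dirty points at a dirty-check stub that revalidates the block
  // against the shadow copy of the source, while jump_in points at the
  // verified entry returned by do_dirty_stub().  Hash table bins hold two
  // (vaddr, entry point) pairs; existing entries are updated in place, new
  // ones are only added later by check_addr() when actually used.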
11545   for(i=0;i<slen;i++)
11546   {
11547     if(bt[i]||i==0)
11548     {
11549       if(instr_addr[i]) // TODO - delay slots (=null)
11550       {
11551         u_int vaddr=start+i*4;
11552         u_int page=get_page(vaddr);
11553         u_int vpage=get_vpage(vaddr);
11554         literal_pool(256);
11555         {
11556           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11557           assem_debug("jump_in: %x\n",start+i*4);
11558           ll_add(jump_dirty+vpage,vaddr,(void *)out);
11559           int entry_point=do_dirty_stub(i);
11560           ll_add_flags(jump_in+page,vaddr,state_rflags,(void *)entry_point);
11561           // If there was an existing entry in the hash table,
11562           // replace it with the new address.
11563           // Don't add new entries.  We'll insert the
11564           // ones that actually get used in check_addr().
11565           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
11566           if(ht_bin[0]==vaddr) {
11567             ht_bin[1]=entry_point;
11568           }
11569           if(ht_bin[2]==vaddr) {
11570             ht_bin[3]=entry_point;
11571           }
11572         }
11573       }
11574     }
11575   }
11576   // Write out the literal pool if necessary
11577   literal_pool(0);
11578   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11579   // Align code
11580   if(((u_int)out)&7) emit_addnop(13);
11581   #endif
11582   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
11583   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
11584   memcpy(copy,source,slen*4);
11585   copy+=slen*4;
11586   
11587   #ifdef __arm__
11588   __clear_cache((void *)beginning,out);
11589   #endif
11590   
11591   // If we're within 256K of the end of the buffer,
11592   // start over from the beginning. (Is 256K enough?)
11593   if((u_int)out>(u_int)BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
11594   
11595   // Trap writes to any of the pages we compiled
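  // Clearing invalid_code marks these pages as containing compiled code, so
  // stores to them will trigger invalidation of the affected blocks; on the
  // TLB build the 0x40000000 bit in memory_map appears to flag the page as
  // write-protected so stores take the checking path, and for PCSX the RAM
  // mirrors are marked as well.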
11596   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
11597     invalid_code[i]=0;
11598 #ifndef DISABLE_TLB
11599     memory_map[i]|=0x40000000;
11600     if((signed int)start>=(signed int)0xC0000000) {
11601       assert(using_tlb);
11602       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
11603       invalid_code[j]=0;
11604       memory_map[j]|=0x40000000;
11605       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
11606     }
11607 #endif
11608   }
11609   inv_code_start=inv_code_end=~0;
11610 #ifdef PCSX
11611   // for PCSX we need to mark all mirrors too
11612   if(get_page(start)<(RAM_SIZE>>12))
11613     for(i=start>>12;i<=(start+slen*4)>>12;i++)
11614       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
11615       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
11616       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
11617 #endif
11618   
11619   /* Pass 10 - Free memory by expiring oldest blocks */
11620   
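  // expirep is a 16-bit sweep position kept roughly a quarter of the
  // translation cache ahead of the output pointer.  Its top three bits pick
  // which eighth of the cache is being reclaimed, bits 11-12 pick the phase
  // (jump_in/jump_dirty lists, jump_out pointers, hash table, jump_out lists)
  // and the low 11 bits walk the 2048 page bins, unlinking anything whose
  // code lies in the region about to be overwritten.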
11621   int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
11622   while(expirep!=end)
11623   {
11624     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
11625     int base=(int)BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
11626     inv_debug("EXP: Phase %d\n",expirep);
11627     switch((expirep>>11)&3)
11628     {
11629       case 0:
11630         // Clear jump_in and jump_dirty
11631         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
11632         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
11633         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
11634         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
11635         break;
11636       case 1:
11637         // Clear pointers
11638         ll_kill_pointers(jump_out[expirep&2047],base,shift);
11639         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
11640         break;
11641       case 2:
11642         // Clear hash table
11643         for(i=0;i<32;i++) {
11644           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
11645           if((ht_bin[3]>>shift)==(base>>shift) ||
11646              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11647             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
11648             ht_bin[2]=ht_bin[3]=-1;
11649           }
11650           if((ht_bin[1]>>shift)==(base>>shift) ||
11651              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11652             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
11653             ht_bin[0]=ht_bin[2];
11654             ht_bin[1]=ht_bin[3];
11655             ht_bin[2]=ht_bin[3]=-1;
11656           }
11657         }
11658         break;
11659       case 3:
11660         // Clear jump_out
11661         #ifdef __arm__
11662         if((expirep&2047)==0) 
11663           do_clear_cache();
11664         #endif
11665         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
11666         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
11667         break;
11668     }
11669     expirep=(expirep+1)&65535;
11670   }
11671   return 0;
11672 }
11673
11674 // vim:shiftwidth=2:expandtab