drc: some debug code improvements
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <sys/mman.h>
25
26 #include "emu_if.h" //emulator interface
27
28 //#define DISASM
29 //#define assem_debug printf
30 //#define inv_debug printf
31 #define assem_debug(...)
32 #define inv_debug(...)
33
34 #ifdef __i386__
35 #include "assem_x86.h"
36 #endif
37 #ifdef __x86_64__
38 #include "assem_x64.h"
39 #endif
40 #ifdef __arm__
41 #include "assem_arm.h"
42 #endif
43
44 #define MAXBLOCK 4096
45 #define MAX_OUTPUT_BLOCK_SIZE 262144
46
47 int cycle_multiplier; // 100 for 1.0
48 #define CLOCK_ADJUST(x) (((x) * cycle_multiplier + 50) / 100)
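/* Illustrative example (not used by the code): with cycle_multiplier=150,
 * i.e. a 1.5x scale, a block costing 10 emulated cycles is charged
 * CLOCK_ADJUST(10) = (10*150+50)/100 = 15; the +50 rounds to the nearest
 * integer instead of truncating. */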
49
50 struct regstat
51 {
52   signed char regmap_entry[HOST_REGS];
53   signed char regmap[HOST_REGS];
54   uint64_t was32;
55   uint64_t is32;
56   uint64_t wasdirty;
57   uint64_t dirty;
58   uint64_t u;
59   uint64_t uu;
60   u_int wasconst;
61   u_int isconst;
62   u_int loadedconst;             // host regs that have constants loaded
63   u_int waswritten;              // MIPS regs that were used as store base before
64   uint64_t constmap[HOST_REGS];
65 };
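/* Sketch of how these fields are used by the allocator below:
 *   regmap[hr]    - MIPS/special register (see defines below) currently held
 *                   in host register hr, or -1 if free; the upper half of a
 *                   64-bit value is tracked as (reg|64)
 *   dirty,isconst - bitmasks indexed by host register hr (1<<hr)
 *   is32,u,uu     - bitmasks indexed by MIPS register number
 *   constmap[hr]  - constant value currently cached in host register hr
 * e.g.  hr=get_reg(cur->regmap,CCREG); if(hr>=0) cur->dirty|=1<<hr;  */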
66
67 struct ll_entry
68 {
69   u_int vaddr;
70   u_int reg32;
71   void *addr;
72   struct ll_entry *next;
73 };
74
75   u_int start;
76   u_int *source;
77   u_int pagelimit;
78   char insn[MAXBLOCK][10];
79   u_char itype[MAXBLOCK];
80   u_char opcode[MAXBLOCK];
81   u_char opcode2[MAXBLOCK];
82   u_char bt[MAXBLOCK];
83   u_char rs1[MAXBLOCK];
84   u_char rs2[MAXBLOCK];
85   u_char rt1[MAXBLOCK];
86   u_char rt2[MAXBLOCK];
87   u_char us1[MAXBLOCK];
88   u_char us2[MAXBLOCK];
89   u_char dep1[MAXBLOCK];
90   u_char dep2[MAXBLOCK];
91   u_char lt1[MAXBLOCK];
92   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
93   static uint64_t gte_rt[MAXBLOCK];
94   static uint64_t gte_unneeded[MAXBLOCK];
95   static u_int smrv[32]; // speculated MIPS register values
96   static u_int smrv_strong; // mask of regs that are likely to have correct values

97   static u_int smrv_weak; // same, but somewhat less likely
98   static u_int smrv_strong_next; // same, but after current insn executes
99   static u_int smrv_weak_next;
100   int imm[MAXBLOCK];
101   u_int ba[MAXBLOCK];
102   char likely[MAXBLOCK];
103   char is_ds[MAXBLOCK];
104   char ooo[MAXBLOCK];
105   uint64_t unneeded_reg[MAXBLOCK];
106   uint64_t unneeded_reg_upper[MAXBLOCK];
107   uint64_t branch_unneeded_reg[MAXBLOCK];
108   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
109   uint64_t p32[MAXBLOCK];
110   uint64_t pr32[MAXBLOCK];
111   signed char regmap_pre[MAXBLOCK][HOST_REGS];
112   signed char regmap[MAXBLOCK][HOST_REGS];
113   signed char regmap_entry[MAXBLOCK][HOST_REGS];
114   uint64_t constmap[MAXBLOCK][HOST_REGS];
115   struct regstat regs[MAXBLOCK];
116   struct regstat branch_regs[MAXBLOCK];
117   signed char minimum_free_regs[MAXBLOCK];
118   u_int needed_reg[MAXBLOCK];
119   uint64_t requires_32bit[MAXBLOCK];
120   u_int wont_dirty[MAXBLOCK];
121   u_int will_dirty[MAXBLOCK];
122   int ccadj[MAXBLOCK];
123   int slen;
124   u_int instr_addr[MAXBLOCK];
125   u_int link_addr[MAXBLOCK][3];
126   int linkcount;
127   u_int stubs[MAXBLOCK*3][8];
128   int stubcount;
129   u_int literals[1024][2];
130   int literalcount;
131   int is_delayslot;
132   int cop1_usable;
133   u_char *out;
134   struct ll_entry *jump_in[4096];
135   struct ll_entry *jump_out[4096];
136   struct ll_entry *jump_dirty[4096];
137   u_int hash_table[65536][4]  __attribute__((aligned(16)));
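/* Each hash_table bin is a 2-way set {vaddr0, codeptr0, vaddr1, codeptr1},
 * indexed by ((vaddr>>16)^vaddr)&0xFFFF as in get_addr()/get_addr_ht()
 * below; new entries go into slot 0 and demote the previous entry to slot 1. */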
138   char shadow[1048576]  __attribute__((aligned(16)));
139   void *copy;
140   int expirep;
141 #ifndef PCSX
142   u_int using_tlb;
143 #else
144   static const u_int using_tlb=0;
145 #endif
146   int new_dynarec_did_compile;
147   int new_dynarec_hacks;
148   u_int stop_after_jal;
149   extern u_char restore_candidate[512];
150   extern int cycle_count;
151
152   /* registers that may be allocated */
153   /* 1-31 gpr */
154 #define HIREG 32 // hi
155 #define LOREG 33 // lo
156 #define FSREG 34 // FPU status (FCSR)
157 #define CSREG 35 // Coprocessor status
158 #define CCREG 36 // Cycle count
159 #define INVCP 37 // Pointer to invalid_code
160 #define MMREG 38 // Pointer to memory_map
161 #define ROREG 39 // ram offset (if rdram!=0x80000000)
162 #define TEMPREG 40
163 #define FTEMP 40 // FPU temporary register
164 #define PTEMP 41 // Prefetch temporary register
165 #define TLREG 42 // TLB mapping offset
166 #define RHASH 43 // Return address hash
167 #define RHTBL 44 // Return address hash table address
168 #define RTEMP 45 // JR/JALR address register
169 #define MAXREG 45
170 #define AGEN1 46 // Address generation temporary register
171 #define AGEN2 47 // Address generation temporary register
172 #define MGEN1 48 // Maptable address generation temporary register
173 #define MGEN2 49 // Maptable address generation temporary register
174 #define BTREG 50 // Branch target temporary register
175
176   /* instruction types */
177 #define NOP 0     // No operation
178 #define LOAD 1    // Load
179 #define STORE 2   // Store
180 #define LOADLR 3  // Unaligned load
181 #define STORELR 4 // Unaligned store
182 #define MOV 5     // Move 
183 #define ALU 6     // Arithmetic/logic
184 #define MULTDIV 7 // Multiply/divide
185 #define SHIFT 8   // Shift by register
186 #define SHIFTIMM 9 // Shift by immediate
187 #define IMM16 10  // 16-bit immediate
188 #define RJUMP 11  // Unconditional jump to register
189 #define UJUMP 12  // Unconditional jump
190 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
191 #define SJUMP 14  // Conditional branch (regimm format)
192 #define COP0 15   // Coprocessor 0
193 #define COP1 16   // Coprocessor 1
194 #define C1LS 17   // Coprocessor 1 load/store
195 #define FJUMP 18  // Conditional branch (floating point)
196 #define FLOAT 19  // Floating point unit
197 #define FCONV 20  // Convert integer to float
198 #define FCOMP 21  // Floating point compare (sets FSREG)
199 #define SYSCALL 22 // SYSCALL
200 #define OTHER 23  // Other
201 #define SPAN 24   // Branch/delay slot spans 2 pages
202 #define NI 25     // Not implemented
203 #define HLECALL 26 // PCSX fake opcodes for HLE
204 #define COP2 27   // Coprocessor 2 move
205 #define C2LS 28   // Coprocessor 2 load/store
206 #define C2OP 29   // Coprocessor 2 operation
207 #define INTCALL 30 // Call interpreter to handle rare corner cases
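/* Example of how itype[]/opcode[] are consulted during the analysis passes
 * below (this exact pattern recurs throughout the file):
 *   if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
 *     ... instruction i is a branch, so i+1 is its delay slot ...
 */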
208
209   /* stubs */
210 #define CC_STUB 1
211 #define FP_STUB 2
212 #define LOADB_STUB 3
213 #define LOADH_STUB 4
214 #define LOADW_STUB 5
215 #define LOADD_STUB 6
216 #define LOADBU_STUB 7
217 #define LOADHU_STUB 8
218 #define STOREB_STUB 9
219 #define STOREH_STUB 10
220 #define STOREW_STUB 11
221 #define STORED_STUB 12
222 #define STORELR_STUB 13
223 #define INVCODE_STUB 14
224
225   /* branch codes */
226 #define TAKEN 1
227 #define NOTTAKEN 2
228 #define NULLDS 3
229
230 // asm linkage
231 int new_recompile_block(int addr);
232 void *get_addr_ht(u_int vaddr);
233 void invalidate_block(u_int block);
234 void invalidate_addr(u_int addr);
235 void remove_hash(int vaddr);
236 void jump_vaddr();
237 void dyna_linker();
238 void dyna_linker_ds();
239 void verify_code();
240 void verify_code_vm();
241 void verify_code_ds();
242 void cc_interrupt();
243 void fp_exception();
244 void fp_exception_ds();
245 void jump_syscall();
246 void jump_syscall_hle();
247 void jump_eret();
248 void jump_hlecall();
249 void jump_intcall();
250 void new_dyna_leave();
251
252 // TLB
253 void TLBWI_new();
254 void TLBWR_new();
255 void read_nomem_new();
256 void read_nomemb_new();
257 void read_nomemh_new();
258 void read_nomemd_new();
259 void write_nomem_new();
260 void write_nomemb_new();
261 void write_nomemh_new();
262 void write_nomemd_new();
263 void write_rdram_new();
264 void write_rdramb_new();
265 void write_rdramh_new();
266 void write_rdramd_new();
267 extern u_int memory_map[1048576];
268
269 // Needed by assembler
270 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
271 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
272 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
273 void load_all_regs(signed char i_regmap[]);
274 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
275 void load_regs_entry(int t);
276 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
277
278 int tracedebug=0;
279
280 //#define DEBUG_CYCLE_COUNT 1
281
282 static void tlb_hacks()
283 {
284 #ifndef DISABLE_TLB
285   // Goldeneye hack
286   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
287   {
288     u_int addr;
289     int n;
290     switch (ROM_HEADER->Country_code&0xFF) 
291     {
292       case 0x45: // U
293         addr=0x34b30;
294         break;                   
295       case 0x4A: // J 
296         addr=0x34b70;    
297         break;    
298       case 0x50: // E 
299         addr=0x329f0;
300         break;                        
301       default: 
302         // Unknown country code
303         addr=0;
304         break;
305     }
306     u_int rom_addr=(u_int)rom;
307     #ifdef ROM_COPY
308     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
309     // in the lower 4G of memory to use this hack.  Copy it if necessary.
310     if((void *)rom>(void *)0xffffffff) {
311       munmap(ROM_COPY, 67108864);
312       if(mmap(ROM_COPY, 12582912,
313               PROT_READ | PROT_WRITE,
314               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
315             -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
316       memcpy(ROM_COPY,rom,12582912);
317       rom_addr=(u_int)ROM_COPY;
318     }
319     #endif
320     if(addr) {
321       for(n=0x7F000;n<0x80000;n++) {
322         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
323       }
324     }
325   }
326 #endif
327 }
328
329 static u_int get_page(u_int vaddr)
330 {
331 #ifndef PCSX
332   u_int page=(vaddr^0x80000000)>>12;
333 #else
334   u_int page=vaddr&~0xe0000000;
335   if (page < 0x1000000)
336     page &= ~0x0e00000; // RAM mirrors
337   page>>=12;
338 #endif
339 #ifndef DISABLE_TLB
340   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
341 #endif
342   if(page>2048) page=2048+(page&2047);
343   return page;
344 }
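/* Example (PCSX build): vaddr 0x80031234 -> (0x80031234&~0xe0000000) =
 * 0x00031234, below the RAM-mirror limit, >>12 gives page 0x31; addresses
 * outside the first 2048 pages are folded into the 2048..4095 range. */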
345
346 #ifndef PCSX
347 static u_int get_vpage(u_int vaddr)
348 {
349   u_int vpage=(vaddr^0x80000000)>>12;
350 #ifndef DISABLE_TLB
351   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
352 #endif
353   if(vpage>2048) vpage=2048+(vpage&2047);
354   return vpage;
355 }
356 #else
357 // no virtual mem in PCSX
358 static u_int get_vpage(u_int vaddr)
359 {
360   return get_page(vaddr);
361 }
362 #endif
363
364 // Get address from virtual address
365 // This is called from the recompiled JR/JALR instructions
366 void *get_addr(u_int vaddr)
367 {
368   u_int page=get_page(vaddr);
369   u_int vpage=get_vpage(vaddr);
370   struct ll_entry *head;
371   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
372   head=jump_in[page];
373   while(head!=NULL) {
374     if(head->vaddr==vaddr&&head->reg32==0) {
375   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
376       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
377       ht_bin[3]=ht_bin[1];
378       ht_bin[2]=ht_bin[0];
379       ht_bin[1]=(int)head->addr;
380       ht_bin[0]=vaddr;
381       return head->addr;
382     }
383     head=head->next;
384   }
385   head=jump_dirty[vpage];
386   while(head!=NULL) {
387     if(head->vaddr==vaddr&&head->reg32==0) {
388       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
389       // Don't restore blocks which are about to expire from the cache
390       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
391       if(verify_dirty(head->addr)) {
392         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
393         invalid_code[vaddr>>12]=0;
394         inv_code_start=inv_code_end=~0;
395 #ifndef DISABLE_TLB
396         memory_map[vaddr>>12]|=0x40000000;
397 #endif
398         if(vpage<2048) {
399 #ifndef DISABLE_TLB
400           if(tlb_LUT_r[vaddr>>12]) {
401             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
402             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
403           }
404 #endif
405           restore_candidate[vpage>>3]|=1<<(vpage&7);
406         }
407         else restore_candidate[page>>3]|=1<<(page&7);
408         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
409         if(ht_bin[0]==vaddr) {
410           ht_bin[1]=(int)head->addr; // Replace existing entry
411         }
412         else
413         {
414           ht_bin[3]=ht_bin[1];
415           ht_bin[2]=ht_bin[0];
416           ht_bin[1]=(int)head->addr;
417           ht_bin[0]=vaddr;
418         }
419         return head->addr;
420       }
421     }
422     head=head->next;
423   }
424   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
425   int r=new_recompile_block(vaddr);
426   if(r==0) return get_addr(vaddr);
427   // Execute in unmapped page, generate page fault exception
428   Status|=2;
429   Cause=(vaddr<<31)|0x8;
430   EPC=(vaddr&1)?vaddr-5:vaddr;
431   BadVAddr=(vaddr&~1);
432   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
433   EntryHi=BadVAddr&0xFFFFE000;
434   return get_addr_ht(0x80000000);
435 }
436 // Look up address in hash table first
437 void *get_addr_ht(u_int vaddr)
438 {
439   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
440   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
441   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
442   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
443   return get_addr(vaddr);
444 }
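/* Illustrative call (the real callers are the recompiled JR/JALR stubs and
 * the assembly linkage code, not C):
 *   void *target = get_addr_ht(0x80030000);
 * Lookup order: hash_table fast path here, then get_addr() scans
 * jump_in[page], then tries to revalidate a block from jump_dirty[vpage]
 * via verify_dirty(), and only then falls back to new_recompile_block(). */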
445
446 void *get_addr_32(u_int vaddr,u_int flags)
447 {
448 #ifdef FORCE32
449   return get_addr(vaddr);
450 #else
451   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
452   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
453   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
454   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
455   u_int page=get_page(vaddr);
456   u_int vpage=get_vpage(vaddr);
457   struct ll_entry *head;
458   head=jump_in[page];
459   while(head!=NULL) {
460     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
461       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
462       if(head->reg32==0) {
463         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
464         if(ht_bin[0]==-1) {
465           ht_bin[1]=(int)head->addr;
466           ht_bin[0]=vaddr;
467         }else if(ht_bin[2]==-1) {
468           ht_bin[3]=(int)head->addr;
469           ht_bin[2]=vaddr;
470         }
471         //ht_bin[3]=ht_bin[1];
472         //ht_bin[2]=ht_bin[0];
473         //ht_bin[1]=(int)head->addr;
474         //ht_bin[0]=vaddr;
475       }
476       return head->addr;
477     }
478     head=head->next;
479   }
480   head=jump_dirty[vpage];
481   while(head!=NULL) {
482     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
483       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
484       // Don't restore blocks which are about to expire from the cache
485       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
486       if(verify_dirty(head->addr)) {
487         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
488         invalid_code[vaddr>>12]=0;
489         inv_code_start=inv_code_end=~0;
490         memory_map[vaddr>>12]|=0x40000000;
491         if(vpage<2048) {
492 #ifndef DISABLE_TLB
493           if(tlb_LUT_r[vaddr>>12]) {
494             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
495             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
496           }
497 #endif
498           restore_candidate[vpage>>3]|=1<<(vpage&7);
499         }
500         else restore_candidate[page>>3]|=1<<(page&7);
501         if(head->reg32==0) {
502           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
503           if(ht_bin[0]==-1) {
504             ht_bin[1]=(int)head->addr;
505             ht_bin[0]=vaddr;
506           }else if(ht_bin[2]==-1) {
507             ht_bin[3]=(int)head->addr;
508             ht_bin[2]=vaddr;
509           }
510           //ht_bin[3]=ht_bin[1];
511           //ht_bin[2]=ht_bin[0];
512           //ht_bin[1]=(int)head->addr;
513           //ht_bin[0]=vaddr;
514         }
515         return head->addr;
516       }
517     }
518     head=head->next;
519   }
520   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
521   int r=new_recompile_block(vaddr);
522   if(r==0) return get_addr(vaddr);
523   // Execute in unmapped page, generate page fault exception
524   Status|=2;
525   Cause=(vaddr<<31)|0x8;
526   EPC=(vaddr&1)?vaddr-5:vaddr;
527   BadVAddr=(vaddr&~1);
528   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
529   EntryHi=BadVAddr&0xFFFFE000;
530   return get_addr_ht(0x80000000);
531 #endif
532 }
533
534 void clear_all_regs(signed char regmap[])
535 {
536   int hr;
537   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
538 }
539
540 signed char get_reg(signed char regmap[],int r)
541 {
542   int hr;
543   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
544   return -1;
545 }
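/* Typical use, as seen throughout this file and assem_*.c:
 *   signed char hr=get_reg(i_regmap,CCREG);  // host reg holding cycle count
 *   if(hr>=0) { ... emit code that uses host register hr ... }
 * (passing r|64 looks up the host register holding the upper 32 bits of a
 * 64-bit MIPS register) */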
546
547 // Find a register that is available for two consecutive cycles
548 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
549 {
550   int hr;
551   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
552   return -1;
553 }
554
555 int count_free_regs(signed char regmap[])
556 {
557   int count=0;
558   int hr;
559   for(hr=0;hr<HOST_REGS;hr++)
560   {
561     if(hr!=EXCLUDE_REG) {
562       if(regmap[hr]<0) count++;
563     }
564   }
565   return count;
566 }
567
568 void dirty_reg(struct regstat *cur,signed char reg)
569 {
570   int hr;
571   if(!reg) return;
572   for (hr=0;hr<HOST_REGS;hr++) {
573     if((cur->regmap[hr]&63)==reg) {
574       cur->dirty|=1<<hr;
575     }
576   }
577 }
578
579 // If we dirty the lower half of a 64 bit register which is now being
580 // sign-extended, we need to dump the upper half.
581 // Note: Do this only after completion of the instruction, because
582 // some instructions may need to read the full 64-bit value even if
583 // overwriting it (eg SLTI, DSRA32).
584 static void flush_dirty_uppers(struct regstat *cur)
585 {
586   int hr,reg;
587   for (hr=0;hr<HOST_REGS;hr++) {
588     if((cur->dirty>>hr)&1) {
589       reg=cur->regmap[hr];
590       if(reg>=64) 
591         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
592     }
593   }
594 }
595
596 void set_const(struct regstat *cur,signed char reg,uint64_t value)
597 {
598   int hr;
599   if(!reg) return;
600   for (hr=0;hr<HOST_REGS;hr++) {
601     if(cur->regmap[hr]==reg) {
602       cur->isconst|=1<<hr;
603       cur->constmap[hr]=value;
604     }
605     else if((cur->regmap[hr]^64)==reg) {
606       cur->isconst|=1<<hr;
607       cur->constmap[hr]=value>>32;
608     }
609   }
610 }
611
612 void clear_const(struct regstat *cur,signed char reg)
613 {
614   int hr;
615   if(!reg) return;
616   for (hr=0;hr<HOST_REGS;hr++) {
617     if((cur->regmap[hr]&63)==reg) {
618       cur->isconst&=~(1<<hr);
619     }
620   }
621 }
622
623 int is_const(struct regstat *cur,signed char reg)
624 {
625   int hr;
626   if(reg<0) return 0;
627   if(!reg) return 1;
628   for (hr=0;hr<HOST_REGS;hr++) {
629     if((cur->regmap[hr]&63)==reg) {
630       return (cur->isconst>>hr)&1;
631     }
632   }
633   return 0;
634 }
635 uint64_t get_const(struct regstat *cur,signed char reg)
636 {
637   int hr;
638   if(!reg) return 0;
639   for (hr=0;hr<HOST_REGS;hr++) {
640     if(cur->regmap[hr]==reg) {
641       return cur->constmap[hr];
642     }
643   }
644   printf("Unknown constant in r%d\n",reg);
645   exit(1);
646 }
647
648 // Least soon needed registers
649 // Look at the next ten instructions and see which registers
650 // will be used.  Try not to reallocate these.
651 void lsn(u_char hsn[], int i, int *preferred_reg)
652 {
653   int j;
654   int b=-1;
655   for(j=0;j<9;j++)
656   {
657     if(i+j>=slen) {
658       j=slen-i-1;
659       break;
660     }
661     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
662     {
663       // Don't go past an unconditional jump
664       j++;
665       break;
666     }
667   }
668   for(;j>=0;j--)
669   {
670     if(rs1[i+j]) hsn[rs1[i+j]]=j;
671     if(rs2[i+j]) hsn[rs2[i+j]]=j;
672     if(rt1[i+j]) hsn[rt1[i+j]]=j;
673     if(rt2[i+j]) hsn[rt2[i+j]]=j;
674     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
675       // Stores can allocate zero
676       hsn[rs1[i+j]]=j;
677       hsn[rs2[i+j]]=j;
678     }
679     // On some architectures stores need invc_ptr
680     #if defined(HOST_IMM8)
681     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
682       hsn[INVCP]=j;
683     }
684     #endif
685     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
686     {
687       hsn[CCREG]=j;
688       b=j;
689     }
690   }
691   if(b>=0)
692   {
693     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
694     {
695       // Follow first branch
696       int t=(ba[i+b]-start)>>2;
697       j=7-b;if(t+j>=slen) j=slen-t-1;
698       for(;j>=0;j--)
699       {
700         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
701         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
702         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
703         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
704       }
705     }
706     // TODO: preferred register based on backward branch
707   }
708   // Delay slot should preferably not overwrite branch conditions or cycle count
709   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
710     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
711     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
712     hsn[CCREG]=1;
713     // ...or hash tables
714     hsn[RHASH]=1;
715     hsn[RHTBL]=1;
716   }
717   // Coprocessor load/store needs FTEMP, even if not declared
718   if(itype[i]==C1LS||itype[i]==C2LS) {
719     hsn[FTEMP]=0;
720   }
721   // Load L/R also uses FTEMP as a temporary register
722   if(itype[i]==LOADLR) {
723     hsn[FTEMP]=0;
724   }
725   // Also SWL/SWR/SDL/SDR
726   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
727     hsn[FTEMP]=0;
728   }
729   // Don't remove the TLB registers either
730   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS || itype[i]==C2LS) {
731     hsn[TLREG]=0;
732   }
733   // Don't remove the miniht registers
734   if(itype[i]==UJUMP||itype[i]==RJUMP)
735   {
736     hsn[RHASH]=0;
737     hsn[RHTBL]=0;
738   }
739 }
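/* Sketch of how the allocator consumes this (hypothetical local variables,
 * mirroring the alloc_reg() logic further down / in assem_*.c):
 *   u_char hsn[MAXREG+1];
 *   memset(hsn,10,sizeof(hsn));   // assumed default: "not needed in window"
 *   lsn(hsn,i,&preferred_reg);
 *   // registers with a small hsn[] value are needed soon and are poor
 *   // eviction candidates; hsn[r]==0 means r is used by instruction i itself
 */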
740
741 // We only want to allocate registers if we're going to use them again soon
742 int needed_again(int r, int i)
743 {
744   int j;
745   int b=-1;
746   int rn=10;
747   
748   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
749   {
750     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
751       return 0; // Don't need any registers if exiting the block
752   }
753   for(j=0;j<9;j++)
754   {
755     if(i+j>=slen) {
756       j=slen-i-1;
757       break;
758     }
759     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
760     {
761       // Don't go past an unconditional jump
762       j++;
763       break;
764     }
765     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
766     {
767       break;
768     }
769   }
770   for(;j>=1;j--)
771   {
772     if(rs1[i+j]==r) rn=j;
773     if(rs2[i+j]==r) rn=j;
774     if((unneeded_reg[i+j]>>r)&1) rn=10;
775     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
776     {
777       b=j;
778     }
779   }
780   /*
781   if(b>=0)
782   {
783     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
784     {
785       // Follow first branch
786       int o=rn;
787       int t=(ba[i+b]-start)>>2;
788       j=7-b;if(t+j>=slen) j=slen-t-1;
789       for(;j>=0;j--)
790       {
791         if(!((unneeded_reg[t+j]>>r)&1)) {
792           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
793           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
794         }
795         else rn=o;
796       }
797     }
798   }*/
799   if(rn<10) return 1;
800   return 0;
801 }
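/* This is used by the *_alloc routines below in the form:
 *   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
 * i.e. only keep the source register allocated if it is read again within
 * roughly the next ten instructions. */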
802
803 // Try to match register allocations at the end of a loop with those
804 // at the beginning
805 int loop_reg(int i, int r, int hr)
806 {
807   int j,k;
808   for(j=0;j<9;j++)
809   {
810     if(i+j>=slen) {
811       j=slen-i-1;
812       break;
813     }
814     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
815     {
816       // Don't go past an unconditional jump
817       j++;
818       break;
819     }
820   }
821   k=0;
822   if(i>0){
823     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
824       k--;
825   }
826   for(;k<j;k++)
827   {
828     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
829     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
830     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
831     {
832       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
833       {
834         int t=(ba[i+k]-start)>>2;
835         int reg=get_reg(regs[t].regmap_entry,r);
836         if(reg>=0) return reg;
837         //reg=get_reg(regs[t+1].regmap_entry,r);
838         //if(reg>=0) return reg;
839       }
840     }
841   }
842   return hr;
843 }
844
845
846 // Allocate every register, preserving source/target regs
847 void alloc_all(struct regstat *cur,int i)
848 {
849   int hr;
850   
851   for(hr=0;hr<HOST_REGS;hr++) {
852     if(hr!=EXCLUDE_REG) {
853       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
854          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
855       {
856         cur->regmap[hr]=-1;
857         cur->dirty&=~(1<<hr);
858       }
859       // Don't need zeros
860       if((cur->regmap[hr]&63)==0)
861       {
862         cur->regmap[hr]=-1;
863         cur->dirty&=~(1<<hr);
864       }
865     }
866   }
867 }
868
869 #ifndef FORCE32
870 void div64(int64_t dividend,int64_t divisor)
871 {
872   lo=dividend/divisor;
873   hi=dividend%divisor;
874   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
875   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
876 }
877 void divu64(uint64_t dividend,uint64_t divisor)
878 {
879   lo=dividend/divisor;
880   hi=dividend%divisor;
881   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
882   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
883 }
884
885 void mult64(uint64_t m1,uint64_t m2)
886 {
887    unsigned long long int op1, op2, op3, op4;
888    unsigned long long int result1, result2, result3, result4;
889    unsigned long long int temp1, temp2, temp3, temp4;
890    int sign = 0;
891    
892    if ((int64_t)m1 < 0) // compare as signed; the operands are MIPS 64-bit signed values
893      {
894     op2 = -m1;
895     sign = 1 - sign;
896      }
897    else op2 = m1;
898    if ((int64_t)m2 < 0)
899      {
900     op4 = -m2;
901     sign = 1 - sign;
902      }
903    else op4 = m2;
904    
905    op1 = op2 & 0xFFFFFFFF;
906    op2 = (op2 >> 32) & 0xFFFFFFFF;
907    op3 = op4 & 0xFFFFFFFF;
908    op4 = (op4 >> 32) & 0xFFFFFFFF;
909    
910    temp1 = op1 * op3;
911    temp2 = (temp1 >> 32) + op1 * op4;
912    temp3 = op2 * op3;
913    temp4 = (temp3 >> 32) + op2 * op4;
914    
915    result1 = temp1 & 0xFFFFFFFF;
916    result2 = temp2 + (temp3 & 0xFFFFFFFF);
917    result3 = (result2 >> 32) + temp4;
918    result4 = (result3 >> 32);
919    
920    lo = result1 | (result2 << 32);
921    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
922    if (sign)
923      {
924     hi = ~hi;
925     if (!lo) hi++;
926     else lo = ~lo + 1;
927      }
928 }
929
930 void multu64(uint64_t m1,uint64_t m2)
931 {
932    unsigned long long int op1, op2, op3, op4;
933    unsigned long long int result1, result2, result3, result4;
934    unsigned long long int temp1, temp2, temp3, temp4;
935    
936    op1 = m1 & 0xFFFFFFFF;
937    op2 = (m1 >> 32) & 0xFFFFFFFF;
938    op3 = m2 & 0xFFFFFFFF;
939    op4 = (m2 >> 32) & 0xFFFFFFFF;
940    
941    temp1 = op1 * op3;
942    temp2 = (temp1 >> 32) + op1 * op4;
943    temp3 = op2 * op3;
944    temp4 = (temp3 >> 32) + op2 * op4;
945    
946    result1 = temp1 & 0xFFFFFFFF;
947    result2 = temp2 + (temp3 & 0xFFFFFFFF);
948    result3 = (result2 >> 32) + temp4;
949    result4 = (result3 >> 32);
950    
951    lo = result1 | (result2 << 32);
952    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
953    
954   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
955   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
956 }
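/* The 128-bit product above is assembled from 32-bit halves:
 *   m1 = a1*2^32 + a0,  m2 = b1*2^32 + b0
 *   m1*m2 = a1*b1*2^64 + (a1*b0 + a0*b1)*2^32 + a0*b0
 * temp1..temp4 are the partial products (with high-half carries folded in)
 * and result1..result4 are the four 32-bit words of the final hi:lo pair. */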
957
958 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
959 {
960   if(bits) {
961     original<<=64-bits;
962     original>>=64-bits;
963     loaded<<=bits;
964     original|=loaded;
965   }
966   else original=loaded;
967   return original;
968 }
969 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
970 {
971   if(bits^56) {
972     original>>=64-(bits^56);
973     original<<=64-(bits^56);
974     loaded>>=bits^56;
975     original|=loaded;
976   }
977   else original=loaded;
978   return original;
979 }
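/* Example: ldl_merge(orig,loaded,16) == (loaded<<16)|(orig&0xFFFF), i.e. the
 * low `bits' bits of the old register value are kept and the loaded data is
 * placed above them; ldr_merge is the mirror image, keeping the high bits. */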
980 #endif
981
982 #ifdef __i386__
983 #include "assem_x86.c"
984 #endif
985 #ifdef __x86_64__
986 #include "assem_x64.c"
987 #endif
988 #ifdef __arm__
989 #include "assem_arm.c"
990 #endif
991
992 // Add virtual address mapping to linked list
993 void ll_add(struct ll_entry **head,int vaddr,void *addr)
994 {
995   struct ll_entry *new_entry;
996   new_entry=malloc(sizeof(struct ll_entry));
997   assert(new_entry!=NULL);
998   new_entry->vaddr=vaddr;
999   new_entry->reg32=0;
1000   new_entry->addr=addr;
1001   new_entry->next=*head;
1002   *head=new_entry;
1003 }
1004
1005 // Add virtual address mapping for 32-bit compiled block
1006 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
1007 {
1008   ll_add(head,vaddr,addr);
1009 #ifndef FORCE32
1010   (*head)->reg32=reg32;
1011 #endif
1012 }
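/* Typical use when registering a compiled block (mirrors the call in
 * clean_blocks() below; `entry' is a placeholder for the generated code's
 * entry point):
 *   ll_add_32(jump_in+page, vaddr, reg32, entry);
 */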
1013
1014 // Check if an address is already compiled
1015 // but don't return addresses which are about to expire from the cache
1016 void *check_addr(u_int vaddr)
1017 {
1018   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
1019   if(ht_bin[0]==vaddr) {
1020     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1021       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
1022   }
1023   if(ht_bin[2]==vaddr) {
1024     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1025       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
1026   }
1027   u_int page=get_page(vaddr);
1028   struct ll_entry *head;
1029   head=jump_in[page];
1030   while(head!=NULL) {
1031     if(head->vaddr==vaddr&&head->reg32==0) {
1032       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1033         // Update existing entry with current address
1034         if(ht_bin[0]==vaddr) {
1035           ht_bin[1]=(int)head->addr;
1036           return head->addr;
1037         }
1038         if(ht_bin[2]==vaddr) {
1039           ht_bin[3]=(int)head->addr;
1040           return head->addr;
1041         }
1042         // Insert into hash table with low priority.
1043         // Don't evict existing entries, as they are probably
1044         // addresses that are being accessed frequently.
1045         if(ht_bin[0]==-1) {
1046           ht_bin[1]=(int)head->addr;
1047           ht_bin[0]=vaddr;
1048         }else if(ht_bin[2]==-1) {
1049           ht_bin[3]=(int)head->addr;
1050           ht_bin[2]=vaddr;
1051         }
1052         return head->addr;
1053       }
1054     }
1055     head=head->next;
1056   }
1057   return 0;
1058 }
1059
1060 void remove_hash(int vaddr)
1061 {
1062   //printf("remove hash: %x\n",vaddr);
1063   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1064   if(ht_bin[2]==vaddr) {
1065     ht_bin[2]=ht_bin[3]=-1;
1066   }
1067   if(ht_bin[0]==vaddr) {
1068     ht_bin[0]=ht_bin[2];
1069     ht_bin[1]=ht_bin[3];
1070     ht_bin[2]=ht_bin[3]=-1;
1071   }
1072 }
1073
1074 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1075 {
1076   struct ll_entry *next;
1077   while(*head) {
1078     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1079        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1080     {
1081       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1082       remove_hash((*head)->vaddr);
1083       next=(*head)->next;
1084       free(*head);
1085       *head=next;
1086     }
1087     else
1088     {
1089       head=&((*head)->next);
1090     }
1091   }
1092 }
1093
1094 // Remove all entries from linked list
1095 void ll_clear(struct ll_entry **head)
1096 {
1097   struct ll_entry *cur;
1098   struct ll_entry *next;
1099   if((cur=*head)!=NULL) {
1100     *head=0;
1101     while(cur) {
1102       next=cur->next;
1103       free(cur);
1104       cur=next;
1105     }
1106   }
1107 }
1108
1109 // Dereference the pointers and remove if it matches
1110 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1111 {
1112   while(head) {
1113     int ptr=get_pointer(head->addr);
1114     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1115     if(((ptr>>shift)==(addr>>shift)) ||
1116        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1117     {
1118       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1119       u_int host_addr=(u_int)kill_pointer(head->addr);
1120       #ifdef __arm__
1121         needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1122       #endif
1123     }
1124     head=head->next;
1125   }
1126 }
1127
1128 // This is called when we write to a compiled block (see do_invstub)
1129 void invalidate_page(u_int page)
1130 {
1131   struct ll_entry *head;
1132   struct ll_entry *next;
1133   head=jump_in[page];
1134   jump_in[page]=0;
1135   while(head!=NULL) {
1136     inv_debug("INVALIDATE: %x\n",head->vaddr);
1137     remove_hash(head->vaddr);
1138     next=head->next;
1139     free(head);
1140     head=next;
1141   }
1142   head=jump_out[page];
1143   jump_out[page]=0;
1144   while(head!=NULL) {
1145     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1146     u_int host_addr=(u_int)kill_pointer(head->addr);
1147     #ifdef __arm__
1148       needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1149     #endif
1150     next=head->next;
1151     free(head);
1152     head=next;
1153   }
1154 }
1155
1156 static void invalidate_block_range(u_int block, u_int first, u_int last)
1157 {
1158   u_int page=get_page(block<<12);
1159   //printf("first=%d last=%d\n",first,last);
1160   invalidate_page(page);
1161   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1162   assert(last<page+5);
1163   // Invalidate the adjacent pages if a block crosses a 4K boundary
1164   while(first<page) {
1165     invalidate_page(first);
1166     first++;
1167   }
1168   for(first=page+1;first<last;first++) {
1169     invalidate_page(first);
1170   }
1171   #ifdef __arm__
1172     do_clear_cache();
1173   #endif
1174   
1175   // Don't trap writes
1176   invalid_code[block]=1;
1177 #ifndef DISABLE_TLB
1178   // If there is a valid TLB entry for this page, remove write protect
1179   if(tlb_LUT_w[block]) {
1180     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1181     // CHECK: Is this right?
1182     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1183     u_int real_block=tlb_LUT_w[block]>>12;
1184     invalid_code[real_block]=1;
1185     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1186   }
1187   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1188 #endif
1189
1190   #ifdef USE_MINI_HT
1191   memset(mini_ht,-1,sizeof(mini_ht));
1192   #endif
1193 }
1194
1195 void invalidate_block(u_int block)
1196 {
1197   u_int page=get_page(block<<12);
1198   u_int vpage=get_vpage(block<<12);
1199   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1200   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1201   u_int first,last;
1202   first=last=page;
1203   struct ll_entry *head;
1204   head=jump_dirty[vpage];
1205   //printf("page=%d vpage=%d\n",page,vpage);
1206   while(head!=NULL) {
1207     u_int start,end;
1208     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1209       get_bounds((int)head->addr,&start,&end);
1210       //printf("start: %x end: %x\n",start,end);
1211       if(page<2048&&start>=0x80000000&&end<0x80000000+RAM_SIZE) {
1212         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1213           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1214           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1215         }
1216       }
1217 #ifndef DISABLE_TLB
1218       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1219         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1220           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1221           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1222         }
1223       }
1224 #endif
1225     }
1226     head=head->next;
1227   }
1228   invalidate_block_range(block,first,last);
1229 }
1230
1231 void invalidate_addr(u_int addr)
1232 {
1233 #ifdef PCSX
1234   //static int rhits;
1235   // this check is done by the caller
1236   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
1237   u_int page=get_vpage(addr);
1238   if(page<2048) { // RAM
1239     struct ll_entry *head;
1240     u_int addr_min=~0, addr_max=0;
1241     int mask=RAM_SIZE-1;
1242     int pg1;
1243     inv_code_start=addr&~0xfff;
1244     inv_code_end=addr|0xfff;
1245     pg1=page;
1246     if (pg1>0) {
1247       // must check previous page too because of spans..
1248       pg1--;
1249       inv_code_start-=0x1000;
1250     }
1251     for(;pg1<=page;pg1++) {
1252       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1253         u_int start,end;
1254         get_bounds((int)head->addr,&start,&end);
1255         if((start&mask)<=(addr&mask)&&(addr&mask)<(end&mask)) {
1256           if(start<addr_min) addr_min=start;
1257           if(end>addr_max) addr_max=end;
1258         }
1259         else if(addr<start) {
1260           if(start<inv_code_end)
1261             inv_code_end=start-1;
1262         }
1263         else {
1264           if(end>inv_code_start)
1265             inv_code_start=end;
1266         }
1267       }
1268     }
1269     if (addr_min!=~0) {
1270       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1271       inv_code_start=inv_code_end=~0;
1272       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1273       return;
1274     }
1275     else {
1276       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
1277       return;
1278     }
1279   }
1280 #endif
1281   invalidate_block(addr>>12);
1282 }
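/* Sketch of the write path that ends up here (the real call sites are the
 * generated invstubs and memory handlers; `addr' is the written address):
 *   if(!invalid_code[addr>>12] &&
 *      !(inv_code_start<=addr && addr<=inv_code_end))  // check done by caller
 *     invalidate_addr(addr);
 */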
1283
1284 // This is called when loading a save state.
1285 // Anything could have changed, so invalidate everything.
1286 void invalidate_all_pages()
1287 {
1288   u_int page,n;
1289   for(page=0;page<4096;page++)
1290     invalidate_page(page);
1291   for(page=0;page<1048576;page++)
1292     if(!invalid_code[page]) {
1293       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1294       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1295     }
1296   #ifdef __arm__
1297   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1298   #endif
1299   #ifdef USE_MINI_HT
1300   memset(mini_ht,-1,sizeof(mini_ht));
1301   #endif
1302   #ifndef DISABLE_TLB
1303   // TLB
1304   for(page=0;page<0x100000;page++) {
1305     if(tlb_LUT_r[page]) {
1306       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1307       if(!tlb_LUT_w[page]||!invalid_code[page])
1308         memory_map[page]|=0x40000000; // Write protect
1309     }
1310     else memory_map[page]=-1;
1311     if(page==0x80000) page=0xC0000;
1312   }
1313   tlb_hacks();
1314   #endif
1315 }
1316
1317 // Add an entry to jump_out after making a link
1318 void add_link(u_int vaddr,void *src)
1319 {
1320   u_int page=get_page(vaddr);
1321   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1322   int *ptr=(int *)(src+4);
1323   assert((*ptr&0x0fff0000)==0x059f0000);
1324   ll_add(jump_out+page,vaddr,src);
1325   //int ptr=get_pointer(src);
1326   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1327 }
1328
1329 // If a code block was found to be unmodified (bit was set in
1330 // restore_candidate) and it remains unmodified (bit is clear
1331 // in invalid_code) then move the entries for that 4K page from
1332 // the dirty list to the clean list.
1333 void clean_blocks(u_int page)
1334 {
1335   struct ll_entry *head;
1336   inv_debug("INV: clean_blocks page=%d\n",page);
1337   head=jump_dirty[page];
1338   while(head!=NULL) {
1339     if(!invalid_code[head->vaddr>>12]) {
1340       // Don't restore blocks which are about to expire from the cache
1341       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1342         u_int start,end;
1343         if(verify_dirty((int)head->addr)) {
1344           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1345           u_int i;
1346           u_int inv=0;
1347           get_bounds((int)head->addr,&start,&end);
1348           if(start-(u_int)rdram<RAM_SIZE) {
1349             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1350               inv|=invalid_code[i];
1351             }
1352           }
1353 #ifndef DISABLE_TLB
1354           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1355             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1356             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1357             if(addr<start||addr>=end) inv=1;
1358           }
1359 #endif
1360           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1361             inv=1;
1362           }
1363           if(!inv) {
1364             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1365             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1366               u_int ppage=page;
1367 #ifndef DISABLE_TLB
1368               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1369 #endif
1370               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1371               //printf("page=%x, addr=%x\n",page,head->vaddr);
1372               //assert(head->vaddr>>12==(page|0x80000));
1373               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1374               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1375               if(!head->reg32) {
1376                 if(ht_bin[0]==head->vaddr) {
1377                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1378                 }
1379                 if(ht_bin[2]==head->vaddr) {
1380                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1381                 }
1382               }
1383             }
1384           }
1385         }
1386       }
1387     }
1388     head=head->next;
1389   }
1390 }
1391
1392
1393 void mov_alloc(struct regstat *current,int i)
1394 {
1395   // Note: Don't need to actually alloc the source registers
1396   if((~current->is32>>rs1[i])&1) {
1397     //alloc_reg64(current,i,rs1[i]);
1398     alloc_reg64(current,i,rt1[i]);
1399     current->is32&=~(1LL<<rt1[i]);
1400   } else {
1401     //alloc_reg(current,i,rs1[i]);
1402     alloc_reg(current,i,rt1[i]);
1403     current->is32|=(1LL<<rt1[i]);
1404   }
1405   clear_const(current,rs1[i]);
1406   clear_const(current,rt1[i]);
1407   dirty_reg(current,rt1[i]);
1408 }
1409
1410 void shiftimm_alloc(struct regstat *current,int i)
1411 {
1412   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1413   {
1414     if(rt1[i]) {
1415       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1416       else lt1[i]=rs1[i];
1417       alloc_reg(current,i,rt1[i]);
1418       current->is32|=1LL<<rt1[i];
1419       dirty_reg(current,rt1[i]);
1420       if(is_const(current,rs1[i])) {
1421         int v=get_const(current,rs1[i]);
1422         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1423         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1424         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1425       }
1426       else clear_const(current,rt1[i]);
1427     }
1428   }
1429   else
1430   {
1431     clear_const(current,rs1[i]);
1432     clear_const(current,rt1[i]);
1433   }
1434
1435   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1436   {
1437     if(rt1[i]) {
1438       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1439       alloc_reg64(current,i,rt1[i]);
1440       current->is32&=~(1LL<<rt1[i]);
1441       dirty_reg(current,rt1[i]);
1442     }
1443   }
1444   if(opcode2[i]==0x3c) // DSLL32
1445   {
1446     if(rt1[i]) {
1447       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1448       alloc_reg64(current,i,rt1[i]);
1449       current->is32&=~(1LL<<rt1[i]);
1450       dirty_reg(current,rt1[i]);
1451     }
1452   }
1453   if(opcode2[i]==0x3e) // DSRL32
1454   {
1455     if(rt1[i]) {
1456       alloc_reg64(current,i,rs1[i]);
1457       if(imm[i]==32) {
1458         alloc_reg64(current,i,rt1[i]);
1459         current->is32&=~(1LL<<rt1[i]);
1460       } else {
1461         alloc_reg(current,i,rt1[i]);
1462         current->is32|=1LL<<rt1[i];
1463       }
1464       dirty_reg(current,rt1[i]);
1465     }
1466   }
1467   if(opcode2[i]==0x3f) // DSRA32
1468   {
1469     if(rt1[i]) {
1470       alloc_reg64(current,i,rs1[i]);
1471       alloc_reg(current,i,rt1[i]);
1472       current->is32|=1LL<<rt1[i];
1473       dirty_reg(current,rt1[i]);
1474     }
1475   }
1476 }
1477
1478 void shift_alloc(struct regstat *current,int i)
1479 {
1480   if(rt1[i]) {
1481     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1482     {
1483       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1484       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1485       alloc_reg(current,i,rt1[i]);
1486       if(rt1[i]==rs2[i]) {
1487         alloc_reg_temp(current,i,-1);
1488         minimum_free_regs[i]=1;
1489       }
1490       current->is32|=1LL<<rt1[i];
1491     } else { // DSLLV/DSRLV/DSRAV
1492       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1493       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1494       alloc_reg64(current,i,rt1[i]);
1495       current->is32&=~(1LL<<rt1[i]);
1496       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1497       {
1498         alloc_reg_temp(current,i,-1);
1499         minimum_free_regs[i]=1;
1500       }
1501     }
1502     clear_const(current,rs1[i]);
1503     clear_const(current,rs2[i]);
1504     clear_const(current,rt1[i]);
1505     dirty_reg(current,rt1[i]);
1506   }
1507 }
1508
1509 void alu_alloc(struct regstat *current,int i)
1510 {
1511   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1512     if(rt1[i]) {
1513       if(rs1[i]&&rs2[i]) {
1514         alloc_reg(current,i,rs1[i]);
1515         alloc_reg(current,i,rs2[i]);
1516       }
1517       else {
1518         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1519         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1520       }
1521       alloc_reg(current,i,rt1[i]);
1522     }
1523     current->is32|=1LL<<rt1[i];
1524   }
1525   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1526     if(rt1[i]) {
1527       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1528       {
1529         alloc_reg64(current,i,rs1[i]);
1530         alloc_reg64(current,i,rs2[i]);
1531         alloc_reg(current,i,rt1[i]);
1532       } else {
1533         alloc_reg(current,i,rs1[i]);
1534         alloc_reg(current,i,rs2[i]);
1535         alloc_reg(current,i,rt1[i]);
1536       }
1537     }
1538     current->is32|=1LL<<rt1[i];
1539   }
1540   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1541     if(rt1[i]) {
1542       if(rs1[i]&&rs2[i]) {
1543         alloc_reg(current,i,rs1[i]);
1544         alloc_reg(current,i,rs2[i]);
1545       }
1546       else
1547       {
1548         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1549         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1550       }
1551       alloc_reg(current,i,rt1[i]);
1552       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1553       {
1554         if(!((current->uu>>rt1[i])&1)) {
1555           alloc_reg64(current,i,rt1[i]);
1556         }
1557         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1558           if(rs1[i]&&rs2[i]) {
1559             alloc_reg64(current,i,rs1[i]);
1560             alloc_reg64(current,i,rs2[i]);
1561           }
1562           else
1563           {
1564             // Is it really worth it to keep 64-bit values in registers?
1565             #ifdef NATIVE_64BIT
1566             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1567             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1568             #endif
1569           }
1570         }
1571         current->is32&=~(1LL<<rt1[i]);
1572       } else {
1573         current->is32|=1LL<<rt1[i];
1574       }
1575     }
1576   }
1577   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1578     if(rt1[i]) {
1579       if(rs1[i]&&rs2[i]) {
1580         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1581           alloc_reg64(current,i,rs1[i]);
1582           alloc_reg64(current,i,rs2[i]);
1583           alloc_reg64(current,i,rt1[i]);
1584         } else {
1585           alloc_reg(current,i,rs1[i]);
1586           alloc_reg(current,i,rs2[i]);
1587           alloc_reg(current,i,rt1[i]);
1588         }
1589       }
1590       else {
1591         alloc_reg(current,i,rt1[i]);
1592         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1593           // DADD used as move, or zeroing
1594           // If we have a 64-bit source, then make the target 64 bits too
1595           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1596             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1597             alloc_reg64(current,i,rt1[i]);
1598           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1599             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1600             alloc_reg64(current,i,rt1[i]);
1601           }
1602           if(opcode2[i]>=0x2e&&rs2[i]) {
1603             // DSUB used as negation - 64-bit result
1604             // If we have a 32-bit register, extend it to 64 bits
1605             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1606             alloc_reg64(current,i,rt1[i]);
1607           }
1608         }
1609       }
1610       if(rs1[i]&&rs2[i]) {
1611         current->is32&=~(1LL<<rt1[i]);
1612       } else if(rs1[i]) {
1613         current->is32&=~(1LL<<rt1[i]);
1614         if((current->is32>>rs1[i])&1)
1615           current->is32|=1LL<<rt1[i];
1616       } else if(rs2[i]) {
1617         current->is32&=~(1LL<<rt1[i]);
1618         if((current->is32>>rs2[i])&1)
1619           current->is32|=1LL<<rt1[i];
1620       } else {
1621         current->is32|=1LL<<rt1[i];
1622       }
1623     }
1624   }
1625   clear_const(current,rs1[i]);
1626   clear_const(current,rs2[i]);
1627   clear_const(current,rt1[i]);
1628   dirty_reg(current,rt1[i]);
1629 }
1630
1631 void imm16_alloc(struct regstat *current,int i)
1632 {
1633   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1634   else lt1[i]=rs1[i];
1635   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1636   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1637     current->is32&=~(1LL<<rt1[i]);
1638     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1639       // TODO: Could preserve the 32-bit flag if the immediate is zero
1640       alloc_reg64(current,i,rt1[i]);
1641       alloc_reg64(current,i,rs1[i]);
1642     }
1643     clear_const(current,rs1[i]);
1644     clear_const(current,rt1[i]);
1645   }
1646   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1647     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1648     current->is32|=1LL<<rt1[i];
1649     clear_const(current,rs1[i]);
1650     clear_const(current,rt1[i]);
1651   }
1652   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1653     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1654       if(rs1[i]!=rt1[i]) {
1655         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1656         alloc_reg64(current,i,rt1[i]);
1657         current->is32&=~(1LL<<rt1[i]);
1658       }
1659     }
1660     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1661     if(is_const(current,rs1[i])) {
1662       int v=get_const(current,rs1[i]);
1663       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1664       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1665       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1666     }
1667     else clear_const(current,rt1[i]);
1668   }
1669   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1670     if(is_const(current,rs1[i])) {
1671       int v=get_const(current,rs1[i]);
1672       set_const(current,rt1[i],v+imm[i]);
1673     }
1674     else clear_const(current,rt1[i]);
1675     current->is32|=1LL<<rt1[i];
1676   }
1677   else {
1678     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1679     current->is32|=1LL<<rt1[i];
1680   }
1681   dirty_reg(current,rt1[i]);
1682 }
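/* Example of the constant propagation set up above (illustrative sketch,
 * register numbers arbitrary):
 *
 *   lui  $t0, 0x1234          -> set_const(t0, 0x12340000)
 *   ori  $t0, $t0, 0x5678     -> source is const, so set_const(t0, 0x12345678)
 *   lw   $v0, 0($t0)          -> the load sees a constant base address
 *
 * Nothing has to be emitted for a constant-flagged target here;
 * imm16_assemble() below checks isconst and skips the movimm/orimm in that
 * case, so the pair collapses to a single constant known at compile time.
 */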
1683
1684 void load_alloc(struct regstat *current,int i)
1685 {
1686   clear_const(current,rt1[i]);
1687   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1688   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1689   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1690   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1691     alloc_reg(current,i,rt1[i]);
1692     assert(get_reg(current->regmap,rt1[i])>=0);
1693     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1694     {
1695       current->is32&=~(1LL<<rt1[i]);
1696       alloc_reg64(current,i,rt1[i]);
1697     }
1698     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1699     {
1700       current->is32&=~(1LL<<rt1[i]);
1701       alloc_reg64(current,i,rt1[i]);
1702       alloc_all(current,i);
1703       alloc_reg64(current,i,FTEMP);
1704       minimum_free_regs[i]=HOST_REGS;
1705     }
1706     else current->is32|=1LL<<rt1[i];
1707     dirty_reg(current,rt1[i]);
1708     // If using TLB, need a register for pointer to the mapping table
1709     if(using_tlb) alloc_reg(current,i,TLREG);
1710     // LWL/LWR need a temporary register for the old value
1711     if(opcode[i]==0x22||opcode[i]==0x26)
1712     {
1713       alloc_reg(current,i,FTEMP);
1714       alloc_reg_temp(current,i,-1);
1715       minimum_free_regs[i]=1;
1716     }
1717   }
1718   else
1719   {
1720     // Load to r0 or unneeded register (dummy load)
1721     // but we still need a register to calculate the address
1722     if(opcode[i]==0x22||opcode[i]==0x26)
1723     {
1724       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1725     }
1726     // If using TLB, need a register for pointer to the mapping table
1727     if(using_tlb) alloc_reg(current,i,TLREG);
1728     alloc_reg_temp(current,i,-1);
1729     minimum_free_regs[i]=1;
1730     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1731     {
1732       alloc_all(current,i);
1733       alloc_reg64(current,i,FTEMP);
1734       minimum_free_regs[i]=HOST_REGS;
1735     }
1736   }
1737 }
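/* Example of the "dummy load" path above (illustrative):
 *
 *   lw $zero, 0($a0)   - the target is r0, so no destination register is
 *                        allocated, but an address temp (and TLREG when
 *                        using_tlb) still is: the access must be range
 *                        checked and, under PCSX, may hit a hardware FIFO,
 *                        so load_assemble() still performs the read.
 */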
1738
1739 void store_alloc(struct regstat *current,int i)
1740 {
1741   clear_const(current,rs2[i]);
1742   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1743   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1744   alloc_reg(current,i,rs2[i]);
1745   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1746     alloc_reg64(current,i,rs2[i]);
1747     if(rs2[i]) alloc_reg(current,i,FTEMP);
1748   }
1749   // If using TLB, need a register for pointer to the mapping table
1750   if(using_tlb) alloc_reg(current,i,TLREG);
1751   #if defined(HOST_IMM8)
1752   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1753   else alloc_reg(current,i,INVCP);
1754   #endif
1755   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1756     alloc_reg(current,i,FTEMP);
1757   }
1758   // We need a temporary register for address generation
1759   alloc_reg_temp(current,i,-1);
1760   minimum_free_regs[i]=1;
1761 }
1762
1763 void c1ls_alloc(struct regstat *current,int i)
1764 {
1765   //clear_const(current,rs1[i]); // FIXME
1766   clear_const(current,rt1[i]);
1767   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1768   alloc_reg(current,i,CSREG); // Status
1769   alloc_reg(current,i,FTEMP);
1770   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1771     alloc_reg64(current,i,FTEMP);
1772   }
1773   // If using TLB, need a register for pointer to the mapping table
1774   if(using_tlb) alloc_reg(current,i,TLREG);
1775   #if defined(HOST_IMM8)
1776   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1777   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1778     alloc_reg(current,i,INVCP);
1779   #endif
1780   // We need a temporary register for address generation
1781   alloc_reg_temp(current,i,-1);
1782 }
1783
1784 void c2ls_alloc(struct regstat *current,int i)
1785 {
1786   clear_const(current,rt1[i]);
1787   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1788   alloc_reg(current,i,FTEMP);
1789   // If using TLB, need a register for pointer to the mapping table
1790   if(using_tlb) alloc_reg(current,i,TLREG);
1791   #if defined(HOST_IMM8)
1792   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1793   else if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1794     alloc_reg(current,i,INVCP);
1795   #endif
1796   // We need a temporary register for address generation
1797   alloc_reg_temp(current,i,-1);
1798   minimum_free_regs[i]=1;
1799 }
1800
1801 #ifndef multdiv_alloc
1802 void multdiv_alloc(struct regstat *current,int i)
1803 {
1804   //  case 0x18: MULT
1805   //  case 0x19: MULTU
1806   //  case 0x1A: DIV
1807   //  case 0x1B: DIVU
1808   //  case 0x1C: DMULT
1809   //  case 0x1D: DMULTU
1810   //  case 0x1E: DDIV
1811   //  case 0x1F: DDIVU
1812   clear_const(current,rs1[i]);
1813   clear_const(current,rs2[i]);
1814   if(rs1[i]&&rs2[i])
1815   {
1816     if((opcode2[i]&4)==0) // 32-bit
1817     {
1818       current->u&=~(1LL<<HIREG);
1819       current->u&=~(1LL<<LOREG);
1820       alloc_reg(current,i,HIREG);
1821       alloc_reg(current,i,LOREG);
1822       alloc_reg(current,i,rs1[i]);
1823       alloc_reg(current,i,rs2[i]);
1824       current->is32|=1LL<<HIREG;
1825       current->is32|=1LL<<LOREG;
1826       dirty_reg(current,HIREG);
1827       dirty_reg(current,LOREG);
1828     }
1829     else // 64-bit
1830     {
1831       current->u&=~(1LL<<HIREG);
1832       current->u&=~(1LL<<LOREG);
1833       current->uu&=~(1LL<<HIREG);
1834       current->uu&=~(1LL<<LOREG);
1835       alloc_reg64(current,i,HIREG);
1836       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1837       alloc_reg64(current,i,rs1[i]);
1838       alloc_reg64(current,i,rs2[i]);
1839       alloc_all(current,i);
1840       current->is32&=~(1LL<<HIREG);
1841       current->is32&=~(1LL<<LOREG);
1842       dirty_reg(current,HIREG);
1843       dirty_reg(current,LOREG);
1844       minimum_free_regs[i]=HOST_REGS;
1845     }
1846   }
1847   else
1848   {
1849     // Multiply by zero is zero.
1850     // MIPS does not have a divide by zero exception.
1851     // The result is undefined, so we return zero.
1852     alloc_reg(current,i,HIREG);
1853     alloc_reg(current,i,LOREG);
1854     current->is32|=1LL<<HIREG;
1855     current->is32|=1LL<<LOREG;
1856     dirty_reg(current,HIREG);
1857     dirty_reg(current,LOREG);
1858   }
1859 }
1860 #endif
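/* How the opcode2 test above splits the cases (illustrative):
 *
 *   mult  $t0,$t1   opcode2=0x18, (0x18&4)==0 -> 32-bit path: HI/LO and both
 *                   sources get ordinary allocations
 *   dmult $t0,$t1   opcode2=0x1c, (0x1c&4)!=0 -> 64-bit path: alloc_all() and
 *                   minimum_free_regs pinned to HOST_REGS
 *   mult  $t0,$zero one source is r0 -> only HI/LO are allocated, since the
 *                   result is known to be zero
 */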
1861
1862 void cop0_alloc(struct regstat *current,int i)
1863 {
1864   if(opcode2[i]==0) // MFC0
1865   {
1866     if(rt1[i]) {
1867       clear_const(current,rt1[i]);
1868       alloc_all(current,i);
1869       alloc_reg(current,i,rt1[i]);
1870       current->is32|=1LL<<rt1[i];
1871       dirty_reg(current,rt1[i]);
1872     }
1873   }
1874   else if(opcode2[i]==4) // MTC0
1875   {
1876     if(rs1[i]){
1877       clear_const(current,rs1[i]);
1878       alloc_reg(current,i,rs1[i]);
1879       alloc_all(current,i);
1880     }
1881     else {
1882       alloc_all(current,i); // FIXME: Keep r0
1883       current->u&=~1LL;
1884       alloc_reg(current,i,0);
1885     }
1886   }
1887   else
1888   {
1889     // TLBR/TLBWI/TLBWR/TLBP/ERET
1890     assert(opcode2[i]==0x10);
1891     alloc_all(current,i);
1892   }
1893   minimum_free_regs[i]=HOST_REGS;
1894 }
1895
1896 void cop1_alloc(struct regstat *current,int i)
1897 {
1898   alloc_reg(current,i,CSREG); // Load status
1899   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1900   {
1901     if(rt1[i]){
1902       clear_const(current,rt1[i]);
1903       if(opcode2[i]==1) {
1904         alloc_reg64(current,i,rt1[i]); // DMFC1
1905         current->is32&=~(1LL<<rt1[i]);
1906       }else{
1907         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1908         current->is32|=1LL<<rt1[i];
1909       }
1910       dirty_reg(current,rt1[i]);
1911     }
1912     alloc_reg_temp(current,i,-1);
1913   }
1914   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1915   {
1916     if(rs1[i]){
1917       clear_const(current,rs1[i]);
1918       if(opcode2[i]==5)
1919         alloc_reg64(current,i,rs1[i]); // DMTC1
1920       else
1921         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1922       alloc_reg_temp(current,i,-1);
1923     }
1924     else {
1925       current->u&=~1LL;
1926       alloc_reg(current,i,0);
1927       alloc_reg_temp(current,i,-1);
1928     }
1929   }
1930   minimum_free_regs[i]=1;
1931 }
1932 void fconv_alloc(struct regstat *current,int i)
1933 {
1934   alloc_reg(current,i,CSREG); // Load status
1935   alloc_reg_temp(current,i,-1);
1936   minimum_free_regs[i]=1;
1937 }
1938 void float_alloc(struct regstat *current,int i)
1939 {
1940   alloc_reg(current,i,CSREG); // Load status
1941   alloc_reg_temp(current,i,-1);
1942   minimum_free_regs[i]=1;
1943 }
1944 void c2op_alloc(struct regstat *current,int i)
1945 {
1946   alloc_reg_temp(current,i,-1);
1947 }
1948 void fcomp_alloc(struct regstat *current,int i)
1949 {
1950   alloc_reg(current,i,CSREG); // Load status
1951   alloc_reg(current,i,FSREG); // Load flags
1952   dirty_reg(current,FSREG); // Flag will be modified
1953   alloc_reg_temp(current,i,-1);
1954   minimum_free_regs[i]=1;
1955 }
1956
1957 void syscall_alloc(struct regstat *current,int i)
1958 {
1959   alloc_cc(current,i);
1960   dirty_reg(current,CCREG);
1961   alloc_all(current,i);
1962   minimum_free_regs[i]=HOST_REGS;
1963   current->isconst=0;
1964 }
1965
1966 void delayslot_alloc(struct regstat *current,int i)
1967 {
1968   switch(itype[i]) {
1969     case UJUMP:
1970     case CJUMP:
1971     case SJUMP:
1972     case RJUMP:
1973     case FJUMP:
1974     case SYSCALL:
1975     case HLECALL:
1976     case SPAN:
1977       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1978       printf("Disabled speculative precompilation\n");
1979       stop_after_jal=1;
1980       break;
1981     case IMM16:
1982       imm16_alloc(current,i);
1983       break;
1984     case LOAD:
1985     case LOADLR:
1986       load_alloc(current,i);
1987       break;
1988     case STORE:
1989     case STORELR:
1990       store_alloc(current,i);
1991       break;
1992     case ALU:
1993       alu_alloc(current,i);
1994       break;
1995     case SHIFT:
1996       shift_alloc(current,i);
1997       break;
1998     case MULTDIV:
1999       multdiv_alloc(current,i);
2000       break;
2001     case SHIFTIMM:
2002       shiftimm_alloc(current,i);
2003       break;
2004     case MOV:
2005       mov_alloc(current,i);
2006       break;
2007     case COP0:
2008       cop0_alloc(current,i);
2009       break;
2010     case COP1:
2011     case COP2:
2012       cop1_alloc(current,i);
2013       break;
2014     case C1LS:
2015       c1ls_alloc(current,i);
2016       break;
2017     case C2LS:
2018       c2ls_alloc(current,i);
2019       break;
2020     case FCONV:
2021       fconv_alloc(current,i);
2022       break;
2023     case FLOAT:
2024       float_alloc(current,i);
2025       break;
2026     case FCOMP:
2027       fcomp_alloc(current,i);
2028       break;
2029     case C2OP:
2030       c2op_alloc(current,i);
2031       break;
2032   }
2033 }
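/* Example (illustrative): for a sequence like
 *
 *   jr $ra
 *   lw $v0, 0($sp)     <- delay slot, itype==LOAD
 *
 * delayslot_alloc() runs for the instruction in the slot and simply
 * dispatches to load_alloc() with the register state live at the branch.
 * The jump-in-delay-slot case at the top is the pathological one; rather
 * than trying to recompile it, it disables further speculative
 * precompilation via stop_after_jal.
 */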
2034
2035 // Special case where a branch and delay slot span two pages in virtual memory
2036 static void pagespan_alloc(struct regstat *current,int i)
2037 {
2038   current->isconst=0;
2039   current->wasconst=0;
2040   regs[i].wasconst=0;
2041   minimum_free_regs[i]=HOST_REGS;
2042   alloc_all(current,i);
2043   alloc_cc(current,i);
2044   dirty_reg(current,CCREG);
2045   if(opcode[i]==3) // JAL
2046   {
2047     alloc_reg(current,i,31);
2048     dirty_reg(current,31);
2049   }
2050   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
2051   {
2052     alloc_reg(current,i,rs1[i]);
2053     if (rt1[i]!=0) {
2054       alloc_reg(current,i,rt1[i]);
2055       dirty_reg(current,rt1[i]);
2056     }
2057   }
2058   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
2059   {
2060     if(rs1[i]) alloc_reg(current,i,rs1[i]);
2061     if(rs2[i]) alloc_reg(current,i,rs2[i]);
2062     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
2063     {
2064       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
2065       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
2066     }
2067   }
2068   else
2069   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
2070   {
2071     if(rs1[i]) alloc_reg(current,i,rs1[i]);
2072     if(!((current->is32>>rs1[i])&1))
2073     {
2074       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
2075     }
2076   }
2077   else
2078   if(opcode[i]==0x11) // BC1
2079   {
2080     alloc_reg(current,i,FSREG);
2081     alloc_reg(current,i,CSREG);
2082   }
2083   //else ...
2084 }
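/* Concrete case for the pagespan special case above (addresses assume 4KB
 * pages purely for illustration):
 *
 *   0x80000ffc:  beq $t0, $zero, target    <- last word of one page
 *   0x80001000:  nop                       <- delay slot on the next page
 *
 * Since the two pages can be mapped or invalidated independently, no state
 * is carried across: constants are dropped, alloc_all() grabs every host
 * register, and minimum_free_regs is pinned to HOST_REGS.
 */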
2085
2086 add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
2087 {
2088   stubs[stubcount][0]=type;
2089   stubs[stubcount][1]=addr;
2090   stubs[stubcount][2]=retaddr;
2091   stubs[stubcount][3]=a;
2092   stubs[stubcount][4]=b;
2093   stubs[stubcount][5]=c;
2094   stubs[stubcount][6]=d;
2095   stubs[stubcount][7]=e;
2096   stubcount++;
2097 }
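/* Each stubs[] entry is one pending slow-path fixup: [0]=stub type, [1]=the
 * address of the branch to patch, [2]=the return address in the output
 * buffer, [3..7]=type-specific arguments.  A typical call from the load path
 * later in this file looks like:
 *
 *   add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
 *
 * i.e. instruction index, address register, register state, cycle count and
 * the live host-register list for the out-of-line handler.
 */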
2098
2099 // Write out a single register
2100 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
2101 {
2102   int hr;
2103   for(hr=0;hr<HOST_REGS;hr++) {
2104     if(hr!=EXCLUDE_REG) {
2105       if((regmap[hr]&63)==r) {
2106         if((dirty>>hr)&1) {
2107           if(regmap[hr]<64) {
2108             emit_storereg(r,hr);
2109 #ifndef FORCE32
2110             if((is32>>regmap[hr])&1) {
2111               emit_sarimm(hr,31,hr);
2112               emit_storereg(r|64,hr);
2113             }
2114 #endif
2115           }else{
2116             emit_storereg(r|64,hr);
2117           }
2118         }
2119       }
2120     }
2121   }
2122 }
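/* Usage sketch (illustrative; the real call sites pass whatever register
 * state applies at that point):
 *
 *   wb_register(rt1[i], regs[i].regmap, regs[i].dirty, regs[i].is32);
 *
 * writes the low word of rt1[i] back to the register file from whichever
 * host register maps it, and on 64-bit-capable builds also writes the high
 * word - either from the r|64 mapping, or by sign-extending the low word in
 * place (emit_sarimm) when the value is flagged 32-bit.
 */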
2123
2124 int mchecksum()
2125 {
2126   //if(!tracedebug) return 0;
2127   int i;
2128   int sum=0;
2129   for(i=0;i<2097152;i++) {
2130     unsigned int temp=sum;
2131     sum<<=1;
2132     sum|=(~temp)>>31;
2133     sum^=((u_int *)rdram)[i];
2134   }
2135   return sum;
2136 }
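/* One iteration of the mixing loop, worked through (sum and the RDRAM word
 * are just example values):
 *
 *   sum = 0x00000001, word = 0x12345678
 *     temp = 0x00000001
 *     sum <<= 1              -> 0x00000002
 *     sum |= (~temp) >> 31   -> 0x00000003   (inverse of the old top bit)
 *     sum ^= word            -> 0x1234567b
 *
 * This gives a cheap, order-sensitive hash over the first 8MB of rdram
 * (2097152 words), used only for the TRACE output below.
 */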
2137 int rchecksum()
2138 {
2139   int i;
2140   int sum=0;
2141   for(i=0;i<64;i++)
2142     sum^=((u_int *)reg)[i];
2143   return sum;
2144 }
2145 void rlist()
2146 {
2147   int i;
2148   printf("TRACE: ");
2149   for(i=0;i<32;i++)
2150     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
2151   printf("\n");
2152 #ifndef DISABLE_COP1
2153   printf("TRACE: ");
2154   for(i=0;i<32;i++)
2155     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
2156   printf("\n");
2157 #endif
2158 }
2159
2160 void enabletrace()
2161 {
2162   tracedebug=1;
2163 }
2164
2165 void memdebug(int i)
2166 {
2167   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
2168   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
2169   //rlist();
2170   //if(tracedebug) {
2171   //if(Count>=-2084597794) {
2172   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
2173   //if(0) {
2174     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
2175     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2176     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2177     rlist();
2178     #ifdef __i386__
2179     printf("TRACE: %x\n",(&i)[-1]);
2180     #endif
2181     #ifdef __arm__
2182     int j;
2183     printf("TRACE: %x \n",(&j)[10]);
2184     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2185     #endif
2186     //fflush(stdout);
2187   }
2188   //printf("TRACE: %x\n",(&i)[-1]);
2189 }
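/* memdebug() is gated by the hard-coded Count window above (the tracedebug
 * check is currently commented out; enabletrace() sets it for when that gate
 * is restored).  To actually reach it during a run, re-enable one of the
 * commented-out instrumentation blocks in load_assemble()/store_assemble()
 * further down, which bracket the call with
 *   save_regs(0x100f); ... emit_call((int)memdebug); restore_regs(0x100f);
 * so the generated code can call back into C without clobbering host state.
 */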
2190
2191 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2192 {
2193   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2194 }
2195
2196 void alu_assemble(int i,struct regstat *i_regs)
2197 {
2198   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2199     if(rt1[i]) {
2200       signed char s1,s2,t;
2201       t=get_reg(i_regs->regmap,rt1[i]);
2202       if(t>=0) {
2203         s1=get_reg(i_regs->regmap,rs1[i]);
2204         s2=get_reg(i_regs->regmap,rs2[i]);
2205         if(rs1[i]&&rs2[i]) {
2206           assert(s1>=0);
2207           assert(s2>=0);
2208           if(opcode2[i]&2) emit_sub(s1,s2,t);
2209           else emit_add(s1,s2,t);
2210         }
2211         else if(rs1[i]) {
2212           if(s1>=0) emit_mov(s1,t);
2213           else emit_loadreg(rs1[i],t);
2214         }
2215         else if(rs2[i]) {
2216           if(s2>=0) {
2217             if(opcode2[i]&2) emit_neg(s2,t);
2218             else emit_mov(s2,t);
2219           }
2220           else {
2221             emit_loadreg(rs2[i],t);
2222             if(opcode2[i]&2) emit_neg(t,t);
2223           }
2224         }
2225         else emit_zeroreg(t);
2226       }
2227     }
2228   }
2229   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2230     if(rt1[i]) {
2231       signed char s1l,s2l,s1h,s2h,tl,th;
2232       tl=get_reg(i_regs->regmap,rt1[i]);
2233       th=get_reg(i_regs->regmap,rt1[i]|64);
2234       if(tl>=0) {
2235         s1l=get_reg(i_regs->regmap,rs1[i]);
2236         s2l=get_reg(i_regs->regmap,rs2[i]);
2237         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2238         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2239         if(rs1[i]&&rs2[i]) {
2240           assert(s1l>=0);
2241           assert(s2l>=0);
2242           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2243           else emit_adds(s1l,s2l,tl);
2244           if(th>=0) {
2245             #ifdef INVERTED_CARRY
2246             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2247             #else
2248             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2249             #endif
2250             else emit_add(s1h,s2h,th);
2251           }
2252         }
2253         else if(rs1[i]) {
2254           if(s1l>=0) emit_mov(s1l,tl);
2255           else emit_loadreg(rs1[i],tl);
2256           if(th>=0) {
2257             if(s1h>=0) emit_mov(s1h,th);
2258             else emit_loadreg(rs1[i]|64,th);
2259           }
2260         }
2261         else if(rs2[i]) {
2262           if(s2l>=0) {
2263             if(opcode2[i]&2) emit_negs(s2l,tl);
2264             else emit_mov(s2l,tl);
2265           }
2266           else {
2267             emit_loadreg(rs2[i],tl);
2268             if(opcode2[i]&2) emit_negs(tl,tl);
2269           }
2270           if(th>=0) {
2271             #ifdef INVERTED_CARRY
2272             if(s2h>=0) emit_mov(s2h,th);
2273             else emit_loadreg(rs2[i]|64,th);
2274             if(opcode2[i]&2) {
2275               emit_adcimm(-1,th); // x86 has inverted carry flag
2276               emit_not(th,th);
2277             }
2278             #else
2279             if(opcode2[i]&2) {
2280               if(s2h>=0) emit_rscimm(s2h,0,th);
2281               else {
2282                 emit_loadreg(rs2[i]|64,th);
2283                 emit_rscimm(th,0,th);
2284               }
2285             }else{
2286               if(s2h>=0) emit_mov(s2h,th);
2287               else emit_loadreg(rs2[i]|64,th);
2288             }
2289             #endif
2290           }
2291         }
2292         else {
2293           emit_zeroreg(tl);
2294           if(th>=0) emit_zeroreg(th);
2295         }
2296       }
2297     }
2298   }
2299   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2300     if(rt1[i]) {
2301       signed char s1l,s1h,s2l,s2h,t;
2302       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2303       {
2304         t=get_reg(i_regs->regmap,rt1[i]);
2305         //assert(t>=0);
2306         if(t>=0) {
2307           s1l=get_reg(i_regs->regmap,rs1[i]);
2308           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2309           s2l=get_reg(i_regs->regmap,rs2[i]);
2310           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2311           if(rs2[i]==0) // rx<r0
2312           {
2313             assert(s1h>=0);
2314             if(opcode2[i]==0x2a) // SLT
2315               emit_shrimm(s1h,31,t);
2316             else // SLTU (unsigned cannot be less than zero)
2317               emit_zeroreg(t);
2318           }
2319           else if(rs1[i]==0) // r0<rx
2320           {
2321             assert(s2h>=0);
2322             if(opcode2[i]==0x2a) // SLT
2323               emit_set_gz64_32(s2h,s2l,t);
2324             else // SLTU (set if not zero)
2325               emit_set_nz64_32(s2h,s2l,t);
2326           }
2327           else {
2328             assert(s1l>=0);assert(s1h>=0);
2329             assert(s2l>=0);assert(s2h>=0);
2330             if(opcode2[i]==0x2a) // SLT
2331               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2332             else // SLTU
2333               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2334           }
2335         }
2336       } else {
2337         t=get_reg(i_regs->regmap,rt1[i]);
2338         //assert(t>=0);
2339         if(t>=0) {
2340           s1l=get_reg(i_regs->regmap,rs1[i]);
2341           s2l=get_reg(i_regs->regmap,rs2[i]);
2342           if(rs2[i]==0) // rx<r0
2343           {
2344             assert(s1l>=0);
2345             if(opcode2[i]==0x2a) // SLT
2346               emit_shrimm(s1l,31,t);
2347             else // SLTU (unsigned cannot be less than zero)
2348               emit_zeroreg(t);
2349           }
2350           else if(rs1[i]==0) // r0<rx
2351           {
2352             assert(s2l>=0);
2353             if(opcode2[i]==0x2a) // SLT
2354               emit_set_gz32(s2l,t);
2355             else // SLTU (set if not zero)
2356               emit_set_nz32(s2l,t);
2357           }
2358           else{
2359             assert(s1l>=0);assert(s2l>=0);
2360             if(opcode2[i]==0x2a) // SLT
2361               emit_set_if_less32(s1l,s2l,t);
2362             else // SLTU
2363               emit_set_if_carry32(s1l,s2l,t);
2364           }
2365         }
2366       }
2367     }
2368   }
2369   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2370     if(rt1[i]) {
2371       signed char s1l,s1h,s2l,s2h,th,tl;
2372       tl=get_reg(i_regs->regmap,rt1[i]);
2373       th=get_reg(i_regs->regmap,rt1[i]|64);
2374       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2375       {
2376         assert(tl>=0);
2377         if(tl>=0) {
2378           s1l=get_reg(i_regs->regmap,rs1[i]);
2379           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2380           s2l=get_reg(i_regs->regmap,rs2[i]);
2381           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2382           if(rs1[i]&&rs2[i]) {
2383             assert(s1l>=0);assert(s1h>=0);
2384             assert(s2l>=0);assert(s2h>=0);
2385             if(opcode2[i]==0x24) { // AND
2386               emit_and(s1l,s2l,tl);
2387               emit_and(s1h,s2h,th);
2388             } else
2389             if(opcode2[i]==0x25) { // OR
2390               emit_or(s1l,s2l,tl);
2391               emit_or(s1h,s2h,th);
2392             } else
2393             if(opcode2[i]==0x26) { // XOR
2394               emit_xor(s1l,s2l,tl);
2395               emit_xor(s1h,s2h,th);
2396             } else
2397             if(opcode2[i]==0x27) { // NOR
2398               emit_or(s1l,s2l,tl);
2399               emit_or(s1h,s2h,th);
2400               emit_not(tl,tl);
2401               emit_not(th,th);
2402             }
2403           }
2404           else
2405           {
2406             if(opcode2[i]==0x24) { // AND
2407               emit_zeroreg(tl);
2408               emit_zeroreg(th);
2409             } else
2410             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2411               if(rs1[i]){
2412                 if(s1l>=0) emit_mov(s1l,tl);
2413                 else emit_loadreg(rs1[i],tl);
2414                 if(s1h>=0) emit_mov(s1h,th);
2415                 else emit_loadreg(rs1[i]|64,th);
2416               }
2417               else
2418               if(rs2[i]){
2419                 if(s2l>=0) emit_mov(s2l,tl);
2420                 else emit_loadreg(rs2[i],tl);
2421                 if(s2h>=0) emit_mov(s2h,th);
2422                 else emit_loadreg(rs2[i]|64,th);
2423               }
2424               else{
2425                 emit_zeroreg(tl);
2426                 emit_zeroreg(th);
2427               }
2428             } else
2429             if(opcode2[i]==0x27) { // NOR
2430               if(rs1[i]){
2431                 if(s1l>=0) emit_not(s1l,tl);
2432                 else{
2433                   emit_loadreg(rs1[i],tl);
2434                   emit_not(tl,tl);
2435                 }
2436                 if(s1h>=0) emit_not(s1h,th);
2437                 else{
2438                   emit_loadreg(rs1[i]|64,th);
2439                   emit_not(th,th);
2440                 }
2441               }
2442               else
2443               if(rs2[i]){
2444                 if(s2l>=0) emit_not(s2l,tl);
2445                 else{
2446                   emit_loadreg(rs2[i],tl);
2447                   emit_not(tl,tl);
2448                 }
2449                 if(s2h>=0) emit_not(s2h,th);
2450                 else{
2451                   emit_loadreg(rs2[i]|64,th);
2452                   emit_not(th,th);
2453                 }
2454               }
2455               else {
2456                 emit_movimm(-1,tl);
2457                 emit_movimm(-1,th);
2458               }
2459             }
2460           }
2461         }
2462       }
2463       else
2464       {
2465         // 32 bit
2466         if(tl>=0) {
2467           s1l=get_reg(i_regs->regmap,rs1[i]);
2468           s2l=get_reg(i_regs->regmap,rs2[i]);
2469           if(rs1[i]&&rs2[i]) {
2470             assert(s1l>=0);
2471             assert(s2l>=0);
2472             if(opcode2[i]==0x24) { // AND
2473               emit_and(s1l,s2l,tl);
2474             } else
2475             if(opcode2[i]==0x25) { // OR
2476               emit_or(s1l,s2l,tl);
2477             } else
2478             if(opcode2[i]==0x26) { // XOR
2479               emit_xor(s1l,s2l,tl);
2480             } else
2481             if(opcode2[i]==0x27) { // NOR
2482               emit_or(s1l,s2l,tl);
2483               emit_not(tl,tl);
2484             }
2485           }
2486           else
2487           {
2488             if(opcode2[i]==0x24) { // AND
2489               emit_zeroreg(tl);
2490             } else
2491             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2492               if(rs1[i]){
2493                 if(s1l>=0) emit_mov(s1l,tl);
2494                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2495               }
2496               else
2497               if(rs2[i]){
2498                 if(s2l>=0) emit_mov(s2l,tl);
2499                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2500               }
2501               else emit_zeroreg(tl);
2502             } else
2503             if(opcode2[i]==0x27) { // NOR
2504               if(rs1[i]){
2505                 if(s1l>=0) emit_not(s1l,tl);
2506                 else {
2507                   emit_loadreg(rs1[i],tl);
2508                   emit_not(tl,tl);
2509                 }
2510               }
2511               else
2512               if(rs2[i]){
2513                 if(s2l>=0) emit_not(s2l,tl);
2514                 else {
2515                   emit_loadreg(rs2[i],tl);
2516                   emit_not(tl,tl);
2517                 }
2518               }
2519               else emit_movimm(-1,tl);
2520             }
2521           }
2522         }
2523       }
2524     }
2525   }
2526 }
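/* Emission examples for the 32-bit ADD/SUB block at the top (illustrative,
 * assuming all operands are already in host registers):
 *
 *   addu $t2,$t3,$t4       -> emit_add(s1,s2,t)
 *   subu $t2,$t3,$t4       -> emit_sub(s1,s2,t)   (opcode2&2 set)
 *   subu $t2,$zero,$t4     -> emit_neg(s2,t)      (rs1==0 special case)
 *   addu $t2,$zero,$zero   -> emit_zeroreg(t)
 */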
2527
2528 void imm16_assemble(int i,struct regstat *i_regs)
2529 {
2530   if (opcode[i]==0x0f) { // LUI
2531     if(rt1[i]) {
2532       signed char t;
2533       t=get_reg(i_regs->regmap,rt1[i]);
2534       //assert(t>=0);
2535       if(t>=0) {
2536         if(!((i_regs->isconst>>t)&1))
2537           emit_movimm(imm[i]<<16,t);
2538       }
2539     }
2540   }
2541   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2542     if(rt1[i]) {
2543       signed char s,t;
2544       t=get_reg(i_regs->regmap,rt1[i]);
2545       s=get_reg(i_regs->regmap,rs1[i]);
2546       if(rs1[i]) {
2547         //assert(t>=0);
2548         //assert(s>=0);
2549         if(t>=0) {
2550           if(!((i_regs->isconst>>t)&1)) {
2551             if(s<0) {
2552               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2553               emit_addimm(t,imm[i],t);
2554             }else{
2555               if(!((i_regs->wasconst>>s)&1))
2556                 emit_addimm(s,imm[i],t);
2557               else
2558                 emit_movimm(constmap[i][s]+imm[i],t);
2559             }
2560           }
2561         }
2562       } else {
2563         if(t>=0) {
2564           if(!((i_regs->isconst>>t)&1))
2565             emit_movimm(imm[i],t);
2566         }
2567       }
2568     }
2569   }
2570   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2571     if(rt1[i]) {
2572       signed char sh,sl,th,tl;
2573       th=get_reg(i_regs->regmap,rt1[i]|64);
2574       tl=get_reg(i_regs->regmap,rt1[i]);
2575       sh=get_reg(i_regs->regmap,rs1[i]|64);
2576       sl=get_reg(i_regs->regmap,rs1[i]);
2577       if(tl>=0) {
2578         if(rs1[i]) {
2579           assert(sh>=0);
2580           assert(sl>=0);
2581           if(th>=0) {
2582             emit_addimm64_32(sh,sl,imm[i],th,tl);
2583           }
2584           else {
2585             emit_addimm(sl,imm[i],tl);
2586           }
2587         } else {
2588           emit_movimm(imm[i],tl);
2589           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2590         }
2591       }
2592     }
2593   }
2594   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2595     if(rt1[i]) {
2596       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2597       signed char sh,sl,t;
2598       t=get_reg(i_regs->regmap,rt1[i]);
2599       sh=get_reg(i_regs->regmap,rs1[i]|64);
2600       sl=get_reg(i_regs->regmap,rs1[i]);
2601       //assert(t>=0);
2602       if(t>=0) {
2603         if(rs1[i]>0) {
2604           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2605           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2606             if(opcode[i]==0x0a) { // SLTI
2607               if(sl<0) {
2608                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2609                 emit_slti32(t,imm[i],t);
2610               }else{
2611                 emit_slti32(sl,imm[i],t);
2612               }
2613             }
2614             else { // SLTIU
2615               if(sl<0) {
2616                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2617                 emit_sltiu32(t,imm[i],t);
2618               }else{
2619                 emit_sltiu32(sl,imm[i],t);
2620               }
2621             }
2622           }else{ // 64-bit
2623             assert(sl>=0);
2624             if(opcode[i]==0x0a) // SLTI
2625               emit_slti64_32(sh,sl,imm[i],t);
2626             else // SLTIU
2627               emit_sltiu64_32(sh,sl,imm[i],t);
2628           }
2629         }else{
2630           // SLTI(U) with r0 is just stupid,
2631           // nonetheless examples can be found
2632           if(opcode[i]==0x0a) // SLTI
2633             if(0<imm[i]) emit_movimm(1,t);
2634             else emit_zeroreg(t);
2635           else // SLTIU
2636           {
2637             if(imm[i]) emit_movimm(1,t);
2638             else emit_zeroreg(t);
2639           }
2640         }
2641       }
2642     }
2643   }
2644   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2645     if(rt1[i]) {
2646       signed char sh,sl,th,tl;
2647       th=get_reg(i_regs->regmap,rt1[i]|64);
2648       tl=get_reg(i_regs->regmap,rt1[i]);
2649       sh=get_reg(i_regs->regmap,rs1[i]|64);
2650       sl=get_reg(i_regs->regmap,rs1[i]);
2651       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2652         if(opcode[i]==0x0c) //ANDI
2653         {
2654           if(rs1[i]) {
2655             if(sl<0) {
2656               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2657               emit_andimm(tl,imm[i],tl);
2658             }else{
2659               if(!((i_regs->wasconst>>sl)&1))
2660                 emit_andimm(sl,imm[i],tl);
2661               else
2662                 emit_movimm(constmap[i][sl]&imm[i],tl);
2663             }
2664           }
2665           else
2666             emit_zeroreg(tl);
2667           if(th>=0) emit_zeroreg(th);
2668         }
2669         else
2670         {
2671           if(rs1[i]) {
2672             if(sl<0) {
2673               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2674             }
2675             if(th>=0) {
2676               if(sh<0) {
2677                 emit_loadreg(rs1[i]|64,th);
2678               }else{
2679                 emit_mov(sh,th);
2680               }
2681             }
2682             if(opcode[i]==0x0d) //ORI
2683             if(sl<0) {
2684               emit_orimm(tl,imm[i],tl);
2685             }else{
2686               if(!((i_regs->wasconst>>sl)&1))
2687                 emit_orimm(sl,imm[i],tl);
2688               else
2689                 emit_movimm(constmap[i][sl]|imm[i],tl);
2690             }
2691             if(opcode[i]==0x0e) //XORI
2692             if(sl<0) {
2693               emit_xorimm(tl,imm[i],tl);
2694             }else{
2695               if(!((i_regs->wasconst>>sl)&1))
2696                 emit_xorimm(sl,imm[i],tl);
2697               else
2698                 emit_movimm(constmap[i][sl]^imm[i],tl);
2699             }
2700           }
2701           else {
2702             emit_movimm(imm[i],tl);
2703             if(th>=0) emit_zeroreg(th);
2704           }
2705         }
2706       }
2707     }
2708   }
2709 }
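/* Follow-on to the note after imm16_alloc(): for
 *
 *   lui $t0, 0x1234 ; ori $t0, $t0, 0x5678
 *
 * both results are flagged const during allocation, so the isconst checks
 * above emit nothing for either instruction; the folded value 0x12345678 is
 * only materialized where it is actually needed, e.g. via
 * emit_movimm(constmap[i][sl]|imm[i],tl) when the source was constant but
 * the destination register has to hold a real value.
 */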
2710
2711 void shiftimm_assemble(int i,struct regstat *i_regs)
2712 {
2713   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2714   {
2715     if(rt1[i]) {
2716       signed char s,t;
2717       t=get_reg(i_regs->regmap,rt1[i]);
2718       s=get_reg(i_regs->regmap,rs1[i]);
2719       //assert(t>=0);
2720       if(t>=0&&!((i_regs->isconst>>t)&1)){
2721         if(rs1[i]==0)
2722         {
2723           emit_zeroreg(t);
2724         }
2725         else
2726         {
2727           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2728           if(imm[i]) {
2729             if(opcode2[i]==0) // SLL
2730             {
2731               emit_shlimm(s<0?t:s,imm[i],t);
2732             }
2733             if(opcode2[i]==2) // SRL
2734             {
2735               emit_shrimm(s<0?t:s,imm[i],t);
2736             }
2737             if(opcode2[i]==3) // SRA
2738             {
2739               emit_sarimm(s<0?t:s,imm[i],t);
2740             }
2741           }else{
2742             // Shift by zero
2743             if(s>=0 && s!=t) emit_mov(s,t);
2744           }
2745         }
2746       }
2747       //emit_storereg(rt1[i],t); //DEBUG
2748     }
2749   }
2750   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2751   {
2752     if(rt1[i]) {
2753       signed char sh,sl,th,tl;
2754       th=get_reg(i_regs->regmap,rt1[i]|64);
2755       tl=get_reg(i_regs->regmap,rt1[i]);
2756       sh=get_reg(i_regs->regmap,rs1[i]|64);
2757       sl=get_reg(i_regs->regmap,rs1[i]);
2758       if(tl>=0) {
2759         if(rs1[i]==0)
2760         {
2761           emit_zeroreg(tl);
2762           if(th>=0) emit_zeroreg(th);
2763         }
2764         else
2765         {
2766           assert(sl>=0);
2767           assert(sh>=0);
2768           if(imm[i]) {
2769             if(opcode2[i]==0x38) // DSLL
2770             {
2771               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2772               emit_shlimm(sl,imm[i],tl);
2773             }
2774             if(opcode2[i]==0x3a) // DSRL
2775             {
2776               emit_shrdimm(sl,sh,imm[i],tl);
2777               if(th>=0) emit_shrimm(sh,imm[i],th);
2778             }
2779             if(opcode2[i]==0x3b) // DSRA
2780             {
2781               emit_shrdimm(sl,sh,imm[i],tl);
2782               if(th>=0) emit_sarimm(sh,imm[i],th);
2783             }
2784           }else{
2785             // Shift by zero
2786             if(sl!=tl) emit_mov(sl,tl);
2787             if(th>=0&&sh!=th) emit_mov(sh,th);
2788           }
2789         }
2790       }
2791     }
2792   }
2793   if(opcode2[i]==0x3c) // DSLL32
2794   {
2795     if(rt1[i]) {
2796       signed char sl,tl,th;
2797       tl=get_reg(i_regs->regmap,rt1[i]);
2798       th=get_reg(i_regs->regmap,rt1[i]|64);
2799       sl=get_reg(i_regs->regmap,rs1[i]);
2800       if(th>=0||tl>=0){
2801         assert(tl>=0);
2802         assert(th>=0);
2803         assert(sl>=0);
2804         emit_mov(sl,th);
2805         emit_zeroreg(tl);
2806         if(imm[i]>32)
2807         {
2808           emit_shlimm(th,imm[i]&31,th);
2809         }
2810       }
2811     }
2812   }
2813   if(opcode2[i]==0x3e) // DSRL32
2814   {
2815     if(rt1[i]) {
2816       signed char sh,tl,th;
2817       tl=get_reg(i_regs->regmap,rt1[i]);
2818       th=get_reg(i_regs->regmap,rt1[i]|64);
2819       sh=get_reg(i_regs->regmap,rs1[i]|64);
2820       if(tl>=0){
2821         assert(sh>=0);
2822         emit_mov(sh,tl);
2823         if(th>=0) emit_zeroreg(th);
2824         if(imm[i]>32)
2825         {
2826           emit_shrimm(tl,imm[i]&31,tl);
2827         }
2828       }
2829     }
2830   }
2831   if(opcode2[i]==0x3f) // DSRA32
2832   {
2833     if(rt1[i]) {
2834       signed char sh,tl;
2835       tl=get_reg(i_regs->regmap,rt1[i]);
2836       sh=get_reg(i_regs->regmap,rs1[i]|64);
2837       if(tl>=0){
2838         assert(sh>=0);
2839         emit_mov(sh,tl);
2840         if(imm[i]>32)
2841         {
2842           emit_sarimm(tl,imm[i]&31,tl);
2843         }
2844       }
2845     }
2846   }
2847 }
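/* The *32 shift forms above shift by sa+32, and imm[i] evidently already
 * carries that +32 (only imm&31 is applied after the word move).  Example
 * for DSRL32 (illustrative):
 *
 *   dsrl32 $t0,$t1,8     - logical shift right by 40:
 *       emit_mov(sh,tl);          // old high word becomes the low word
 *       emit_zeroreg(th);         // new high word is zero
 *       emit_shrimm(tl,8,tl);     // remaining shift of imm&31 = 8
 */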
2848
2849 #ifndef shift_assemble
2850 void shift_assemble(int i,struct regstat *i_regs)
2851 {
2852   printf("Need shift_assemble for this architecture.\n");
2853   exit(1);
2854 }
2855 #endif
2856
2857 void load_assemble(int i,struct regstat *i_regs)
2858 {
2859   int s,th,tl,addr,map=-1;
2860   int offset;
2861   int jaddr=0;
2862   int memtarget=0,c=0;
2863   int fastload_reg_override=0;
2864   u_int hr,reglist=0;
2865   th=get_reg(i_regs->regmap,rt1[i]|64);
2866   tl=get_reg(i_regs->regmap,rt1[i]);
2867   s=get_reg(i_regs->regmap,rs1[i]);
2868   offset=imm[i];
2869   for(hr=0;hr<HOST_REGS;hr++) {
2870     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2871   }
2872   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2873   if(s>=0) {
2874     c=(i_regs->wasconst>>s)&1;
2875     if (c) {
2876       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2877       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2878     }
2879   }
2880   //printf("load_assemble: c=%d\n",c);
2881   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2882   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2883 #ifdef PCSX
2884   if(tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80)
2885     ||rt1[i]==0) {
2886       // could be FIFO, must perform the read
2887       // ||dummy read
2888       assem_debug("(forced read)\n");
2889       tl=get_reg(i_regs->regmap,-1);
2890       assert(tl>=0);
2891   }
2892 #endif
2893   if(offset||s<0||c) addr=tl;
2894   else addr=s;
2895   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2896  if(tl>=0) {
2897   //printf("load_assemble: c=%d\n",c);
2898   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2899   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2900   reglist&=~(1<<tl);
2901   if(th>=0) reglist&=~(1<<th);
2902   if(!using_tlb) {
2903     if(!c) {
2904       #ifdef RAM_OFFSET
2905       map=get_reg(i_regs->regmap,ROREG);
2906       if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2907       #endif
2908 //#define R29_HACK 1
2909       #ifdef R29_HACK
2910       // Strmnnrmn's speed hack
2911       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2912       #endif
2913       {
2914         jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2915       }
2916     }
2917   }else{ // using tlb
2918     int x=0;
2919     if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2920     if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2921     map=get_reg(i_regs->regmap,TLREG);
2922     assert(map>=0);
2923     reglist&=~(1<<map);
2924     map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2925     do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2926   }
2927   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2928   if (opcode[i]==0x20) { // LB
2929     if(!c||memtarget) {
2930       if(!dummy) {
2931         #ifdef HOST_IMM_ADDR32
2932         if(c)
2933           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2934         else
2935         #endif
2936         {
2937           //emit_xorimm(addr,3,tl);
2938           //gen_tlb_addr_r(tl,map);
2939           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2940           int x=0,a=tl;
2941 #ifdef BIG_ENDIAN_MIPS
2942           if(!c) emit_xorimm(addr,3,tl);
2943           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2944 #else
2945           if(!c) a=addr;
2946 #endif
2947           if(fastload_reg_override) a=fastload_reg_override;
2948
2949           emit_movsbl_indexed_tlb(x,a,map,tl);
2950         }
2951       }
2952       if(jaddr)
2953         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2954     }
2955     else
2956       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2957   }
2958   if (opcode[i]==0x21) { // LH
2959     if(!c||memtarget) {
2960       if(!dummy) {
2961         #ifdef HOST_IMM_ADDR32
2962         if(c)
2963           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2964         else
2965         #endif
2966         {
2967           int x=0,a=tl;
2968 #ifdef BIG_ENDIAN_MIPS
2969           if(!c) emit_xorimm(addr,2,tl);
2970           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2971 #else
2972           if(!c) a=addr;
2973 #endif
2974           if(fastload_reg_override) a=fastload_reg_override;
2975           //#ifdef
2976           //emit_movswl_indexed_tlb(x,tl,map,tl);
2977           //else
2978           if(map>=0) {
2979             gen_tlb_addr_r(a,map);
2980             emit_movswl_indexed(x,a,tl);
2981           }else{
2982             #ifdef RAM_OFFSET
2983             emit_movswl_indexed(x,a,tl);
2984             #else
2985             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2986             #endif
2987           }
2988         }
2989       }
2990       if(jaddr)
2991         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2992     }
2993     else
2994       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2995   }
2996   if (opcode[i]==0x23) { // LW
2997     if(!c||memtarget) {
2998       if(!dummy) {
2999         int a=addr;
3000         if(fastload_reg_override) a=fastload_reg_override;
3001         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
3002         #ifdef HOST_IMM_ADDR32
3003         if(c)
3004           emit_readword_tlb(constmap[i][s]+offset,map,tl);
3005         else
3006         #endif
3007         emit_readword_indexed_tlb(0,a,map,tl);
3008       }
3009       if(jaddr)
3010         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3011     }
3012     else
3013       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3014   }
3015   if (opcode[i]==0x24) { // LBU
3016     if(!c||memtarget) {
3017       if(!dummy) {
3018         #ifdef HOST_IMM_ADDR32
3019         if(c)
3020           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
3021         else
3022         #endif
3023         {
3024           //emit_xorimm(addr,3,tl);
3025           //gen_tlb_addr_r(tl,map);
3026           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
3027           int x=0,a=tl;
3028 #ifdef BIG_ENDIAN_MIPS
3029           if(!c) emit_xorimm(addr,3,tl);
3030           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3031 #else
3032           if(!c) a=addr;
3033 #endif
3034           if(fastload_reg_override) a=fastload_reg_override;
3035
3036           emit_movzbl_indexed_tlb(x,a,map,tl);
3037         }
3038       }
3039       if(jaddr)
3040         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3041     }
3042     else
3043       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3044   }
3045   if (opcode[i]==0x25) { // LHU
3046     if(!c||memtarget) {
3047       if(!dummy) {
3048         #ifdef HOST_IMM_ADDR32
3049         if(c)
3050           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
3051         else
3052         #endif
3053         {
3054           int x=0,a=tl;
3055 #ifdef BIG_ENDIAN_MIPS
3056           if(!c) emit_xorimm(addr,2,tl);
3057           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3058 #else
3059           if(!c) a=addr;
3060 #endif
3061           if(fastload_reg_override) a=fastload_reg_override;
3062           //#ifdef
3063           //emit_movzwl_indexed_tlb(x,tl,map,tl);
3064           //#else
3065           if(map>=0) {
3066             gen_tlb_addr_r(a,map);
3067             emit_movzwl_indexed(x,a,tl);
3068           }else{
3069             #ifdef RAM_OFFSET
3070             emit_movzwl_indexed(x,a,tl);
3071             #else
3072             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
3073             #endif
3074           }
3075         }
3076       }
3077       if(jaddr)
3078         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3079     }
3080     else
3081       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3082   }
3083   if (opcode[i]==0x27) { // LWU
3084     assert(th>=0);
3085     if(!c||memtarget) {
3086       if(!dummy) {
3087         int a=addr;
3088         if(fastload_reg_override) a=fastload_reg_override;
3089         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
3090         #ifdef HOST_IMM_ADDR32
3091         if(c)
3092           emit_readword_tlb(constmap[i][s]+offset,map,tl);
3093         else
3094         #endif
3095         emit_readword_indexed_tlb(0,a,map,tl);
3096       }
3097       if(jaddr)
3098         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3099     }
3100     else {
3101       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3102     }
3103     emit_zeroreg(th);
3104   }
3105   if (opcode[i]==0x37) { // LD
3106     if(!c||memtarget) {
3107       if(!dummy) {
3108         int a=addr;
3109         if(fastload_reg_override) a=fastload_reg_override;
3110         //gen_tlb_addr_r(tl,map);
3111         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
3112         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
3113         #ifdef HOST_IMM_ADDR32
3114         if(c)
3115           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3116         else
3117         #endif
3118         emit_readdword_indexed_tlb(0,a,map,th,tl);
3119       }
3120       if(jaddr)
3121         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3122     }
3123     else
3124       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3125   }
3126  }
3127   //emit_storereg(rt1[i],tl); // DEBUG
3128   //if(opcode[i]==0x23)
3129   //if(opcode[i]==0x24)
3130   //if(opcode[i]==0x23||opcode[i]==0x24)
3131   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
3132   {
3133     //emit_pusha();
3134     save_regs(0x100f);
3135         emit_readword((int)&last_count,ECX);
3136         #ifdef __i386__
3137         if(get_reg(i_regs->regmap,CCREG)<0)
3138           emit_loadreg(CCREG,HOST_CCREG);
3139         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3140         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3141         emit_writeword(HOST_CCREG,(int)&Count);
3142         #endif
3143         #ifdef __arm__
3144         if(get_reg(i_regs->regmap,CCREG)<0)
3145           emit_loadreg(CCREG,0);
3146         else
3147           emit_mov(HOST_CCREG,0);
3148         emit_add(0,ECX,0);
3149         emit_addimm(0,2*ccadj[i],0);
3150         emit_writeword(0,(int)&Count);
3151         #endif
3152     emit_call((int)memdebug);
3153     //emit_popa();
3154     restore_regs(0x100f);
3155   }/**/
3156 }
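/* Summary of the three code paths above, taking LB as the example (non-TLB
 * case, illustrative):
 *
 *   lb $v0, 0($t0)
 *     base not constant:       emit_fastpath_cmp_jump() range check, inline
 *                              emit_movsbl_indexed_tlb() fast path, plus
 *                              add_stub(LOADB_STUB,...) for the slow path
 *     constant base in RAM:    fast path only, no stub needed
 *     constant base not RAM:   inline_readstub(LOADB_STUB,...) only, no
 *                              inline access at all
 */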
3157
3158 #ifndef loadlr_assemble
3159 void loadlr_assemble(int i,struct regstat *i_regs)
3160 {
3161   printf("Need loadlr_assemble for this architecture.\n");
3162   exit(1);
3163 }
3164 #endif
3165
3166 void store_assemble(int i,struct regstat *i_regs)
3167 {
3168   int s,th,tl,map=-1;
3169   int addr,temp;
3170   int offset;
3171   int jaddr=0,jaddr2,type;
3172   int memtarget=0,c=0;
3173   int agr=AGEN1+(i&1);
3174   int faststore_reg_override=0;
3175   u_int hr,reglist=0;
3176   th=get_reg(i_regs->regmap,rs2[i]|64);
3177   tl=get_reg(i_regs->regmap,rs2[i]);
3178   s=get_reg(i_regs->regmap,rs1[i]);
3179   temp=get_reg(i_regs->regmap,agr);
3180   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3181   offset=imm[i];
3182   if(s>=0) {
3183     c=(i_regs->wasconst>>s)&1;
3184     if(c) {
3185       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3186       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3187     }
3188   }
3189   assert(tl>=0);
3190   assert(temp>=0);
3191   for(hr=0;hr<HOST_REGS;hr++) {
3192     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3193   }
3194   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3195   if(offset||s<0||c) addr=temp;
3196   else addr=s;
3197   if(!using_tlb) {
3198     if(!c) {
3199       #ifndef PCSX
3200       #ifdef R29_HACK
3201       // Strmnnrmn's speed hack
3202       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3203       #endif
3204       emit_cmpimm(addr,RAM_SIZE);
3205       #ifdef DESTRUCTIVE_SHIFT
3206       if(s==addr) emit_mov(s,temp);
3207       #endif
3208       #ifdef R29_HACK
3209       memtarget=1;
3210       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3211       #endif
3212       {
3213         jaddr=(int)out;
3214         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
3215         // Hint to branch predictor that the branch is unlikely to be taken
3216         if(rs1[i]>=28)
3217           emit_jno_unlikely(0);
3218         else
3219         #endif
3220         emit_jno(0);
3221       }
3222       #else
3223         jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
3224       #endif
3225     }
3226   }else{ // using tlb
3227     int x=0;
3228     if (opcode[i]==0x28) x=3; // SB
3229     if (opcode[i]==0x29) x=2; // SH
3230     map=get_reg(i_regs->regmap,TLREG);
3231     assert(map>=0);
3232     reglist&=~(1<<map);
3233     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
3234     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3235   }
3236
3237   if (opcode[i]==0x28) { // SB
3238     if(!c||memtarget) {
3239       int x=0,a=temp;
3240 #ifdef BIG_ENDIAN_MIPS
3241       if(!c) emit_xorimm(addr,3,temp);
3242       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3243 #else
3244       if(!c) a=addr;
3245 #endif
3246       if(faststore_reg_override) a=faststore_reg_override;
3247       //gen_tlb_addr_w(temp,map);
3248       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3249       emit_writebyte_indexed_tlb(tl,x,a,map,a);
3250     }
3251     type=STOREB_STUB;
3252   }
3253   if (opcode[i]==0x29) { // SH
3254     if(!c||memtarget) {
3255       int x=0,a=temp;
3256 #ifdef BIG_ENDIAN_MIPS
3257       if(!c) emit_xorimm(addr,2,temp);
3258       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3259 #else
3260       if(!c) a=addr;
3261 #endif
3262       if(faststore_reg_override) a=faststore_reg_override;
3263       //#ifdef
3264       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3265       //#else
3266       if(map>=0) {
3267         gen_tlb_addr_w(a,map);
3268         emit_writehword_indexed(tl,x,a);
3269       }else
3270         emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
3271     }
3272     type=STOREH_STUB;
3273   }
3274   if (opcode[i]==0x2B) { // SW
3275     if(!c||memtarget) {
3276       int a=addr;
3277       if(faststore_reg_override) a=faststore_reg_override;
3278       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3279       emit_writeword_indexed_tlb(tl,0,a,map,temp);
3280     }
3281     type=STOREW_STUB;
3282   }
3283   if (opcode[i]==0x3F) { // SD
3284     if(!c||memtarget) {
3285       int a=addr;
3286       if(faststore_reg_override) a=faststore_reg_override;
3287       if(rs2[i]) {
3288         assert(th>=0);
3289         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3290         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3291         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
3292       }else{
3293         // Store zero
3294         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3295         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3296         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
3297       }
3298     }
3299     type=STORED_STUB;
3300   }
3301 #ifdef PCSX
3302   if(jaddr) {
3303     // PCSX store handlers don't check invcode again
3304     reglist|=1<<addr;
3305     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3306     jaddr=0;
3307   }
3308 #endif
3309   if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3310     if(!c||memtarget) {
3311       #ifdef DESTRUCTIVE_SHIFT
3312       // The x86 shift operation is 'destructive'; it overwrites the
3313       // source register, so we need to make a copy first and use that.
3314       addr=temp;
3315       #endif
3316       #if defined(HOST_IMM8)
3317       int ir=get_reg(i_regs->regmap,INVCP);
3318       assert(ir>=0);
3319       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3320       #else
3321       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3322       #endif
3323       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3324       emit_callne(invalidate_addr_reg[addr]);
3325       #else
3326       jaddr2=(int)out;
3327       emit_jne(0);
3328       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3329       #endif
3330     }
3331   }
3332   if(jaddr) {
3333     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3334   } else if(c&&!memtarget) {
3335     inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
3336   }
3337   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3338   //if(opcode[i]==0x2B || opcode[i]==0x28)
3339   //if(opcode[i]==0x2B || opcode[i]==0x29)
3340   //if(opcode[i]==0x2B)
3341   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3342   {
3343     #ifdef __i386__
3344     emit_pusha();
3345     #endif
3346     #ifdef __arm__
3347     save_regs(0x100f);
3348     #endif
3349         emit_readword((int)&last_count,ECX);
3350         #ifdef __i386__
3351         if(get_reg(i_regs->regmap,CCREG)<0)
3352           emit_loadreg(CCREG,HOST_CCREG);
3353         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3354         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3355         emit_writeword(HOST_CCREG,(int)&Count);
3356         #endif
3357         #ifdef __arm__
3358         if(get_reg(i_regs->regmap,CCREG)<0)
3359           emit_loadreg(CCREG,0);
3360         else
3361           emit_mov(HOST_CCREG,0);
3362         emit_add(0,ECX,0);
3363         emit_addimm(0,2*ccadj[i],0);
3364         emit_writeword(0,(int)&Count);
3365         #endif
3366     emit_call((int)memdebug);
3367     #ifdef __i386__
3368     emit_popa();
3369     #endif
3370     #ifdef __arm__
3371     restore_regs(0x100f);
3372     #endif
3373   }/**/
3374 }
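/* The invalid_code check above is, in pseudo-C, roughly (invalidate_addr is
 * a placeholder name for whatever INVCODE_STUB/invalidate_addr_reg[] ends up
 * calling):
 *
 *   if (invalid_code[addr >> 12] != 1)
 *       invalidate_addr(addr);   // page holds compiled code -> throw it out
 *
 * It is skipped when waswritten already has the base register's bit set or
 * when NDHACK_NO_SMC_CHECK is enabled (see the condition above).
 */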
3375
3376 void storelr_assemble(int i,struct regstat *i_regs)
3377 {
3378   int s,th,tl;
3379   int temp;
3380   int temp2;
3381   int offset;
3382   int jaddr=0,jaddr2;
3383   int case1,case2,case3;
3384   int done0,done1,done2;
3385   int memtarget=0,c=0;
3386   int agr=AGEN1+(i&1);
3387   u_int hr,reglist=0;
3388   th=get_reg(i_regs->regmap,rs2[i]|64);
3389   tl=get_reg(i_regs->regmap,rs2[i]);
3390   s=get_reg(i_regs->regmap,rs1[i]);
3391   temp=get_reg(i_regs->regmap,agr);
3392   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3393   offset=imm[i];
3394   if(s>=0) {
3395     c=(i_regs->isconst>>s)&1;
3396     if(c) {
3397       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3398       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3399     }
3400   }
3401   assert(tl>=0);
3402   for(hr=0;hr<HOST_REGS;hr++) {
3403     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3404   }
3405   assert(temp>=0);
3406   if(!using_tlb) {
3407     if(!c) {
3408       emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3409       if(!offset&&s!=temp) emit_mov(s,temp);
3410       jaddr=(int)out;
3411       emit_jno(0);
3412     }
3413     else
3414     {
3415       if(!memtarget||!rs1[i]) {
3416         jaddr=(int)out;
3417         emit_jmp(0);
3418       }
3419     }
3420     #ifdef RAM_OFFSET
3421     int map=get_reg(i_regs->regmap,ROREG);
3422     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3423     gen_tlb_addr_w(temp,map);
3424     #else
3425     if((u_int)rdram!=0x80000000) 
3426       emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3427     #endif
3428   }else{ // using tlb
3429     int map=get_reg(i_regs->regmap,TLREG);
3430     assert(map>=0);
3431     reglist&=~(1<<map);
3432     map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3433     if(!c&&!offset&&s>=0) emit_mov(s,temp);
3434     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3435     if(!jaddr&&!memtarget) {
3436       jaddr=(int)out;
3437       emit_jmp(0);
3438     }
3439     gen_tlb_addr_w(temp,map);
3440   }
3441
3442   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3443     temp2=get_reg(i_regs->regmap,FTEMP);
3444     if(!rs2[i]) temp2=th=tl;
3445   }
3446
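  // The four cases below assume big-endian MIPS byte numbering; when
  // BIG_ENDIAN_MIPS is not defined the low address bits are flipped with
  // XOR 3 so the same byte-lane selection applies to the little-endian
  // layout. Dispatch is on the low two bits of the (adjusted) address.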
3447 #ifndef BIG_ENDIAN_MIPS
3448     emit_xorimm(temp,3,temp);
3449 #endif
3450   emit_testimm(temp,2);
3451   case2=(int)out;
3452   emit_jne(0);
3453   emit_testimm(temp,1);
3454   case1=(int)out;
3455   emit_jne(0);
3456   // 0
3457   if (opcode[i]==0x2A) { // SWL
3458     emit_writeword_indexed(tl,0,temp);
3459   }
3460   if (opcode[i]==0x2E) { // SWR
3461     emit_writebyte_indexed(tl,3,temp);
3462   }
3463   if (opcode[i]==0x2C) { // SDL
3464     emit_writeword_indexed(th,0,temp);
3465     if(rs2[i]) emit_mov(tl,temp2);
3466   }
3467   if (opcode[i]==0x2D) { // SDR
3468     emit_writebyte_indexed(tl,3,temp);
3469     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3470   }
3471   done0=(int)out;
3472   emit_jmp(0);
3473   // 1
3474   set_jump_target(case1,(int)out);
3475   if (opcode[i]==0x2A) { // SWL
3476     // Write 3 msb into three least significant bytes
3477     if(rs2[i]) emit_rorimm(tl,8,tl);
3478     emit_writehword_indexed(tl,-1,temp);
3479     if(rs2[i]) emit_rorimm(tl,16,tl);
3480     emit_writebyte_indexed(tl,1,temp);
3481     if(rs2[i]) emit_rorimm(tl,8,tl);
3482   }
3483   if (opcode[i]==0x2E) { // SWR
3484     // Write two lsb into two most significant bytes
3485     emit_writehword_indexed(tl,1,temp);
3486   }
3487   if (opcode[i]==0x2C) { // SDL
3488     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3489     // Write 3 msb into three least significant bytes
3490     if(rs2[i]) emit_rorimm(th,8,th);
3491     emit_writehword_indexed(th,-1,temp);
3492     if(rs2[i]) emit_rorimm(th,16,th);
3493     emit_writebyte_indexed(th,1,temp);
3494     if(rs2[i]) emit_rorimm(th,8,th);
3495   }
3496   if (opcode[i]==0x2D) { // SDR
3497     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3498     // Write two lsb into two most significant bytes
3499     emit_writehword_indexed(tl,1,temp);
3500   }
3501   done1=(int)out;
3502   emit_jmp(0);
3503   // 2
3504   set_jump_target(case2,(int)out);
3505   emit_testimm(temp,1);
3506   case3=(int)out;
3507   emit_jne(0);
3508   if (opcode[i]==0x2A) { // SWL
3509     // Write two msb into two least significant bytes
3510     if(rs2[i]) emit_rorimm(tl,16,tl);
3511     emit_writehword_indexed(tl,-2,temp);
3512     if(rs2[i]) emit_rorimm(tl,16,tl);
3513   }
3514   if (opcode[i]==0x2E) { // SWR
3515     // Write 3 lsb into three most significant bytes
3516     emit_writebyte_indexed(tl,-1,temp);
3517     if(rs2[i]) emit_rorimm(tl,8,tl);
3518     emit_writehword_indexed(tl,0,temp);
3519     if(rs2[i]) emit_rorimm(tl,24,tl);
3520   }
3521   if (opcode[i]==0x2C) { // SDL
3522     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3523     // Write two msb into two least significant bytes
3524     if(rs2[i]) emit_rorimm(th,16,th);
3525     emit_writehword_indexed(th,-2,temp);
3526     if(rs2[i]) emit_rorimm(th,16,th);
3527   }
3528   if (opcode[i]==0x2D) { // SDR
3529     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3530     // Write 3 lsb into three most significant bytes
3531     emit_writebyte_indexed(tl,-1,temp);
3532     if(rs2[i]) emit_rorimm(tl,8,tl);
3533     emit_writehword_indexed(tl,0,temp);
3534     if(rs2[i]) emit_rorimm(tl,24,tl);
3535   }
3536   done2=(int)out;
3537   emit_jmp(0);
3538   // 3
3539   set_jump_target(case3,(int)out);
3540   if (opcode[i]==0x2A) { // SWL
3541     // Write msb into least significant byte
3542     if(rs2[i]) emit_rorimm(tl,24,tl);
3543     emit_writebyte_indexed(tl,-3,temp);
3544     if(rs2[i]) emit_rorimm(tl,8,tl);
3545   }
3546   if (opcode[i]==0x2E) { // SWR
3547     // Write entire word
3548     emit_writeword_indexed(tl,-3,temp);
3549   }
3550   if (opcode[i]==0x2C) { // SDL
3551     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3552     // Write msb into least significant byte
3553     if(rs2[i]) emit_rorimm(th,24,th);
3554     emit_writebyte_indexed(th,-3,temp);
3555     if(rs2[i]) emit_rorimm(th,8,th);
3556   }
3557   if (opcode[i]==0x2D) { // SDR
3558     if(rs2[i]) emit_mov(th,temp2);
3559     // Write entire word
3560     emit_writeword_indexed(tl,-3,temp);
3561   }
3562   set_jump_target(done0,(int)out);
3563   set_jump_target(done1,(int)out);
3564   set_jump_target(done2,(int)out);
3565   if (opcode[i]==0x2C) { // SDL
3566     emit_testimm(temp,4);
3567     done0=(int)out;
3568     emit_jne(0);
3569     emit_andimm(temp,~3,temp);
3570     emit_writeword_indexed(temp2,4,temp);
3571     set_jump_target(done0,(int)out);
3572   }
3573   if (opcode[i]==0x2D) { // SDR
3574     emit_testimm(temp,4);
3575     done0=(int)out;
3576     emit_jeq(0);
3577     emit_andimm(temp,~3,temp);
3578     emit_writeword_indexed(temp2,-4,temp);
3579     set_jump_target(done0,(int)out);
3580   }
3581   if(!c||!memtarget)
3582     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3583   if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3584     #ifdef RAM_OFFSET
3585     int map=get_reg(i_regs->regmap,ROREG);
3586     if(map<0) map=HOST_TEMPREG;
3587     gen_orig_addr_w(temp,map);
3588     #else
3589     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3590     #endif
3591     #if defined(HOST_IMM8)
3592     int ir=get_reg(i_regs->regmap,INVCP);
3593     assert(ir>=0);
3594     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3595     #else
3596     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3597     #endif
3598     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3599     emit_callne(invalidate_addr_reg[temp]);
3600     #else
3601     jaddr2=(int)out;
3602     emit_jne(0);
3603     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3604     #endif
3605   }
3606   /*
3607     emit_pusha();
3608     //save_regs(0x100f);
3609         emit_readword((int)&last_count,ECX);
3610         if(get_reg(i_regs->regmap,CCREG)<0)
3611           emit_loadreg(CCREG,HOST_CCREG);
3612         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3613         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3614         emit_writeword(HOST_CCREG,(int)&Count);
3615     emit_call((int)memdebug);
3616     emit_popa();
3617     //restore_regs(0x100f);
3618   /**/
3619 }
3620
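// c1ls_assemble: LWC1/LDC1/SWC1/SDC1 (FPU loads and stores). When
// DISABLE_COP1 is defined the whole body is compiled out and the opcode
// is handled as coprocessor-unusable via cop1_unusable().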
3621 void c1ls_assemble(int i,struct regstat *i_regs)
3622 {
3623 #ifndef DISABLE_COP1
3624   int s,th,tl;
3625   int temp,ar;
3626   int map=-1;
3627   int offset;
3628   int c=0;
3629   int jaddr,jaddr2=0,jaddr3,type;
3630   int agr=AGEN1+(i&1);
3631   u_int hr,reglist=0;
3632   th=get_reg(i_regs->regmap,FTEMP|64);
3633   tl=get_reg(i_regs->regmap,FTEMP);
3634   s=get_reg(i_regs->regmap,rs1[i]);
3635   temp=get_reg(i_regs->regmap,agr);
3636   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3637   offset=imm[i];
3638   assert(tl>=0);
3639   assert(rs1[i]>0);
3640   assert(temp>=0);
3641   for(hr=0;hr<HOST_REGS;hr++) {
3642     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3643   }
3644   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3645   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3646   {
3647     // Loads use a temporary register which we need to save
3648     reglist|=1<<temp;
3649   }
3650   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3651     ar=temp;
3652   else // LWC1/LDC1
3653     ar=tl;
3654   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3655   //else c=(i_regs->wasconst>>s)&1;
3656   if(s>=0) c=(i_regs->wasconst>>s)&1;
3657   // Check cop1 unusable
3658   if(!cop1_usable) {
3659     signed char rs=get_reg(i_regs->regmap,CSREG);
3660     assert(rs>=0);
3661     emit_testimm(rs,0x20000000);
3662     jaddr=(int)out;
3663     emit_jeq(0);
3664     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3665     cop1_usable=1;
3666   }
3667   if (opcode[i]==0x39) { // SWC1 (get float address)
3668     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3669   }
3670   if (opcode[i]==0x3D) { // SDC1 (get double address)
3671     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3672   }
3673   // Generate address + offset
3674   if(!using_tlb) {
3675     if(!c)
3676       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3677   }
3678   else
3679   {
3680     map=get_reg(i_regs->regmap,TLREG);
3681     assert(map>=0);
3682     reglist&=~(1<<map);
3683     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3684       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3685     }
3686     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3687       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3688     }
3689   }
3690   if (opcode[i]==0x39) { // SWC1 (read float)
3691     emit_readword_indexed(0,tl,tl);
3692   }
3693   if (opcode[i]==0x3D) { // SDC1 (read double)
3694     emit_readword_indexed(4,tl,th);
3695     emit_readword_indexed(0,tl,tl);
3696   }
3697   if (opcode[i]==0x31) { // LWC1 (get target address)
3698     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3699   }
3700   if (opcode[i]==0x35) { // LDC1 (get target address)
3701     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3702   }
3703   if(!using_tlb) {
3704     if(!c) {
3705       jaddr2=(int)out;
3706       emit_jno(0);
3707     }
3708     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80000000+RAM_SIZE) {
3709       jaddr2=(int)out;
3710       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3711     }
3712     #ifdef DESTRUCTIVE_SHIFT
3713     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3714       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3715     }
3716     #endif
3717   }else{
3718     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3719       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3720     }
3721     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3722       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3723     }
3724   }
3725   if (opcode[i]==0x31) { // LWC1
3726     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3727     //gen_tlb_addr_r(ar,map);
3728     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3729     #ifdef HOST_IMM_ADDR32
3730     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3731     else
3732     #endif
3733     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3734     type=LOADW_STUB;
3735   }
3736   if (opcode[i]==0x35) { // LDC1
3737     assert(th>=0);
3738     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3739     //gen_tlb_addr_r(ar,map);
3740     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3741     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3742     #ifdef HOST_IMM_ADDR32
3743     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3744     else
3745     #endif
3746     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3747     type=LOADD_STUB;
3748   }
3749   if (opcode[i]==0x39) { // SWC1
3750     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3751     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3752     type=STOREW_STUB;
3753   }
3754   if (opcode[i]==0x3D) { // SDC1
3755     assert(th>=0);
3756     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3757     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3758     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3759     type=STORED_STUB;
3760   }
3761   if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3762     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3763       #ifndef DESTRUCTIVE_SHIFT
3764       temp=offset||c||s<0?ar:s;
3765       #endif
3766       #if defined(HOST_IMM8)
3767       int ir=get_reg(i_regs->regmap,INVCP);
3768       assert(ir>=0);
3769       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3770       #else
3771       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3772       #endif
3773       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3774       emit_callne(invalidate_addr_reg[temp]);
3775       #else
3776       jaddr3=(int)out;
3777       emit_jne(0);
3778       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3779       #endif
3780     }
3781   }
3782   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3783   if (opcode[i]==0x31) { // LWC1 (write float)
3784     emit_writeword_indexed(tl,0,temp);
3785   }
3786   if (opcode[i]==0x35) { // LDC1 (write double)
3787     emit_writeword_indexed(th,4,temp);
3788     emit_writeword_indexed(tl,0,temp);
3789   }
3790   //if(opcode[i]==0x39)
3791   /*if(opcode[i]==0x39||opcode[i]==0x31)
3792   {
3793     emit_pusha();
3794         emit_readword((int)&last_count,ECX);
3795         if(get_reg(i_regs->regmap,CCREG)<0)
3796           emit_loadreg(CCREG,HOST_CCREG);
3797         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3798         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3799         emit_writeword(HOST_CCREG,(int)&Count);
3800     emit_call((int)memdebug);
3801     emit_popa();
3802   }/**/
3803 #else
3804   cop1_unusable(i, i_regs);
3805 #endif
3806 }
3807
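// c2ls_assemble: LWC2/SWC2, i.e. GTE data register loads and stores.
// The GTE register is shuttled through FTEMP with cop2_get_dreg /
// cop2_put_dreg; a fast-path RAM access is emitted and addresses that
// miss it fall back to a LOADW_STUB/STOREW_STUB.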
3808 void c2ls_assemble(int i,struct regstat *i_regs)
3809 {
3810   int s,tl;
3811   int ar;
3812   int offset;
3813   int memtarget=0,c=0;
3814   int jaddr2=0,jaddr3,type;
3815   int agr=AGEN1+(i&1);
3816   int fastio_reg_override=0;
3817   u_int hr,reglist=0;
3818   u_int copr=(source[i]>>16)&0x1f;
3819   s=get_reg(i_regs->regmap,rs1[i]);
3820   tl=get_reg(i_regs->regmap,FTEMP);
3821   offset=imm[i];
3822   assert(rs1[i]>0);
3823   assert(tl>=0);
3824   assert(!using_tlb);
3825
3826   for(hr=0;hr<HOST_REGS;hr++) {
3827     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3828   }
3829   if(i_regs->regmap[HOST_CCREG]==CCREG)
3830     reglist&=~(1<<HOST_CCREG);
3831
3832   // get the address
3833   if (opcode[i]==0x3a) { // SWC2
3834     ar=get_reg(i_regs->regmap,agr);
3835     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3836     reglist|=1<<ar;
3837   } else { // LWC2
3838     ar=tl;
3839   }
3840   if(s>=0) c=(i_regs->wasconst>>s)&1;
3841   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3842   if (!offset&&!c&&s>=0) ar=s;
3843   assert(ar>=0);
3844
3845   if (opcode[i]==0x3a) { // SWC2
3846     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3847     type=STOREW_STUB;
3848   }
3849   else
3850     type=LOADW_STUB;
3851
3852   if(c&&!memtarget) {
3853     jaddr2=(int)out;
3854     emit_jmp(0); // inline_readstub/inline_writestub?
3855   }
3856   else {
3857     if(!c) {
3858       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3859     }
3860     if (opcode[i]==0x32) { // LWC2
3861       #ifdef HOST_IMM_ADDR32
3862       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3863       else
3864       #endif
3865       int a=ar;
3866       if(fastio_reg_override) a=fastio_reg_override;
3867       emit_readword_indexed(0,a,tl);
3868     }
3869     if (opcode[i]==0x3a) { // SWC2
3870       #ifdef DESTRUCTIVE_SHIFT
3871       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3872       #endif
3873       int a=ar;
3874       if(fastio_reg_override) a=fastio_reg_override;
3875       emit_writeword_indexed(tl,0,a);
3876     }
3877   }
3878   if(jaddr2)
3879     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3880   if(opcode[i]==0x3a) // SWC2
3881   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3882 #if defined(HOST_IMM8)
3883     int ir=get_reg(i_regs->regmap,INVCP);
3884     assert(ir>=0);
3885     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3886 #else
3887     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3888 #endif
3889     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3890     emit_callne(invalidate_addr_reg[ar]);
3891     #else
3892     jaddr3=(int)out;
3893     emit_jne(0);
3894     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3895     #endif
3896   }
3897   if (opcode[i]==0x32) { // LWC2
3898     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3899   }
3900 }
3901
3902 #ifndef multdiv_assemble
3903 void multdiv_assemble(int i,struct regstat *i_regs)
3904 {
3905   printf("Need multdiv_assemble for this architecture.\n");
3906   exit(1);
3907 }
3908 #endif
3909
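// mov_assemble: MFHI/MFLO/MTHI/MTLO, implemented as plain register-to-
// register moves (see the commented-out opcode checks below).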
3910 void mov_assemble(int i,struct regstat *i_regs)
3911 {
3912   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3913   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3914   if(rt1[i]) {
3915     signed char sh,sl,th,tl;
3916     th=get_reg(i_regs->regmap,rt1[i]|64);
3917     tl=get_reg(i_regs->regmap,rt1[i]);
3918     //assert(tl>=0);
3919     if(tl>=0) {
3920       sh=get_reg(i_regs->regmap,rs1[i]|64);
3921       sl=get_reg(i_regs->regmap,rs1[i]);
3922       if(sl>=0) emit_mov(sl,tl);
3923       else emit_loadreg(rs1[i],tl);
3924       if(th>=0) {
3925         if(sh>=0) emit_mov(sh,th);
3926         else emit_loadreg(rs1[i]|64,th);
3927       }
3928     }
3929   }
3930 }
3931
3932 #ifndef fconv_assemble
3933 void fconv_assemble(int i,struct regstat *i_regs)
3934 {
3935   printf("Need fconv_assemble for this architecture.\n");
3936   exit(1);
3937 }
3938 #endif
3939
3940 #if 0
3941 void float_assemble(int i,struct regstat *i_regs)
3942 {
3943   printf("Need float_assemble for this architecture.\n");
3944   exit(1);
3945 }
3946 #endif
3947
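// syscall/hlecall/intcall assembly: each loads the appropriate PC and the
// adjusted cycle count, then jumps out of the translated code to its
// handler (jump_syscall_hle, jump_hlecall or jump_intcall).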
3948 void syscall_assemble(int i,struct regstat *i_regs)
3949 {
3950   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3951   assert(ccreg==HOST_CCREG);
3952   assert(!is_delayslot);
3953   emit_movimm(start+i*4,EAX); // Get PC
3954   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3955   emit_jmp((int)jump_syscall_hle); // XXX
3956 }
3957
3958 void hlecall_assemble(int i,struct regstat *i_regs)
3959 {
3960   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3961   assert(ccreg==HOST_CCREG);
3962   assert(!is_delayslot);
3963   emit_movimm(start+i*4+4,0); // Get PC
3964   emit_movimm((int)psxHLEt[source[i]&7],1);
3965   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3966   emit_jmp((int)jump_hlecall);
3967 }
3968
3969 void intcall_assemble(int i,struct regstat *i_regs)
3970 {
3971   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3972   assert(ccreg==HOST_CCREG);
3973   assert(!is_delayslot);
3974   emit_movimm(start+i*4,0); // Get PC
3975   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3976   emit_jmp((int)jump_intcall);
3977 }
3978
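// ds_assemble: assembles the instruction occupying a branch delay slot.
// is_delayslot is set around the dispatch so the per-type assemblers take
// their delay-slot paths; a jump type ending up here indicates a bug.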
3979 void ds_assemble(int i,struct regstat *i_regs)
3980 {
3981   speculate_register_values(i);
3982   is_delayslot=1;
3983   switch(itype[i]) {
3984     case ALU:
3985       alu_assemble(i,i_regs);break;
3986     case IMM16:
3987       imm16_assemble(i,i_regs);break;
3988     case SHIFT:
3989       shift_assemble(i,i_regs);break;
3990     case SHIFTIMM:
3991       shiftimm_assemble(i,i_regs);break;
3992     case LOAD:
3993       load_assemble(i,i_regs);break;
3994     case LOADLR:
3995       loadlr_assemble(i,i_regs);break;
3996     case STORE:
3997       store_assemble(i,i_regs);break;
3998     case STORELR:
3999       storelr_assemble(i,i_regs);break;
4000     case COP0:
4001       cop0_assemble(i,i_regs);break;
4002     case COP1:
4003       cop1_assemble(i,i_regs);break;
4004     case C1LS:
4005       c1ls_assemble(i,i_regs);break;
4006     case COP2:
4007       cop2_assemble(i,i_regs);break;
4008     case C2LS:
4009       c2ls_assemble(i,i_regs);break;
4010     case C2OP:
4011       c2op_assemble(i,i_regs);break;
4012     case FCONV:
4013       fconv_assemble(i,i_regs);break;
4014     case FLOAT:
4015       float_assemble(i,i_regs);break;
4016     case FCOMP:
4017       fcomp_assemble(i,i_regs);break;
4018     case MULTDIV:
4019       multdiv_assemble(i,i_regs);break;
4020     case MOV:
4021       mov_assemble(i,i_regs);break;
4022     case SYSCALL:
4023     case HLECALL:
4024     case INTCALL:
4025     case SPAN:
4026     case UJUMP:
4027     case RJUMP:
4028     case CJUMP:
4029     case SJUMP:
4030     case FJUMP:
4031       printf("Jump in the delay slot.  This is probably a bug.\n");
4032   }
4033   is_delayslot=0;
4034 }
4035
4036 // Is the branch target a valid internal jump?
4037 int internal_branch(uint64_t i_is32,int addr)
4038 {
4039   if(addr&1) return 0; // Indirect (register) jump
4040   if(addr>=start && addr<start+slen*4-4)
4041   {
4042     int t=(addr-start)>>2;
4043     // Delay slots are not valid branch targets
4044     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4045     // 64 -> 32 bit transition requires a recompile
4046     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
4047     {
4048       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
4049       else printf("optimizable: yes\n");
4050     }*/
4051     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4052 #ifndef FORCE32
4053     if(requires_32bit[t]&~i_is32) return 0;
4054     else
4055 #endif
4056       return 1;
4057   }
4058   return 0;
4059 }
4060
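// wb_invalidate: reconciles a previous register map (pre) with a new one
// (entry). Dirty MIPS registers that are no longer mapped anywhere are
// written back to memory (honouring the unneeded masks u/uu), and values
// that simply moved to a different host register are copied with a mov
// instead of being reloaded.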
4061 #ifndef wb_invalidate
4062 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
4063   uint64_t u,uint64_t uu)
4064 {
4065   int hr;
4066   for(hr=0;hr<HOST_REGS;hr++) {
4067     if(hr!=EXCLUDE_REG) {
4068       if(pre[hr]!=entry[hr]) {
4069         if(pre[hr]>=0) {
4070           if((dirty>>hr)&1) {
4071             if(get_reg(entry,pre[hr])<0) {
4072               if(pre[hr]<64) {
4073                 if(!((u>>pre[hr])&1)) {
4074                   emit_storereg(pre[hr],hr);
4075                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
4076                     emit_sarimm(hr,31,hr);
4077                     emit_storereg(pre[hr]|64,hr);
4078                   }
4079                 }
4080               }else{
4081                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
4082                   emit_storereg(pre[hr],hr);
4083                 }
4084               }
4085             }
4086           }
4087         }
4088       }
4089     }
4090   }
4091   // Move from one register to another (no writeback)
4092   for(hr=0;hr<HOST_REGS;hr++) {
4093     if(hr!=EXCLUDE_REG) {
4094       if(pre[hr]!=entry[hr]) {
4095         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
4096           int nr;
4097           if((nr=get_reg(entry,pre[hr]))>=0) {
4098             emit_mov(hr,nr);
4099           }
4100         }
4101       }
4102     }
4103   }
4104 }
4105 #endif
4106
4107 // Load the specified registers
4108 // This only loads the registers given as arguments because
4109 // we don't want to load things that will be overwritten
4110 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
4111 {
4112   int hr;
4113   // Load 32-bit regs
4114   for(hr=0;hr<HOST_REGS;hr++) {
4115     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4116       if(entry[hr]!=regmap[hr]) {
4117         if(regmap[hr]==rs1||regmap[hr]==rs2)
4118         {
4119           if(regmap[hr]==0) {
4120             emit_zeroreg(hr);
4121           }
4122           else
4123           {
4124             emit_loadreg(regmap[hr],hr);
4125           }
4126         }
4127       }
4128     }
4129   }
4130   // Load 64-bit regs
4131   for(hr=0;hr<HOST_REGS;hr++) {
4132     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4133       if(entry[hr]!=regmap[hr]) {
4134         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
4135         {
4136           assert(regmap[hr]!=64);
4137           if((is32>>(regmap[hr]&63))&1) {
4138             int lr=get_reg(regmap,regmap[hr]-64);
4139             if(lr>=0)
4140               emit_sarimm(lr,31,hr);
4141             else
4142               emit_loadreg(regmap[hr],hr);
4143           }
4144           else
4145           {
4146             emit_loadreg(regmap[hr],hr);
4147           }
4148         }
4149       }
4150     }
4151   }
4152 }
4153
4154 // Load registers prior to the start of a loop
4155 // so that they are not loaded within the loop
4156 static void loop_preload(signed char pre[],signed char entry[])
4157 {
4158   int hr;
4159   for(hr=0;hr<HOST_REGS;hr++) {
4160     if(hr!=EXCLUDE_REG) {
4161       if(pre[hr]!=entry[hr]) {
4162         if(entry[hr]>=0) {
4163           if(get_reg(pre,entry[hr])<0) {
4164             assem_debug("loop preload:\n");
4165             //printf("loop preload: %d\n",hr);
4166             if(entry[hr]==0) {
4167               emit_zeroreg(hr);
4168             }
4169             else if(entry[hr]<TEMPREG)
4170             {
4171               emit_loadreg(entry[hr],hr);
4172             }
4173             else if(entry[hr]-64<TEMPREG)
4174             {
4175               emit_loadreg(entry[hr],hr);
4176             }
4177           }
4178         }
4179       }
4180     }
4181   }
4182 }
4183
4184 // Generate the address for a load/store instruction.
4185 // The result goes to AGEN for stores, and to FTEMP for LOADLR and cop1/cop2 loads.
4186 void address_generation(int i,struct regstat *i_regs,signed char entry[])
4187 {
4188   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
4189     int ra=-1;
4190     int agr=AGEN1+(i&1);
4191     int mgr=MGEN1+(i&1);
4192     if(itype[i]==LOAD) {
4193       ra=get_reg(i_regs->regmap,rt1[i]);
4194       if(ra<0) ra=get_reg(i_regs->regmap,-1); 
4195       assert(ra>=0);
4196     }
4197     if(itype[i]==LOADLR) {
4198       ra=get_reg(i_regs->regmap,FTEMP);
4199     }
4200     if(itype[i]==STORE||itype[i]==STORELR) {
4201       ra=get_reg(i_regs->regmap,agr);
4202       if(ra<0) ra=get_reg(i_regs->regmap,-1);
4203     }
4204     if(itype[i]==C1LS||itype[i]==C2LS) {
4205       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
4206         ra=get_reg(i_regs->regmap,FTEMP);
4207       else { // SWC1/SDC1/SWC2/SDC2
4208         ra=get_reg(i_regs->regmap,agr);
4209         if(ra<0) ra=get_reg(i_regs->regmap,-1);
4210       }
4211     }
4212     int rs=get_reg(i_regs->regmap,rs1[i]);
4213     int rm=get_reg(i_regs->regmap,TLREG);
4214     if(ra>=0) {
4215       int offset=imm[i];
4216       int c=(i_regs->wasconst>>rs)&1;
4217       if(rs1[i]==0) {
4218         // Using r0 as a base address
4219         /*if(rm>=0) {
4220           if(!entry||entry[rm]!=mgr) {
4221             generate_map_const(offset,rm);
4222           } // else did it in the previous cycle
4223         }*/
4224         if(!entry||entry[ra]!=agr) {
4225           if (opcode[i]==0x22||opcode[i]==0x26) {
4226             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4227           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4228             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4229           }else{
4230             emit_movimm(offset,ra);
4231           }
4232         } // else did it in the previous cycle
4233       }
4234       else if(rs<0) {
4235         if(!entry||entry[ra]!=rs1[i])
4236           emit_loadreg(rs1[i],ra);
4237         //if(!entry||entry[ra]!=rs1[i])
4238         //  printf("poor load scheduling!\n");
4239       }
4240       else if(c) {
4241 #ifndef DISABLE_TLB
4242         if(rm>=0) {
4243           if(!entry||entry[rm]!=mgr) {
4244             if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a) {
4245               // Stores to memory go thru the mapper to detect self-modifying
4246               // code, loads don't.
4247               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
4248                  (unsigned int)(constmap[i][rs]+offset)<0x80000000+RAM_SIZE )
4249                 generate_map_const(constmap[i][rs]+offset,rm);
4250             }else{
4251               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
4252                 generate_map_const(constmap[i][rs]+offset,rm);
4253             }
4254           }
4255         }
4256 #endif
4257         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
4258           if(!entry||entry[ra]!=agr) {
4259             if (opcode[i]==0x22||opcode[i]==0x26) {
4260               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4261             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4262               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4263             }else{
4264               #ifdef HOST_IMM_ADDR32
4265               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4266                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
4267               #endif
4268               emit_movimm(constmap[i][rs]+offset,ra);
4269               regs[i].loadedconst|=1<<ra;
4270             }
4271           } // else did it in the previous cycle
4272         } // else load_consts already did it
4273       }
4274       if(offset&&!c&&rs1[i]) {
4275         if(rs>=0) {
4276           emit_addimm(rs,offset,ra);
4277         }else{
4278           emit_addimm(ra,offset,ra);
4279         }
4280       }
4281     }
4282   }
4283   // Preload constants for next instruction
4284   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
4285     int agr,ra;
4286     #if !defined(HOST_IMM_ADDR32) && !defined(DISABLE_TLB)
4287     // Mapper entry
4288     agr=MGEN1+((i+1)&1);
4289     ra=get_reg(i_regs->regmap,agr);
4290     if(ra>=0) {
4291       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4292       int offset=imm[i+1];
4293       int c=(regs[i+1].wasconst>>rs)&1;
4294       if(c) {
4295         if(itype[i+1]==STORE||itype[i+1]==STORELR
4296            ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1, SWC2/SDC2
4297           // Stores to memory go thru the mapper to detect self-modifying
4298           // code, loads don't.
4299           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
4300              (unsigned int)(constmap[i+1][rs]+offset)<0x80000000+RAM_SIZE )
4301             generate_map_const(constmap[i+1][rs]+offset,ra);
4302         }else{
4303           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
4304             generate_map_const(constmap[i+1][rs]+offset,ra);
4305         }
4306       }
4307       /*else if(rs1[i]==0) {
4308         generate_map_const(offset,ra);
4309       }*/
4310     }
4311     #endif
4312     // Actual address
4313     agr=AGEN1+((i+1)&1);
4314     ra=get_reg(i_regs->regmap,agr);
4315     if(ra>=0) {
4316       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4317       int offset=imm[i+1];
4318       int c=(regs[i+1].wasconst>>rs)&1;
4319       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
4320         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4321           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4322         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4323           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4324         }else{
4325           #ifdef HOST_IMM_ADDR32
4326           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4327              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
4328           #endif
4329           emit_movimm(constmap[i+1][rs]+offset,ra);
4330           regs[i+1].loadedconst|=1<<ra;
4331         }
4332       }
4333       else if(rs1[i+1]==0) {
4334         // Using r0 as a base address
4335         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4336           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4337         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4338           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4339         }else{
4340           emit_movimm(offset,ra);
4341         }
4342       }
4343     }
4344   }
4345 }
4346
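// get_final_value: scans forward while host register hr keeps holding the
// same constant (stopping at branch targets or when the mapping changes),
// writes the value it will finally need to *value, and returns whether it
// actually has to be loaded. It also folds the immediate of a following
// dependent load into the constant where the load address can be
// precomputed.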
4347 int get_final_value(int hr, int i, int *value)
4348 {
4349   int reg=regs[i].regmap[hr];
4350   while(i<slen-1) {
4351     if(regs[i+1].regmap[hr]!=reg) break;
4352     if(!((regs[i+1].isconst>>hr)&1)) break;
4353     if(bt[i+1]) break;
4354     i++;
4355   }
4356   if(i<slen-1) {
4357     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
4358       *value=constmap[i][hr];
4359       return 1;
4360     }
4361     if(!bt[i+1]) {
4362       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
4363         // Load in delay slot, out-of-order execution
4364         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
4365         {
4366           #ifdef HOST_IMM_ADDR32
4367           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
4368           #endif
4369           // Precompute load address
4370           *value=constmap[i][hr]+imm[i+2];
4371           return 1;
4372         }
4373       }
4374       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
4375       {
4376         #ifdef HOST_IMM_ADDR32
4377         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
4378         #endif
4379         // Precompute load address
4380         *value=constmap[i][hr]+imm[i+1];
4381         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
4382         return 1;
4383       }
4384     }
4385   }
4386   *value=constmap[i][hr];
4387   //printf("c=%x\n",(int)constmap[i][hr]);
4388   if(i==slen-1) return 1;
4389   if(reg<64) {
4390     return !((unneeded_reg[i+1]>>reg)&1);
4391   }else{
4392     return !((unneeded_reg_upper[i+1]>>reg)&1);
4393   }
4394 }
4395
4396 // Load registers with known constants
4397 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
4398 {
4399   int hr,hr2;
4400   // propagate loaded constant flags
4401   if(i==0||bt[i])
4402     regs[i].loadedconst=0;
4403   else {
4404     for(hr=0;hr<HOST_REGS;hr++) {
4405       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
4406          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
4407       {
4408         regs[i].loadedconst|=1<<hr;
4409       }
4410     }
4411   }
4412   // Load 32-bit regs
4413   for(hr=0;hr<HOST_REGS;hr++) {
4414     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4415       //if(entry[hr]!=regmap[hr]) {
4416       if(!((regs[i].loadedconst>>hr)&1)) {
4417         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4418           int value,similar=0;
4419           if(get_final_value(hr,i,&value)) {
4420             // see if some other register has similar value
4421             for(hr2=0;hr2<HOST_REGS;hr2++) {
4422               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
4423                 if(is_similar_value(value,constmap[i][hr2])) {
4424                   similar=1;
4425                   break;
4426                 }
4427               }
4428             }
4429             if(similar) {
4430               int value2;
4431               if(get_final_value(hr2,i,&value2)) // is this needed?
4432                 emit_movimm_from(value2,hr2,value,hr);
4433               else
4434                 emit_movimm(value,hr);
4435             }
4436             else if(value==0) {
4437               emit_zeroreg(hr);
4438             }
4439             else {
4440               emit_movimm(value,hr);
4441             }
4442           }
4443           regs[i].loadedconst|=1<<hr;
4444         }
4445       }
4446     }
4447   }
4448   // Load 64-bit regs
4449   for(hr=0;hr<HOST_REGS;hr++) {
4450     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4451       //if(entry[hr]!=regmap[hr]) {
4452       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4453         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4454           if((is32>>(regmap[hr]&63))&1) {
4455             int lr=get_reg(regmap,regmap[hr]-64);
4456             assert(lr>=0);
4457             emit_sarimm(lr,31,hr);
4458           }
4459           else
4460           {
4461             int value;
4462             if(get_final_value(hr,i,&value)) {
4463               if(value==0) {
4464                 emit_zeroreg(hr);
4465               }
4466               else {
4467                 emit_movimm(value,hr);
4468               }
4469             }
4470           }
4471         }
4472       }
4473     }
4474   }
4475 }
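// load_all_consts: like load_consts, but simply materializes the current
// constant for every dirty constant-holding register, without the
// forward final-value scan.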
4476 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4477 {
4478   int hr;
4479   // Load 32-bit regs
4480   for(hr=0;hr<HOST_REGS;hr++) {
4481     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4482       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4483         int value=constmap[i][hr];
4484         if(value==0) {
4485           emit_zeroreg(hr);
4486         }
4487         else {
4488           emit_movimm(value,hr);
4489         }
4490       }
4491     }
4492   }
4493   // Load 64-bit regs
4494   for(hr=0;hr<HOST_REGS;hr++) {
4495     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4496       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4497         if((is32>>(regmap[hr]&63))&1) {
4498           int lr=get_reg(regmap,regmap[hr]-64);
4499           assert(lr>=0);
4500           emit_sarimm(lr,31,hr);
4501         }
4502         else
4503         {
4504           int value=constmap[i][hr];
4505           if(value==0) {
4506             emit_zeroreg(hr);
4507           }
4508           else {
4509             emit_movimm(value,hr);
4510           }
4511         }
4512       }
4513     }
4514   }
4515 }
4516
4517 // Write out all dirty registers (except cycle count)
4518 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4519 {
4520   int hr;
4521   for(hr=0;hr<HOST_REGS;hr++) {
4522     if(hr!=EXCLUDE_REG) {
4523       if(i_regmap[hr]>0) {
4524         if(i_regmap[hr]!=CCREG) {
4525           if((i_dirty>>hr)&1) {
4526             if(i_regmap[hr]<64) {
4527               emit_storereg(i_regmap[hr],hr);
4528 #ifndef FORCE32
4529               if( ((i_is32>>i_regmap[hr])&1) ) {
4530                 #ifdef DESTRUCTIVE_WRITEBACK
4531                 emit_sarimm(hr,31,hr);
4532                 emit_storereg(i_regmap[hr]|64,hr);
4533                 #else
4534                 emit_sarimm(hr,31,HOST_TEMPREG);
4535                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4536                 #endif
4537               }
4538 #endif
4539             }else{
4540               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4541                 emit_storereg(i_regmap[hr],hr);
4542               }
4543             }
4544           }
4545         }
4546       }
4547     }
4548   }
4549 }
4550 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4551 // This writes the registers not written by store_regs_bt
4552 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4553 {
4554   int hr;
4555   int t=(addr-start)>>2;
4556   for(hr=0;hr<HOST_REGS;hr++) {
4557     if(hr!=EXCLUDE_REG) {
4558       if(i_regmap[hr]>0) {
4559         if(i_regmap[hr]!=CCREG) {
4560           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4561             if((i_dirty>>hr)&1) {
4562               if(i_regmap[hr]<64) {
4563                 emit_storereg(i_regmap[hr],hr);
4564 #ifndef FORCE32
4565                 if( ((i_is32>>i_regmap[hr])&1) ) {
4566                   #ifdef DESTRUCTIVE_WRITEBACK
4567                   emit_sarimm(hr,31,hr);
4568                   emit_storereg(i_regmap[hr]|64,hr);
4569                   #else
4570                   emit_sarimm(hr,31,HOST_TEMPREG);
4571                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4572                   #endif
4573                 }
4574 #endif
4575               }else{
4576                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4577                   emit_storereg(i_regmap[hr],hr);
4578                 }
4579               }
4580             }
4581           }
4582         }
4583       }
4584     }
4585   }
4586 }
4587
4588 // Load all registers (except cycle count)
4589 void load_all_regs(signed char i_regmap[])
4590 {
4591   int hr;
4592   for(hr=0;hr<HOST_REGS;hr++) {
4593     if(hr!=EXCLUDE_REG) {
4594       if(i_regmap[hr]==0) {
4595         emit_zeroreg(hr);
4596       }
4597       else
4598       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4599       {
4600         emit_loadreg(i_regmap[hr],hr);
4601       }
4602     }
4603   }
4604 }
4605
4606 // Load all current registers also needed by next instruction
4607 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4608 {
4609   int hr;
4610   for(hr=0;hr<HOST_REGS;hr++) {
4611     if(hr!=EXCLUDE_REG) {
4612       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4613         if(i_regmap[hr]==0) {
4614           emit_zeroreg(hr);
4615         }
4616         else
4617         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4618         {
4619           emit_loadreg(i_regmap[hr],hr);
4620         }
4621       }
4622     }
4623   }
4624 }
4625
4626 // Load all regs, storing cycle count if necessary
4627 void load_regs_entry(int t)
4628 {
4629   int hr;
4630   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4631   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4632   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4633     emit_storereg(CCREG,HOST_CCREG);
4634   }
4635   // Load 32-bit regs
4636   for(hr=0;hr<HOST_REGS;hr++) {
4637     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4638       if(regs[t].regmap_entry[hr]==0) {
4639         emit_zeroreg(hr);
4640       }
4641       else if(regs[t].regmap_entry[hr]!=CCREG)
4642       {
4643         emit_loadreg(regs[t].regmap_entry[hr],hr);
4644       }
4645     }
4646   }
4647   // Load 64-bit regs
4648   for(hr=0;hr<HOST_REGS;hr++) {
4649     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4650       assert(regs[t].regmap_entry[hr]!=64);
4651       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4652         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4653         if(lr<0) {
4654           emit_loadreg(regs[t].regmap_entry[hr],hr);
4655         }
4656         else
4657         {
4658           emit_sarimm(lr,31,hr);
4659         }
4660       }
4661       else
4662       {
4663         emit_loadreg(regs[t].regmap_entry[hr],hr);
4664       }
4665     }
4666   }
4667 }
4668
4669 // Store dirty registers prior to branch
4670 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4671 {
4672   if(internal_branch(i_is32,addr))
4673   {
4674     int t=(addr-start)>>2;
4675     int hr;
4676     for(hr=0;hr<HOST_REGS;hr++) {
4677       if(hr!=EXCLUDE_REG) {
4678         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4679           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4680             if((i_dirty>>hr)&1) {
4681               if(i_regmap[hr]<64) {
4682                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4683                   emit_storereg(i_regmap[hr],hr);
4684                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4685                     #ifdef DESTRUCTIVE_WRITEBACK
4686                     emit_sarimm(hr,31,hr);
4687                     emit_storereg(i_regmap[hr]|64,hr);
4688                     #else
4689                     emit_sarimm(hr,31,HOST_TEMPREG);
4690                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4691                     #endif
4692                   }
4693                 }
4694               }else{
4695                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4696                   emit_storereg(i_regmap[hr],hr);
4697                 }
4698               }
4699             }
4700           }
4701         }
4702       }
4703     }
4704   }
4705   else
4706   {
4707     // Branch out of this block, write out all dirty regs
4708     wb_dirtys(i_regmap,i_is32,i_dirty);
4709   }
4710 }
4711
4712 // Load all needed registers for branch target
4713 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4714 {
4715   //if(addr>=start && addr<(start+slen*4))
4716   if(internal_branch(i_is32,addr))
4717   {
4718     int t=(addr-start)>>2;
4719     int hr;
4720     // Store the cycle count before loading something else
4721     if(i_regmap[HOST_CCREG]!=CCREG) {
4722       assert(i_regmap[HOST_CCREG]==-1);
4723     }
4724     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4725       emit_storereg(CCREG,HOST_CCREG);
4726     }
4727     // Load 32-bit regs
4728     for(hr=0;hr<HOST_REGS;hr++) {
4729       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4730         #ifdef DESTRUCTIVE_WRITEBACK
4731         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4732         #else
4733         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4734         #endif
4735           if(regs[t].regmap_entry[hr]==0) {
4736             emit_zeroreg(hr);
4737           }
4738           else if(regs[t].regmap_entry[hr]!=CCREG)
4739           {
4740             emit_loadreg(regs[t].regmap_entry[hr],hr);
4741           }
4742         }
4743       }
4744     }
4745     // Load 64-bit regs
4746     for(hr=0;hr<HOST_REGS;hr++) {
4747       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4748         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4749           assert(regs[t].regmap_entry[hr]!=64);
4750           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4751             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4752             if(lr<0) {
4753               emit_loadreg(regs[t].regmap_entry[hr],hr);
4754             }
4755             else
4756             {
4757               emit_sarimm(lr,31,hr);
4758             }
4759           }
4760           else
4761           {
4762             emit_loadreg(regs[t].regmap_entry[hr],hr);
4763           }
4764         }
4765         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4766           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4767           assert(lr>=0);
4768           emit_sarimm(lr,31,hr);
4769         }
4770       }
4771     }
4772   }
4773 }
4774
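// match_bt: returns 1 if the current register state (mapping, dirty bits,
// 32/64-bit width) is compatible with the entry state recorded for the
// branch target, so the branch can be linked directly without writeback.
// Targets outside the block only match if nothing except the cycle count
// is dirty; delay-slot targets never match.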
4775 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4776 {
4777   if(addr>=start && addr<start+slen*4-4)
4778   {
4779     int t=(addr-start)>>2;
4780     int hr;
4781     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4782     for(hr=0;hr<HOST_REGS;hr++)
4783     {
4784       if(hr!=EXCLUDE_REG)
4785       {
4786         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4787         {
4788           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4789           {
4790             return 0;
4791           }
4792           else 
4793           if((i_dirty>>hr)&1)
4794           {
4795             if(i_regmap[hr]<TEMPREG)
4796             {
4797               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4798                 return 0;
4799             }
4800             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4801             {
4802               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4803                 return 0;
4804             }
4805           }
4806         }
4807         else // Same register but is it 32-bit or dirty?
4808         if(i_regmap[hr]>=0)
4809         {
4810           if(!((regs[t].dirty>>hr)&1))
4811           {
4812             if((i_dirty>>hr)&1)
4813             {
4814               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4815               {
4816                 //printf("%x: dirty no match\n",addr);
4817                 return 0;
4818               }
4819             }
4820           }
4821           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4822           {
4823             //printf("%x: is32 no match\n",addr);
4824             return 0;
4825           }
4826         }
4827       }
4828     }
4829     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4830 #ifndef FORCE32
4831     if(requires_32bit[t]&~i_is32) return 0;
4832 #endif
4833     // Delay slots are not valid branch targets
4834     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4835     // Delay slots require additional processing, so do not match
4836     if(is_ds[t]) return 0;
4837   }
4838   else
4839   {
4840     int hr;
4841     for(hr=0;hr<HOST_REGS;hr++)
4842     {
4843       if(hr!=EXCLUDE_REG)
4844       {
4845         if(i_regmap[hr]>=0)
4846         {
4847           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4848           {
4849             if((i_dirty>>hr)&1)
4850             {
4851               return 0;
4852             }
4853           }
4854         }
4855       }
4856     }
4857   }
4858   return 1;
4859 }
4860
4861 // Used when a branch jumps into the delay slot of another branch
4862 void ds_assemble_entry(int i)
4863 {
4864   int t=(ba[i]-start)>>2;
4865   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4866   assem_debug("Assemble delay slot at %x\n",ba[i]);
4867   assem_debug("<->\n");
4868   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4869     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4870   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4871   address_generation(t,&regs[t],regs[t].regmap_entry);
4872   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4873     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4874   cop1_usable=0;
4875   is_delayslot=0;
4876   switch(itype[t]) {
4877     case ALU:
4878       alu_assemble(t,&regs[t]);break;
4879     case IMM16:
4880       imm16_assemble(t,&regs[t]);break;
4881     case SHIFT:
4882       shift_assemble(t,&regs[t]);break;
4883     case SHIFTIMM:
4884       shiftimm_assemble(t,&regs[t]);break;
4885     case LOAD:
4886       load_assemble(t,&regs[t]);break;
4887     case LOADLR:
4888       loadlr_assemble(t,&regs[t]);break;
4889     case STORE:
4890       store_assemble(t,&regs[t]);break;
4891     case STORELR:
4892       storelr_assemble(t,&regs[t]);break;
4893     case COP0:
4894       cop0_assemble(t,&regs[t]);break;
4895     case COP1:
4896       cop1_assemble(t,&regs[t]);break;
4897     case C1LS:
4898       c1ls_assemble(t,&regs[t]);break;
4899     case COP2:
4900       cop2_assemble(t,&regs[t]);break;
4901     case C2LS:
4902       c2ls_assemble(t,&regs[t]);break;
4903     case C2OP:
4904       c2op_assemble(t,&regs[t]);break;
4905     case FCONV:
4906       fconv_assemble(t,&regs[t]);break;
4907     case FLOAT:
4908       float_assemble(t,&regs[t]);break;
4909     case FCOMP:
4910       fcomp_assemble(t,&regs[t]);break;
4911     case MULTDIV:
4912       multdiv_assemble(t,&regs[t]);break;
4913     case MOV:
4914       mov_assemble(t,&regs[t]);break;
4915     case SYSCALL:
4916     case HLECALL:
4917     case INTCALL:
4918     case SPAN:
4919     case UJUMP:
4920     case RJUMP:
4921     case CJUMP:
4922     case SJUMP:
4923     case FJUMP:
4924       printf("Jump in the delay slot.  This is probably a bug.\n");
4925   }
4926   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4927   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4928   if(internal_branch(regs[t].is32,ba[i]+4))
4929     assem_debug("branch: internal\n");
4930   else
4931     assem_debug("branch: external\n");
4932   assert(internal_branch(regs[t].is32,ba[i]+4));
4933   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4934   emit_jmp(0);
4935 }
4936
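// do_cc: emits the cycle-counter update and check in front of a branch.
// The block's cycle count is added to HOST_CCREG and a CC_STUB is added
// for the case where the counter runs out; a branch to itself with an
// all-zero delay slot is detected as an idle loop and handled specially.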
4937 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4938 {
4939   int count;
4940   int jaddr;
4941   int idle=0;
4942   if(itype[i]==RJUMP)
4943   {
4944     *adj=0;
4945   }
4946   //if(ba[i]>=start && ba[i]<(start+slen*4))
4947   if(internal_branch(branch_regs[i].is32,ba[i]))
4948   {
4949     int t=(ba[i]-start)>>2;
4950     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4951     else *adj=ccadj[t];
4952   }
4953   else
4954   {
4955     *adj=0;
4956   }
4957   count=ccadj[i];
4958   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4959     // Idle loop
4960     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4961     idle=(int)out;
4962     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4963     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4964     jaddr=(int)out;
4965     emit_jmp(0);
4966   }
4967   else if(*adj==0||invert) {
4968     emit_addimm_and_set_flags(CLOCK_ADJUST(count+2),HOST_CCREG);
4969     jaddr=(int)out;
4970     emit_jns(0);
4971   }
4972   else
4973   {
4974     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
4975     jaddr=(int)out;
4976     emit_jns(0);
4977   }
4978   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4979 }
4980
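// do_ccstub: out-of-line code reached when the counter check from do_cc
// trips. It writes back dirty registers (taking nullified delay slots and
// taken/not-taken state into account) and stores the address to resume at
// in pcaddr; for conditional branches the condition is re-evaluated here
// with cmov sequences to pick between the taken and not-taken address.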
4981 void do_ccstub(int n)
4982 {
4983   literal_pool(256);
4984   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4985   set_jump_target(stubs[n][1],(int)out);
4986   int i=stubs[n][4];
4987   if(stubs[n][6]==NULLDS) {
4988     // Delay slot instruction is nullified ("likely" branch)
4989     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4990   }
4991   else if(stubs[n][6]!=TAKEN) {
4992     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4993   }
4994   else {
4995     if(internal_branch(branch_regs[i].is32,ba[i]))
4996       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4997   }
4998   if(stubs[n][5]!=-1)
4999   {
5000     // Save PC as return address
5001     emit_movimm(stubs[n][5],EAX);
5002     emit_writeword(EAX,(int)&pcaddr);
5003   }
5004   else
5005   {
5006     // Return address depends on which way the branch goes
5007     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
5008     {
5009       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5010       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5011       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5012       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5013       if(rs1[i]==0)
5014       {
5015         s1l=s2l;s1h=s2h;
5016         s2l=s2h=-1;
5017       }
5018       else if(rs2[i]==0)
5019       {
5020         s2l=s2h=-1;
5021       }
5022       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
5023         s1h=s2h=-1;
5024       }
5025       assert(s1l>=0);
5026       #ifdef DESTRUCTIVE_WRITEBACK
5027       if(rs1[i]) {
5028         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
5029           emit_loadreg(rs1[i],s1l);
5030       } 
5031       else {
5032         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
5033           emit_loadreg(rs2[i],s1l);
5034       }
5035       if(s2l>=0)
5036         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
5037           emit_loadreg(rs2[i],s2l);
5038       #endif
5039       int hr=0;
5040       int addr=-1,alt=-1,ntaddr=-1;
5041       while(hr<HOST_REGS)
5042       {
5043         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5044            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5045            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5046         {
5047           addr=hr++;break;
5048         }
5049         hr++;
5050       }
5051       while(hr<HOST_REGS)
5052       {
5053         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5054            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5055            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5056         {
5057           alt=hr++;break;
5058         }
5059         hr++;
5060       }
5061       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5062       {
5063         while(hr<HOST_REGS)
5064         {
5065           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5066              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5067              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5068           {
5069             ntaddr=hr;break;
5070           }
5071           hr++;
5072         }
5073         assert(hr<HOST_REGS);
5074       }
5075       if((opcode[i]&0x2f)==4) // BEQ
5076       {
5077         #ifdef HAVE_CMOV_IMM
5078         if(s1h<0) {
5079           if(s2l>=0) emit_cmp(s1l,s2l);
5080           else emit_test(s1l,s1l);
5081           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5082         }
5083         else
5084         #endif
5085         {
5086           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5087           if(s1h>=0) {
5088             if(s2h>=0) emit_cmp(s1h,s2h);
5089             else emit_test(s1h,s1h);
5090             emit_cmovne_reg(alt,addr);
5091           }
5092           if(s2l>=0) emit_cmp(s1l,s2l);
5093           else emit_test(s1l,s1l);
5094           emit_cmovne_reg(alt,addr);
5095         }
5096       }
5097       if((opcode[i]&0x2f)==5) // BNE
5098       {
5099         #ifdef HAVE_CMOV_IMM
5100         if(s1h<0) {
5101           if(s2l>=0) emit_cmp(s1l,s2l);
5102           else emit_test(s1l,s1l);
5103           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5104         }
5105         else
5106         #endif
5107         {
5108           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5109           if(s1h>=0) {
5110             if(s2h>=0) emit_cmp(s1h,s2h);
5111             else emit_test(s1h,s1h);
5112             emit_cmovne_reg(alt,addr);
5113           }
5114           if(s2l>=0) emit_cmp(s1l,s2l);
5115           else emit_test(s1l,s1l);
5116           emit_cmovne_reg(alt,addr);
5117         }
5118       }
5119       if((opcode[i]&0x2f)==6) // BLEZ
5120       {
5121         //emit_movimm(ba[i],alt);
5122         //emit_movimm(start+i*4+8,addr);
5123         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5124         emit_cmpimm(s1l,1);
5125         if(s1h>=0) emit_mov(addr,ntaddr);
5126         emit_cmovl_reg(alt,addr);
5127         if(s1h>=0) {
5128           emit_test(s1h,s1h);
5129           emit_cmovne_reg(ntaddr,addr);
5130           emit_cmovs_reg(alt,addr);
5131         }
5132       }
5133       if((opcode[i]&0x2f)==7) // BGTZ
5134       {
5135         //emit_movimm(ba[i],addr);
5136         //emit_movimm(start+i*4+8,ntaddr);
5137         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5138         emit_cmpimm(s1l,1);
5139         if(s1h>=0) emit_mov(addr,alt);
5140         emit_cmovl_reg(ntaddr,addr);
5141         if(s1h>=0) {
5142           emit_test(s1h,s1h);
5143           emit_cmovne_reg(alt,addr);
5144           emit_cmovs_reg(ntaddr,addr);
5145         }
5146       }
5147       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
5148       {
5149         //emit_movimm(ba[i],alt);
5150         //emit_movimm(start+i*4+8,addr);
5151         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5152         if(s1h>=0) emit_test(s1h,s1h);
5153         else emit_test(s1l,s1l);
5154         emit_cmovs_reg(alt,addr);
5155       }
5156       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
5157       {
5158         //emit_movimm(ba[i],addr);
5159         //emit_movimm(start+i*4+8,alt);
5160         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5161         if(s1h>=0) emit_test(s1h,s1h);
5162         else emit_test(s1l,s1l);
5163         emit_cmovs_reg(alt,addr);
5164       }
5165       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
5166         if(source[i]&0x10000) // BC1T
5167         {
5168           //emit_movimm(ba[i],alt);
5169           //emit_movimm(start+i*4+8,addr);
5170           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5171           emit_testimm(s1l,0x800000);
5172           emit_cmovne_reg(alt,addr);
5173         }
5174         else // BC1F
5175         {
5176           //emit_movimm(ba[i],addr);
5177           //emit_movimm(start+i*4+8,alt);
5178           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5179           emit_testimm(s1l,0x800000);
5180           emit_cmovne_reg(alt,addr);
5181         }
5182       }
5183       emit_writeword(addr,(int)&pcaddr);
5184     }
5185     else
5186     if(itype[i]==RJUMP)
5187     {
5188       int r=get_reg(branch_regs[i].regmap,rs1[i]);
5189       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5190         r=get_reg(branch_regs[i].regmap,RTEMP);
5191       }
5192       emit_writeword(r,(int)&pcaddr);
5193     }
5194     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
5195   }
5196   // Update cycle count
5197   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
5198   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
5199   emit_call((int)cc_interrupt);
5200   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
5201   if(stubs[n][6]==TAKEN) {
5202     if(internal_branch(branch_regs[i].is32,ba[i]))
5203       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
5204     else if(itype[i]==RJUMP) {
5205       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
5206         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
5207       else
5208         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
5209     }
5210   }else if(stubs[n][6]==NOTTAKEN) {
5211     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
5212     else load_all_regs(branch_regs[i].regmap);
5213   }else if(stubs[n][6]==NULLDS) {
5214     // Delay slot instruction is nullified ("likely" branch)
5215     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
5216     else load_all_regs(regs[i].regmap);
5217   }else{
5218     load_all_regs(branch_regs[i].regmap);
5219   }
5220   emit_jmp(stubs[n][2]); // return address
5221   
5222   /* This works but uses a lot of memory...
5223   emit_readword((int)&last_count,ECX);
5224   emit_add(HOST_CCREG,ECX,EAX);
5225   emit_writeword(EAX,(int)&Count);
5226   emit_call((int)gen_interupt);
5227   emit_readword((int)&Count,HOST_CCREG);
5228   emit_readword((int)&next_interupt,EAX);
5229   emit_readword((int)&pending_exception,EBX);
5230   emit_writeword(EAX,(int)&last_count);
5231   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
5232   emit_test(EBX,EBX);
5233   int jne_instr=(int)out;
5234   emit_jne(0);
5235   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
5236   load_all_regs(branch_regs[i].regmap);
5237   emit_jmp(stubs[n][2]); // return address
5238   set_jump_target(jne_instr,(int)out);
5239   emit_readword((int)&pcaddr,EAX);
5240   // Call get_addr_ht instead of doing the hash table here.
5241   // This code is executed infrequently and takes up a lot of space
5242   // so smaller is better.
5243   emit_storereg(CCREG,HOST_CCREG);
5244   emit_pushreg(EAX);
5245   emit_call((int)get_addr_ht);
5246   emit_loadreg(CCREG,HOST_CCREG);
5247   emit_addimm(ESP,4,ESP);
5248   emit_jmpreg(EAX);*/
5249 }
5250
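// Record a branch site so it can later be patched to jump directly to the
// compiled code for 'target'.  Despite the parameter name, callers pass the
// internal_branch() result as 'ext' (nonzero for targets inside the
// current block).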
5251 add_to_linker(int addr,int target,int ext)
5252 {
5253   link_addr[linkcount][0]=addr;
5254   link_addr[linkcount][1]=target;
5255   link_addr[linkcount][2]=ext;  
5256   linkcount++;
5257 }
5258
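// Write the JAL return address (branch PC + 8) into $ra's host register.
// With USE_MINI_HT and an internal return address whose delay slot doesn't
// write $ra, the value goes through do_miniht_insert() so the matching
// JR $ra can be dispatched via the mini hash table.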
5259 static void ujump_assemble_write_ra(int i)
5260 {
5261   int rt;
5262   unsigned int return_address;
5263   rt=get_reg(branch_regs[i].regmap,31);
5264   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5265   //assert(rt>=0);
5266   return_address=start+i*4+8;
5267   if(rt>=0) {
5268     #ifdef USE_MINI_HT
5269     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
5270       int temp=-1; // note: must be ds-safe
5271       #ifdef HOST_TEMPREG
5272       temp=HOST_TEMPREG;
5273       #endif
5274       if(temp>=0) do_miniht_insert(return_address,rt,temp);
5275       else emit_movimm(return_address,rt);
5276     }
5277     else
5278     #endif
5279     {
5280       #ifdef REG_PREFETCH
5281       if(temp>=0) 
5282       {
5283         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5284       }
5285       #endif
5286       emit_movimm(return_address,rt); // PC into link register
5287       #ifdef IMM_PREFETCH
5288       emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5289       #endif
5290     }
5291   }
5292 }
5293
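// J/JAL.  The delay slot is assembled first; $ra is written before it when
// the delay slot reads $ra.  After invalidating/loading registers for the
// target and doing the cycle check, either an inline delay-slot entry is
// assembled (internal target that lands in a delay slot) or a jump is
// emitted and handed to add_to_linker() for patching.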
5294 void ujump_assemble(int i,struct regstat *i_regs)
5295 {
5296   signed char *i_regmap=i_regs->regmap;
5297   int ra_done=0;
5298   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5299   address_generation(i+1,i_regs,regs[i].regmap_entry);
5300   #ifdef REG_PREFETCH
5301   int temp=get_reg(branch_regs[i].regmap,PTEMP);
5302   if(rt1[i]==31&&temp>=0) 
5303   {
5304     int return_address=start+i*4+8;
5305     if(get_reg(branch_regs[i].regmap,31)>0) 
5306     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5307   }
5308   #endif
5309   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5310     ujump_assemble_write_ra(i); // writeback ra for DS
5311     ra_done=1;
5312   }
5313   ds_assemble(i+1,i_regs);
5314   uint64_t bc_unneeded=branch_regs[i].u;
5315   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5316   bc_unneeded|=1|(1LL<<rt1[i]);
5317   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5318   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5319                 bc_unneeded,bc_unneeded_upper);
5320   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5321   if(!ra_done&&rt1[i]==31)
5322     ujump_assemble_write_ra(i);
5323   int cc,adj;
5324   cc=get_reg(branch_regs[i].regmap,CCREG);
5325   assert(cc==HOST_CCREG);
5326   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5327   #ifdef REG_PREFETCH
5328   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5329   #endif
5330   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5331   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5332   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5333   if(internal_branch(branch_regs[i].is32,ba[i]))
5334     assem_debug("branch: internal\n");
5335   else
5336     assem_debug("branch: external\n");
5337   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
5338     ds_assemble_entry(i);
5339   }
5340   else {
5341     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
5342     emit_jmp(0);
5343   }
5344 }
5345
5346 static void rjump_assemble_write_ra(int i)
5347 {
5348   int rt,return_address;
5349   assert(rt1[i+1]!=rt1[i]);
5350   assert(rt2[i+1]!=rt1[i]);
5351   rt=get_reg(branch_regs[i].regmap,rt1[i]);
5352   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5353   assert(rt>=0);
5354   return_address=start+i*4+8;
5355   #ifdef REG_PREFETCH
5356   if(temp>=0) 
5357   {
5358     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5359   }
5360   #endif
5361   emit_movimm(return_address,rt); // PC into link register
5362   #ifdef IMM_PREFETCH
5363   emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5364   #endif
5365 }
5366
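// JR/JALR.  If the delay slot overwrites the jump register, its value is
// copied to RTEMP first.  After the delay slot and the cycle check, control
// is dispatched indirectly through jump_vaddr_reg[rs], or through the mini
// hash table for JR $ra when USE_MINI_HT is set.  The PCSX special case
// makes the jump to the CC stub unconditional when the delay slot is RFE,
// presumably so that interrupts unmasked by RFE are serviced immediately.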
5367 void rjump_assemble(int i,struct regstat *i_regs)
5368 {
5369   signed char *i_regmap=i_regs->regmap;
5370   int temp;
5371   int rs,cc,adj;
5372   int ra_done=0;
5373   rs=get_reg(branch_regs[i].regmap,rs1[i]);
5374   assert(rs>=0);
5375   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5376     // Delay slot abuse, make a copy of the branch address register
5377     temp=get_reg(branch_regs[i].regmap,RTEMP);
5378     assert(temp>=0);
5379     assert(regs[i].regmap[temp]==RTEMP);
5380     emit_mov(rs,temp);
5381     rs=temp;
5382   }
5383   address_generation(i+1,i_regs,regs[i].regmap_entry);
5384   #ifdef REG_PREFETCH
5385   if(rt1[i]==31) 
5386   {
5387     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
5388       int return_address=start+i*4+8;
5389       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5390     }
5391   }
5392   #endif
5393   #ifdef USE_MINI_HT
5394   if(rs1[i]==31) {
5395     int rh=get_reg(regs[i].regmap,RHASH);
5396     if(rh>=0) do_preload_rhash(rh);
5397   }
5398   #endif
5399   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5400     rjump_assemble_write_ra(i);
5401     ra_done=1;
5402   }
5403   ds_assemble(i+1,i_regs);
5404   uint64_t bc_unneeded=branch_regs[i].u;
5405   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5406   bc_unneeded|=1|(1LL<<rt1[i]);
5407   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5408   bc_unneeded&=~(1LL<<rs1[i]);
5409   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5410                 bc_unneeded,bc_unneeded_upper);
5411   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
5412   if(!ra_done&&rt1[i]!=0)
5413     rjump_assemble_write_ra(i);
5414   cc=get_reg(branch_regs[i].regmap,CCREG);
5415   assert(cc==HOST_CCREG);
5416   #ifdef USE_MINI_HT
5417   int rh=get_reg(branch_regs[i].regmap,RHASH);
5418   int ht=get_reg(branch_regs[i].regmap,RHTBL);
5419   if(rs1[i]==31) {
5420     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
5421     do_preload_rhtbl(ht);
5422     do_rhash(rs,rh);
5423   }
5424   #endif
5425   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5426   #ifdef DESTRUCTIVE_WRITEBACK
5427   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
5428     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
5429       emit_loadreg(rs1[i],rs);
5430     }
5431   }
5432   #endif
5433   #ifdef REG_PREFETCH
5434   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5435   #endif
5436   #ifdef USE_MINI_HT
5437   if(rs1[i]==31) {
5438     do_miniht_load(ht,rh);
5439   }
5440   #endif
5441   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
5442   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
5443   //assert(adj==0);
5444   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5445   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
5446 #ifdef PCSX
5447   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
5448     // special case for RFE
5449     emit_jmp(0);
5450   else
5451 #endif
5452   emit_jns(0);
5453   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5454   #ifdef USE_MINI_HT
5455   if(rs1[i]==31) {
5456     do_miniht_jump(rs,rh,ht);
5457   }
5458   else
5459   #endif
5460   {
5461     //if(rs!=EAX) emit_mov(rs,EAX);
5462     //emit_jmp((int)jump_vaddr_eax);
5463     emit_jmp(jump_vaddr_reg[rs]);
5464   }
5465   /* Check hash table
5466   temp=!rs;
5467   emit_mov(rs,temp);
5468   emit_shrimm(rs,16,rs);
5469   emit_xor(temp,rs,rs);
5470   emit_movzwl_reg(rs,rs);
5471   emit_shlimm(rs,4,rs);
5472   emit_cmpmem_indexed((int)hash_table,rs,temp);
5473   emit_jne((int)out+14);
5474   emit_readword_indexed((int)hash_table+4,rs,rs);
5475   emit_jmpreg(rs);
5476   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
5477   emit_addimm_no_flags(8,rs);
5478   emit_jeq((int)out-17);
5479   // No hit on hash table, call compiler
5480   emit_pushreg(temp);
5481 //DEBUG >
5482 #ifdef DEBUG_CYCLE_COUNT
5483   emit_readword((int)&last_count,ECX);
5484   emit_add(HOST_CCREG,ECX,HOST_CCREG);
5485   emit_readword((int)&next_interupt,ECX);
5486   emit_writeword(HOST_CCREG,(int)&Count);
5487   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
5488   emit_writeword(ECX,(int)&last_count);
5489 #endif
5490 //DEBUG <
5491   emit_storereg(CCREG,HOST_CCREG);
5492   emit_call((int)get_addr);
5493   emit_loadreg(CCREG,HOST_CCREG);
5494   emit_addimm(ESP,4,ESP);
5495   emit_jmpreg(EAX);*/
5496   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5497   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5498   #endif
5499 }
5500
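// BEQ/BNE/BLEZ/BGTZ and the "likely" variants.  Two code shapes:
//  - out of order (ooo[i]): the delay slot is assembled first, then the
//    compare and a conditional jump to the (to-be-linked) target;
//  - in order: compare first, then assemble the delay slot separately on
//    the taken and not-taken paths; the likely forms nullify it when not
//    taken (NULLDS).
// 'invert' reverses the test so that a short forward jump skips an inline
// taken-path epilogue (register writeback + jump to target); it is used
// when the target's register mapping doesn't match (match_bt) or, with the
// Cortex-A8 hack, for backward branches.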
5501 void cjump_assemble(int i,struct regstat *i_regs)
5502 {
5503   signed char *i_regmap=i_regs->regmap;
5504   int cc;
5505   int match;
5506   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5507   assem_debug("match=%d\n",match);
5508   int s1h,s1l,s2h,s2l;
5509   int prev_cop1_usable=cop1_usable;
5510   int unconditional=0,nop=0;
5511   int only32=0;
5512   int invert=0;
5513   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5514   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5515   if(!match) invert=1;
5516   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5517   if(i>(ba[i]-start)>>2) invert=1;
5518   #endif
5519   
5520   if(ooo[i]) {
5521     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5522     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5523     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5524     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5525   }
5526   else {
5527     s1l=get_reg(i_regmap,rs1[i]);
5528     s1h=get_reg(i_regmap,rs1[i]|64);
5529     s2l=get_reg(i_regmap,rs2[i]);
5530     s2h=get_reg(i_regmap,rs2[i]|64);
5531   }
5532   if(rs1[i]==0&&rs2[i]==0)
5533   {
5534     if(opcode[i]&1) nop=1;
5535     else unconditional=1;
5536     //assert(opcode[i]!=5);
5537     //assert(opcode[i]!=7);
5538     //assert(opcode[i]!=0x15);
5539     //assert(opcode[i]!=0x17);
5540   }
5541   else if(rs1[i]==0)
5542   {
5543     s1l=s2l;s1h=s2h;
5544     s2l=s2h=-1;
5545     only32=(regs[i].was32>>rs2[i])&1;
5546   }
5547   else if(rs2[i]==0)
5548   {
5549     s2l=s2h=-1;
5550     only32=(regs[i].was32>>rs1[i])&1;
5551   }
5552   else {
5553     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5554   }
5555
5556   if(ooo[i]) {
5557     // Out of order execution (delay slot first)
5558     //printf("OOOE\n");
5559     address_generation(i+1,i_regs,regs[i].regmap_entry);
5560     ds_assemble(i+1,i_regs);
5561     int adj;
5562     uint64_t bc_unneeded=branch_regs[i].u;
5563     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5564     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5565     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5566     bc_unneeded|=1;
5567     bc_unneeded_upper|=1;
5568     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5569                   bc_unneeded,bc_unneeded_upper);
5570     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5571     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5572     cc=get_reg(branch_regs[i].regmap,CCREG);
5573     assert(cc==HOST_CCREG);
5574     if(unconditional) 
5575       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5576     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5577     //assem_debug("cycle count (adj)\n");
5578     if(unconditional) {
5579       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5580       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5581         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5582         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5583         if(internal)
5584           assem_debug("branch: internal\n");
5585         else
5586           assem_debug("branch: external\n");
5587         if(internal&&is_ds[(ba[i]-start)>>2]) {
5588           ds_assemble_entry(i);
5589         }
5590         else {
5591           add_to_linker((int)out,ba[i],internal);
5592           emit_jmp(0);
5593         }
5594         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5595         if(((u_int)out)&7) emit_addnop(0);
5596         #endif
5597       }
5598     }
5599     else if(nop) {
5600       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5601       int jaddr=(int)out;
5602       emit_jns(0);
5603       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5604     }
5605     else {
5606       int taken=0,nottaken=0,nottaken1=0;
5607       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5608       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5609       if(!only32)
5610       {
5611         assert(s1h>=0);
5612         if(opcode[i]==4) // BEQ
5613         {
5614           if(s2h>=0) emit_cmp(s1h,s2h);
5615           else emit_test(s1h,s1h);
5616           nottaken1=(int)out;
5617           emit_jne(1);
5618         }
5619         if(opcode[i]==5) // BNE
5620         {
5621           if(s2h>=0) emit_cmp(s1h,s2h);
5622           else emit_test(s1h,s1h);
5623           if(invert) taken=(int)out;
5624           else add_to_linker((int)out,ba[i],internal);
5625           emit_jne(0);
5626         }
5627         if(opcode[i]==6) // BLEZ
5628         {
5629           emit_test(s1h,s1h);
5630           if(invert) taken=(int)out;
5631           else add_to_linker((int)out,ba[i],internal);
5632           emit_js(0);
5633           nottaken1=(int)out;
5634           emit_jne(1);
5635         }
5636         if(opcode[i]==7) // BGTZ
5637         {
5638           emit_test(s1h,s1h);
5639           nottaken1=(int)out;
5640           emit_js(1);
5641           if(invert) taken=(int)out;
5642           else add_to_linker((int)out,ba[i],internal);
5643           emit_jne(0);
5644         }
5645       } // if(!only32)
5646           
5647       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5648       assert(s1l>=0);
5649       if(opcode[i]==4) // BEQ
5650       {
5651         if(s2l>=0) emit_cmp(s1l,s2l);
5652         else emit_test(s1l,s1l);
5653         if(invert){
5654           nottaken=(int)out;
5655           emit_jne(1);
5656         }else{
5657           add_to_linker((int)out,ba[i],internal);
5658           emit_jeq(0);
5659         }
5660       }
5661       if(opcode[i]==5) // BNE
5662       {
5663         if(s2l>=0) emit_cmp(s1l,s2l);
5664         else emit_test(s1l,s1l);
5665         if(invert){
5666           nottaken=(int)out;
5667           emit_jeq(1);
5668         }else{
5669           add_to_linker((int)out,ba[i],internal);
5670           emit_jne(0);
5671         }
5672       }
5673       if(opcode[i]==6) // BLEZ
5674       {
5675         emit_cmpimm(s1l,1);
5676         if(invert){
5677           nottaken=(int)out;
5678           emit_jge(1);
5679         }else{
5680           add_to_linker((int)out,ba[i],internal);
5681           emit_jl(0);
5682         }
5683       }
5684       if(opcode[i]==7) // BGTZ
5685       {
5686         emit_cmpimm(s1l,1);
5687         if(invert){
5688           nottaken=(int)out;
5689           emit_jl(1);
5690         }else{
5691           add_to_linker((int)out,ba[i],internal);
5692           emit_jge(0);
5693         }
5694       }
5695       if(invert) {
5696         if(taken) set_jump_target(taken,(int)out);
5697         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5698         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5699           if(adj) {
5700             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5701             add_to_linker((int)out,ba[i],internal);
5702           }else{
5703             emit_addnop(13);
5704             add_to_linker((int)out,ba[i],internal*2);
5705           }
5706           emit_jmp(0);
5707         }else
5708         #endif
5709         {
5710           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5711           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5712           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5713           if(internal)
5714             assem_debug("branch: internal\n");
5715           else
5716             assem_debug("branch: external\n");
5717           if(internal&&is_ds[(ba[i]-start)>>2]) {
5718             ds_assemble_entry(i);
5719           }
5720           else {
5721             add_to_linker((int)out,ba[i],internal);
5722             emit_jmp(0);
5723           }
5724         }
5725         set_jump_target(nottaken,(int)out);
5726       }
5727
5728       if(nottaken1) set_jump_target(nottaken1,(int)out);
5729       if(adj) {
5730         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5731       }
5732     } // (!unconditional)
5733   } // if(ooo)
5734   else
5735   {
5736     // In-order execution (branch first)
5737     //if(likely[i]) printf("IOL\n");
5738     //else
5739     //printf("IOE\n");
5740     int taken=0,nottaken=0,nottaken1=0;
5741     if(!unconditional&&!nop) {
5742       if(!only32)
5743       {
5744         assert(s1h>=0);
5745         if((opcode[i]&0x2f)==4) // BEQ
5746         {
5747           if(s2h>=0) emit_cmp(s1h,s2h);
5748           else emit_test(s1h,s1h);
5749           nottaken1=(int)out;
5750           emit_jne(2);
5751         }
5752         if((opcode[i]&0x2f)==5) // BNE
5753         {
5754           if(s2h>=0) emit_cmp(s1h,s2h);
5755           else emit_test(s1h,s1h);
5756           taken=(int)out;
5757           emit_jne(1);
5758         }
5759         if((opcode[i]&0x2f)==6) // BLEZ
5760         {
5761           emit_test(s1h,s1h);
5762           taken=(int)out;
5763           emit_js(1);
5764           nottaken1=(int)out;
5765           emit_jne(2);
5766         }
5767         if((opcode[i]&0x2f)==7) // BGTZ
5768         {
5769           emit_test(s1h,s1h);
5770           nottaken1=(int)out;
5771           emit_js(2);
5772           taken=(int)out;
5773           emit_jne(1);
5774         }
5775       } // if(!only32)
5776           
5777       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5778       assert(s1l>=0);
5779       if((opcode[i]&0x2f)==4) // BEQ
5780       {
5781         if(s2l>=0) emit_cmp(s1l,s2l);
5782         else emit_test(s1l,s1l);
5783         nottaken=(int)out;
5784         emit_jne(2);
5785       }
5786       if((opcode[i]&0x2f)==5) // BNE
5787       {
5788         if(s2l>=0) emit_cmp(s1l,s2l);
5789         else emit_test(s1l,s1l);
5790         nottaken=(int)out;
5791         emit_jeq(2);
5792       }
5793       if((opcode[i]&0x2f)==6) // BLEZ
5794       {
5795         emit_cmpimm(s1l,1);
5796         nottaken=(int)out;
5797         emit_jge(2);
5798       }
5799       if((opcode[i]&0x2f)==7) // BGTZ
5800       {
5801         emit_cmpimm(s1l,1);
5802         nottaken=(int)out;
5803         emit_jl(2);
5804       }
5805     } // if(!unconditional)
5806     int adj;
5807     uint64_t ds_unneeded=branch_regs[i].u;
5808     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5809     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5810     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5811     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5812     ds_unneeded|=1;
5813     ds_unneeded_upper|=1;
5814     // branch taken
5815     if(!nop) {
5816       if(taken) set_jump_target(taken,(int)out);
5817       assem_debug("1:\n");
5818       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5819                     ds_unneeded,ds_unneeded_upper);
5820       // load regs
5821       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5822       address_generation(i+1,&branch_regs[i],0);
5823       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5824       ds_assemble(i+1,&branch_regs[i]);
5825       cc=get_reg(branch_regs[i].regmap,CCREG);
5826       if(cc==-1) {
5827         emit_loadreg(CCREG,cc=HOST_CCREG);
5828         // CHECK: Is the following instruction (fall thru) allocated ok?
5829       }
5830       assert(cc==HOST_CCREG);
5831       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5832       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5833       assem_debug("cycle count (adj)\n");
5834       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5835       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5836       if(internal)
5837         assem_debug("branch: internal\n");
5838       else
5839         assem_debug("branch: external\n");
5840       if(internal&&is_ds[(ba[i]-start)>>2]) {
5841         ds_assemble_entry(i);
5842       }
5843       else {
5844         add_to_linker((int)out,ba[i],internal);
5845         emit_jmp(0);
5846       }
5847     }
5848     // branch not taken
5849     cop1_usable=prev_cop1_usable;
5850     if(!unconditional) {
5851       if(nottaken1) set_jump_target(nottaken1,(int)out);
5852       set_jump_target(nottaken,(int)out);
5853       assem_debug("2:\n");
5854       if(!likely[i]) {
5855         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5856                       ds_unneeded,ds_unneeded_upper);
5857         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5858         address_generation(i+1,&branch_regs[i],0);
5859         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5860         ds_assemble(i+1,&branch_regs[i]);
5861       }
5862       cc=get_reg(branch_regs[i].regmap,CCREG);
5863       if(cc==-1&&!likely[i]) {
5864         // Cycle count isn't in a register, temporarily load it then write it out
5865         emit_loadreg(CCREG,HOST_CCREG);
5866         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5867         int jaddr=(int)out;
5868         emit_jns(0);
5869         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5870         emit_storereg(CCREG,HOST_CCREG);
5871       }
5872       else{
5873         cc=get_reg(i_regmap,CCREG);
5874         assert(cc==HOST_CCREG);
5875         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5876         int jaddr=(int)out;
5877         emit_jns(0);
5878         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5879       }
5880     }
5881   }
5882 }
5883
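// REGIMM branches: BLTZ/BGEZ, their likely forms, and the linking forms
// BLTZAL/BGEZAL(L).  With rs1 == $zero the outcome is known at compile
// time: BGEZ-type is always taken, BLTZ-type never taken.  For the linking
// forms, $ra is written with PC+8 whether or not the branch is taken.
// Otherwise the structure mirrors cjump_assemble.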
5884 void sjump_assemble(int i,struct regstat *i_regs)
5885 {
5886   signed char *i_regmap=i_regs->regmap;
5887   int cc;
5888   int match;
5889   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5890   assem_debug("smatch=%d\n",match);
5891   int s1h,s1l;
5892   int prev_cop1_usable=cop1_usable;
5893   int unconditional=0,nevertaken=0;
5894   int only32=0;
5895   int invert=0;
5896   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5897   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5898   if(!match) invert=1;
5899   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5900   if(i>(ba[i]-start)>>2) invert=1;
5901   #endif
5902
5903   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5904   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5905
5906   if(ooo[i]) {
5907     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5908     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5909   }
5910   else {
5911     s1l=get_reg(i_regmap,rs1[i]);
5912     s1h=get_reg(i_regmap,rs1[i]|64);
5913   }
5914   if(rs1[i]==0)
5915   {
5916     if(opcode2[i]&1) unconditional=1;
5917     else nevertaken=1;
5918     // r0 is never less than zero, so BLTZ-type is never taken and BGEZ-type is always taken
5919     //assert(opcode2[i]!=0);
5920     //assert(opcode2[i]!=2);
5921     //assert(opcode2[i]!=0x10);
5922     //assert(opcode2[i]!=0x12);
5923   }
5924   else {
5925     only32=(regs[i].was32>>rs1[i])&1;
5926   }
5927
5928   if(ooo[i]) {
5929     // Out of order execution (delay slot first)
5930     //printf("OOOE\n");
5931     address_generation(i+1,i_regs,regs[i].regmap_entry);
5932     ds_assemble(i+1,i_regs);
5933     int adj;
5934     uint64_t bc_unneeded=branch_regs[i].u;
5935     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5936     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5937     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5938     bc_unneeded|=1;
5939     bc_unneeded_upper|=1;
5940     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5941                   bc_unneeded,bc_unneeded_upper);
5942     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5943     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5944     if(rt1[i]==31) {
5945       int rt,return_address;
5946       rt=get_reg(branch_regs[i].regmap,31);
5947       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5948       if(rt>=0) {
5949         // Save the PC even if the branch is not taken
5950         return_address=start+i*4+8;
5951         emit_movimm(return_address,rt); // PC into link register
5952         #ifdef IMM_PREFETCH
5953         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5954         #endif
5955       }
5956     }
5957     cc=get_reg(branch_regs[i].regmap,CCREG);
5958     assert(cc==HOST_CCREG);
5959     if(unconditional) 
5960       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5961     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5962     assem_debug("cycle count (adj)\n");
5963     if(unconditional) {
5964       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5965       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5966         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5967         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5968         if(internal)
5969           assem_debug("branch: internal\n");
5970         else
5971           assem_debug("branch: external\n");
5972         if(internal&&is_ds[(ba[i]-start)>>2]) {
5973           ds_assemble_entry(i);
5974         }
5975         else {
5976           add_to_linker((int)out,ba[i],internal);
5977           emit_jmp(0);
5978         }
5979         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5980         if(((u_int)out)&7) emit_addnop(0);
5981         #endif
5982       }
5983     }
5984     else if(nevertaken) {
5985       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5986       int jaddr=(int)out;
5987       emit_jns(0);
5988       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5989     }
5990     else {
5991       int nottaken=0;
5992       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5993       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5994       if(!only32)
5995       {
5996         assert(s1h>=0);
5997         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5998         {
5999           emit_test(s1h,s1h);
6000           if(invert){
6001             nottaken=(int)out;
6002             emit_jns(1);
6003           }else{
6004             add_to_linker((int)out,ba[i],internal);
6005             emit_js(0);
6006           }
6007         }
6008         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
6009         {
6010           emit_test(s1h,s1h);
6011           if(invert){
6012             nottaken=(int)out;
6013             emit_js(1);
6014           }else{
6015             add_to_linker((int)out,ba[i],internal);
6016             emit_jns(0);
6017           }
6018         }
6019       } // if(!only32)
6020       else
6021       {
6022         assert(s1l>=0);
6023         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
6024         {
6025           emit_test(s1l,s1l);
6026           if(invert){
6027             nottaken=(int)out;
6028             emit_jns(1);
6029           }else{
6030             add_to_linker((int)out,ba[i],internal);
6031             emit_js(0);
6032           }
6033         }
6034         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
6035         {
6036           emit_test(s1l,s1l);
6037           if(invert){
6038             nottaken=(int)out;
6039             emit_js(1);
6040           }else{
6041             add_to_linker((int)out,ba[i],internal);
6042             emit_jns(0);
6043           }
6044         }
6045       } // if(!only32)
6046           
6047       if(invert) {
6048         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6049         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
6050           if(adj) {
6051             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
6052             add_to_linker((int)out,ba[i],internal);
6053           }else{
6054             emit_addnop(13);
6055             add_to_linker((int)out,ba[i],internal*2);
6056           }
6057           emit_jmp(0);
6058         }else
6059         #endif
6060         {
6061           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
6062           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6063           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6064           if(internal)
6065             assem_debug("branch: internal\n");
6066           else
6067             assem_debug("branch: external\n");
6068           if(internal&&is_ds[(ba[i]-start)>>2]) {
6069             ds_assemble_entry(i);
6070           }
6071           else {
6072             add_to_linker((int)out,ba[i],internal);
6073             emit_jmp(0);
6074           }
6075         }
6076         set_jump_target(nottaken,(int)out);
6077       }
6078
6079       if(adj) {
6080         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
6081       }
6082     } // (!unconditional)
6083   } // if(ooo)
6084   else
6085   {
6086     // In-order execution (branch first)
6087     //printf("IOE\n");
6088     int nottaken=0;
6089     if(rt1[i]==31) {
6090       int rt,return_address;
6091       rt=get_reg(branch_regs[i].regmap,31);
6092       if(rt>=0) {
6093         // Save the PC even if the branch is not taken
6094         return_address=start+i*4+8;
6095         emit_movimm(return_address,rt); // PC into link register
6096         #ifdef IMM_PREFETCH
6097         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
6098         #endif
6099       }
6100     }
6101     if(!unconditional) {
6102       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6103       if(!only32)
6104       {
6105         assert(s1h>=0);
6106         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
6107         {
6108           emit_test(s1h,s1h);
6109           nottaken=(int)out;
6110           emit_jns(1);
6111         }
6112         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6113         {
6114           emit_test(s1h,s1h);
6115           nottaken=(int)out;
6116           emit_js(1);
6117         }
6118       } // if(!only32)
6119       else
6120       {
6121         assert(s1l>=0);
6122         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
6123         {
6124           emit_test(s1l,s1l);
6125           nottaken=(int)out;
6126           emit_jns(1);
6127         }
6128         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6129         {
6130           emit_test(s1l,s1l);
6131           nottaken=(int)out;
6132           emit_js(1);
6133         }
6134       }
6135     } // if(!unconditional)
6136     int adj;
6137     uint64_t ds_unneeded=branch_regs[i].u;
6138     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6139     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6140     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6141     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6142     ds_unneeded|=1;
6143     ds_unneeded_upper|=1;
6144     // branch taken
6145     if(!nevertaken) {
6146       //assem_debug("1:\n");
6147       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6148                     ds_unneeded,ds_unneeded_upper);
6149       // load regs
6150       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6151       address_generation(i+1,&branch_regs[i],0);
6152       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6153       ds_assemble(i+1,&branch_regs[i]);
6154       cc=get_reg(branch_regs[i].regmap,CCREG);
6155       if(cc==-1) {
6156         emit_loadreg(CCREG,cc=HOST_CCREG);
6157         // CHECK: Is the following instruction (fall thru) allocated ok?
6158       }
6159       assert(cc==HOST_CCREG);
6160       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6161       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6162       assem_debug("cycle count (adj)\n");
6163       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6164       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6165       if(internal)
6166         assem_debug("branch: internal\n");
6167       else
6168         assem_debug("branch: external\n");
6169       if(internal&&is_ds[(ba[i]-start)>>2]) {
6170         ds_assemble_entry(i);
6171       }
6172       else {
6173         add_to_linker((int)out,ba[i],internal);
6174         emit_jmp(0);
6175       }
6176     }
6177     // branch not taken
6178     cop1_usable=prev_cop1_usable;
6179     if(!unconditional) {
6180       set_jump_target(nottaken,(int)out);
6181       assem_debug("1:\n");
6182       if(!likely[i]) {
6183         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6184                       ds_unneeded,ds_unneeded_upper);
6185         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6186         address_generation(i+1,&branch_regs[i],0);
6187         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6188         ds_assemble(i+1,&branch_regs[i]);
6189       }
6190       cc=get_reg(branch_regs[i].regmap,CCREG);
6191       if(cc==-1&&!likely[i]) {
6192         // Cycle count isn't in a register, temporarily load it then write it out
6193         emit_loadreg(CCREG,HOST_CCREG);
6194         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
6195         int jaddr=(int)out;
6196         emit_jns(0);
6197         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6198         emit_storereg(CCREG,HOST_CCREG);
6199       }
6200       else{
6201         cc=get_reg(i_regmap,CCREG);
6202         assert(cc==HOST_CCREG);
6203         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
6204         int jaddr=(int)out;
6205         emit_jns(0);
6206         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6207       }
6208     }
6209   }
6210 }
6211
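// BC1T/BC1F (coprocessor 1 condition branches).  If cop1 hasn't been
// verified yet in this block, a test of the CU1 enable bit (0x20000000 in
// the status copy held in CSREG) is emitted first, with an FP_STUB for the
// unusable case.  The branch itself tests bit 23 (the FP condition flag)
// of the register held in FSREG.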
6212 void fjump_assemble(int i,struct regstat *i_regs)
6213 {
6214   signed char *i_regmap=i_regs->regmap;
6215   int cc;
6216   int match;
6217   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6218   assem_debug("fmatch=%d\n",match);
6219   int fs,cs;
6220   int eaddr;
6221   int invert=0;
6222   int internal=internal_branch(branch_regs[i].is32,ba[i]);
6223   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
6224   if(!match) invert=1;
6225   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6226   if(i>(ba[i]-start)>>2) invert=1;
6227   #endif
6228
6229   if(ooo[i]) {
6230     fs=get_reg(branch_regs[i].regmap,FSREG);
6231     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
6232   }
6233   else {
6234     fs=get_reg(i_regmap,FSREG);
6235   }
6236
6237   // Check cop1 unusable
6238   if(!cop1_usable) {
6239     cs=get_reg(i_regmap,CSREG);
6240     assert(cs>=0);
6241     emit_testimm(cs,0x20000000);
6242     eaddr=(int)out;
6243     emit_jeq(0);
6244     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
6245     cop1_usable=1;
6246   }
6247
6248   if(ooo[i]) {
6249     // Out of order execution (delay slot first)
6250     //printf("OOOE\n");
6251     ds_assemble(i+1,i_regs);
6252     int adj;
6253     uint64_t bc_unneeded=branch_regs[i].u;
6254     uint64_t bc_unneeded_upper=branch_regs[i].uu;
6255     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6256     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
6257     bc_unneeded|=1;
6258     bc_unneeded_upper|=1;
6259     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6260                   bc_unneeded,bc_unneeded_upper);
6261     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
6262     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6263     cc=get_reg(branch_regs[i].regmap,CCREG);
6264     assert(cc==HOST_CCREG);
6265     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
6266     assem_debug("cycle count (adj)\n");
6267     if(1) {
6268       int nottaken=0;
6269       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6270       if(1) {
6271         assert(fs>=0);
6272         emit_testimm(fs,0x800000);
6273         if(source[i]&0x10000) // BC1T
6274         {
6275           if(invert){
6276             nottaken=(int)out;
6277             emit_jeq(1);
6278           }else{
6279             add_to_linker((int)out,ba[i],internal);
6280             emit_jne(0);
6281           }
6282         }
6283         else // BC1F
6284         {
6285           if(invert){
6286             nottaken=(int)out;
6287             emit_jne(1);
6288           }else{
6289             add_to_linker((int)out,ba[i],internal);
6290             emit_jeq(0);
6291           }
6292         }
6293       } // if(!only32)
6294           
6295       if(invert) {
6296         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
6297         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6298         else if(match) emit_addnop(13);
6299         #endif
6300         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6301         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6302         if(internal)
6303           assem_debug("branch: internal\n");
6304         else
6305           assem_debug("branch: external\n");
6306         if(internal&&is_ds[(ba[i]-start)>>2]) {
6307           ds_assemble_entry(i);
6308         }
6309         else {
6310           add_to_linker((int)out,ba[i],internal);
6311           emit_jmp(0);
6312         }
6313         set_jump_target(nottaken,(int)out);
6314       }
6315
6316       if(adj) {
6317         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
6318       }
6319     } // (!unconditional)
6320   } // if(ooo)
6321   else
6322   {
6323     // In-order execution (branch first)
6324     //printf("IOE\n");
6325     int nottaken=0;
6326     if(1) {
6327       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6328       if(1) {
6329         assert(fs>=0);
6330         emit_testimm(fs,0x800000);
6331         if(source[i]&0x10000) // BC1T
6332         {
6333           nottaken=(int)out;
6334           emit_jeq(1);
6335         }
6336         else // BC1F
6337         {
6338           nottaken=(int)out;
6339           emit_jne(1);
6340         }
6341       }
6342     } // if(!unconditional)
6343     int adj;
6344     uint64_t ds_unneeded=branch_regs[i].u;
6345     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6346     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6347     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6348     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6349     ds_unneeded|=1;
6350     ds_unneeded_upper|=1;
6351     // branch taken
6352     //assem_debug("1:\n");
6353     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6354                   ds_unneeded,ds_unneeded_upper);
6355     // load regs
6356     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6357     address_generation(i+1,&branch_regs[i],0);
6358     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6359     ds_assemble(i+1,&branch_regs[i]);
6360     cc=get_reg(branch_regs[i].regmap,CCREG);
6361     if(cc==-1) {
6362       emit_loadreg(CCREG,cc=HOST_CCREG);
6363       // CHECK: Is the following instruction (fall thru) allocated ok?
6364     }
6365     assert(cc==HOST_CCREG);
6366     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6367     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6368     assem_debug("cycle count (adj)\n");
6369     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6370     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6371     if(internal)
6372       assem_debug("branch: internal\n");
6373     else
6374       assem_debug("branch: external\n");
6375     if(internal&&is_ds[(ba[i]-start)>>2]) {
6376       ds_assemble_entry(i);
6377     }
6378     else {
6379       add_to_linker((int)out,ba[i],internal);
6380       emit_jmp(0);
6381     }
6382
6383     // branch not taken
6384     if(1) { // <- FIXME (don't need this)
6385       set_jump_target(nottaken,(int)out);
6386       assem_debug("1:\n");
6387       if(!likely[i]) {
6388         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6389                       ds_unneeded,ds_unneeded_upper);
6390         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6391         address_generation(i+1,&branch_regs[i],0);
6392         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6393         ds_assemble(i+1,&branch_regs[i]);
6394       }
6395       cc=get_reg(branch_regs[i].regmap,CCREG);
6396       if(cc==-1&&!likely[i]) {
6397         // Cycle count isn't in a register, temporarily load it then write it out
6398         emit_loadreg(CCREG,HOST_CCREG);
6399         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
6400         int jaddr=(int)out;
6401         emit_jns(0);
6402         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6403         emit_storereg(CCREG,HOST_CCREG);
6404       }
6405       else{
6406         cc=get_reg(i_regmap,CCREG);
6407         assert(cc==HOST_CCREG);
6408         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
6409         int jaddr=(int)out;
6410         emit_jns(0);
6411         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6412       }
6413     }
6414   }
6415 }
6416
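// Branch in the last word of a page, with its delay slot on the next page.
// The branch target (or the fall-through address) is computed into
// HOST_BTREG with cmov sequences, dirty registers are written back, and
// control jumps to the next page's delay-slot entry, going through an
// extjump stub if that code isn't compiled yet.  The likely forms get a
// separate not-taken path that skips over the delay slot.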
6417 static void pagespan_assemble(int i,struct regstat *i_regs)
6418 {
6419   int s1l=get_reg(i_regs->regmap,rs1[i]);
6420   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
6421   int s2l=get_reg(i_regs->regmap,rs2[i]);
6422   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
6423   void *nt_branch=NULL;
6424   int taken=0;
6425   int nottaken=0;
6426   int unconditional=0;
6427   if(rs1[i]==0)
6428   {
6429     s1l=s2l;s1h=s2h;
6430     s2l=s2h=-1;
6431   }
6432   else if(rs2[i]==0)
6433   {
6434     s2l=s2h=-1;
6435   }
6436   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
6437     s1h=s2h=-1;
6438   }
6439   int hr=0;
6440   int addr,alt,ntaddr;
6441   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
6442   else {
6443     while(hr<HOST_REGS)
6444     {
6445       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
6446          (i_regs->regmap[hr]&63)!=rs1[i] &&
6447          (i_regs->regmap[hr]&63)!=rs2[i] )
6448       {
6449         addr=hr++;break;
6450       }
6451       hr++;
6452     }
6453   }
6454   while(hr<HOST_REGS)
6455   {
6456     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6457        (i_regs->regmap[hr]&63)!=rs1[i] &&
6458        (i_regs->regmap[hr]&63)!=rs2[i] )
6459     {
6460       alt=hr++;break;
6461     }
6462     hr++;
6463   }
6464   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
6465   {
6466     while(hr<HOST_REGS)
6467     {
6468       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6469          (i_regs->regmap[hr]&63)!=rs1[i] &&
6470          (i_regs->regmap[hr]&63)!=rs2[i] )
6471       {
6472         ntaddr=hr;break;
6473       }
6474       hr++;
6475     }
6476   }
6477   assert(hr<HOST_REGS);
6478   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6479     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6480   }
6481   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
6482   if(opcode[i]==2) // J
6483   {
6484     unconditional=1;
6485   }
6486   if(opcode[i]==3) // JAL
6487   {
6488     // TODO: mini_ht
6489     int rt=get_reg(i_regs->regmap,31);
6490     emit_movimm(start+i*4+8,rt);
6491     unconditional=1;
6492   }
6493   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6494   {
6495     emit_mov(s1l,addr);
6496     if(opcode2[i]==9) // JALR
6497     {
6498       int rt=get_reg(i_regs->regmap,rt1[i]);
6499       emit_movimm(start+i*4+8,rt);
6500     }
6501   }
6502   if((opcode[i]&0x3f)==4) // BEQ
6503   {
6504     if(rs1[i]==rs2[i])
6505     {
6506       unconditional=1;
6507     }
6508     else
6509     #ifdef HAVE_CMOV_IMM
6510     if(s1h<0) {
6511       if(s2l>=0) emit_cmp(s1l,s2l);
6512       else emit_test(s1l,s1l);
6513       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6514     }
6515     else
6516     #endif
6517     {
6518       assert(s1l>=0);
6519       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6520       if(s1h>=0) {
6521         if(s2h>=0) emit_cmp(s1h,s2h);
6522         else emit_test(s1h,s1h);
6523         emit_cmovne_reg(alt,addr);
6524       }
6525       if(s2l>=0) emit_cmp(s1l,s2l);
6526       else emit_test(s1l,s1l);
6527       emit_cmovne_reg(alt,addr);
6528     }
6529   }
6530   if((opcode[i]&0x3f)==5) // BNE
6531   {
6532     #ifdef HAVE_CMOV_IMM
6533     if(s1h<0) {
6534       if(s2l>=0) emit_cmp(s1l,s2l);
6535       else emit_test(s1l,s1l);
6536       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6537     }
6538     else
6539     #endif
6540     {
6541       assert(s1l>=0);
6542       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6543       if(s1h>=0) {
6544         if(s2h>=0) emit_cmp(s1h,s2h);
6545         else emit_test(s1h,s1h);
6546         emit_cmovne_reg(alt,addr);
6547       }
6548       if(s2l>=0) emit_cmp(s1l,s2l);
6549       else emit_test(s1l,s1l);
6550       emit_cmovne_reg(alt,addr);
6551     }
6552   }
6553   if((opcode[i]&0x3f)==0x14) // BEQL
6554   {
6555     if(s1h>=0) {
6556       if(s2h>=0) emit_cmp(s1h,s2h);
6557       else emit_test(s1h,s1h);
6558       nottaken=(int)out;
6559       emit_jne(0);
6560     }
6561     if(s2l>=0) emit_cmp(s1l,s2l);
6562     else emit_test(s1l,s1l);
6563     if(nottaken) set_jump_target(nottaken,(int)out);
6564     nottaken=(int)out;
6565     emit_jne(0);
6566   }
6567   if((opcode[i]&0x3f)==0x15) // BNEL
6568   {
6569     if(s1h>=0) {
6570       if(s2h>=0) emit_cmp(s1h,s2h);
6571       else emit_test(s1h,s1h);
6572       taken=(int)out;
6573       emit_jne(0);
6574     }
6575     if(s2l>=0) emit_cmp(s1l,s2l);
6576     else emit_test(s1l,s1l);
6577     nottaken=(int)out;
6578     emit_jeq(0);
6579     if(taken) set_jump_target(taken,(int)out);
6580   }
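  // BLEZ/BGTZ: cmpimm(s1l,1) plus cmovl selects on the signed low word
  // (s1l<=0); when an upper half exists, its zero and sign tests adjust the
  // choice for 64-bit values.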
6581   if((opcode[i]&0x3f)==6) // BLEZ
6582   {
6583     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6584     emit_cmpimm(s1l,1);
6585     if(s1h>=0) emit_mov(addr,ntaddr);
6586     emit_cmovl_reg(alt,addr);
6587     if(s1h>=0) {
6588       emit_test(s1h,s1h);
6589       emit_cmovne_reg(ntaddr,addr);
6590       emit_cmovs_reg(alt,addr);
6591     }
6592   }
6593   if((opcode[i]&0x3f)==7) // BGTZ
6594   {
6595     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6596     emit_cmpimm(s1l,1);
6597     if(s1h>=0) emit_mov(addr,alt);
6598     emit_cmovl_reg(ntaddr,addr);
6599     if(s1h>=0) {
6600       emit_test(s1h,s1h);
6601       emit_cmovne_reg(alt,addr);
6602       emit_cmovs_reg(ntaddr,addr);
6603     }
6604   }
6605   if((opcode[i]&0x3f)==0x16) // BLEZL
6606   {
6607     assert((opcode[i]&0x3f)!=0x16);
6608   }
6609   if((opcode[i]&0x3f)==0x17) // BGTZL
6610   {
6611     assert((opcode[i]&0x3f)!=0x17);
6612   }
6613   assert(opcode[i]!=1); // BLTZ/BGEZ
6614
6615   //FIXME: Check CSREG
6616   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6617     if((source[i]&0x30000)==0) // BC1F
6618     {
6619       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6620       emit_testimm(s1l,0x800000);
6621       emit_cmovne_reg(alt,addr);
6622     }
6623     if((source[i]&0x30000)==0x10000) // BC1T
6624     {
6625       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6626       emit_testimm(s1l,0x800000);
6627       emit_cmovne_reg(alt,addr);
6628     }
6629     if((source[i]&0x30000)==0x20000) // BC1FL
6630     {
6631       emit_testimm(s1l,0x800000);
6632       nottaken=(int)out;
6633       emit_jne(0);
6634     }
6635     if((source[i]&0x30000)==0x30000) // BC1TL
6636     {
6637       emit_testimm(s1l,0x800000);
6638       nottaken=(int)out;
6639       emit_jeq(0);
6640     }
6641   }
6642
6643   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6644   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6645   if(likely[i]||unconditional)
6646   {
6647     emit_movimm(ba[i],HOST_BTREG);
6648   }
6649   else if(addr!=HOST_BTREG)
6650   {
6651     emit_mov(addr,HOST_BTREG);
6652   }
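  // Jump to the page-spanning delay slot: emit a placeholder jump and an
  // external-jump stub behind it.  target_addr is the delay slot address
  // plus one, matching the start+1 vaddr that pagespan_ds registers for the
  // next block, so the link resolves to the specially assembled delay slot
  // rather than a normal block entry.  If that code already exists, link to
  // it directly.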
6653   void *branch_addr=out;
6654   emit_jmp(0);
6655   int target_addr=start+i*4+5;
6656   void *stub=out;
6657   void *compiled_target_addr=check_addr(target_addr);
6658   emit_extjump_ds((int)branch_addr,target_addr);
6659   if(compiled_target_addr) {
6660     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6661     add_link(target_addr,stub);
6662   }
6663   else set_jump_target((int)branch_addr,(int)stub);
6664   if(likely[i]) {
6665     // Not-taken path
6666     set_jump_target((int)nottaken,(int)out);
6667     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6668     void *branch_addr=out;
6669     emit_jmp(0);
6670     int target_addr=start+i*4+8;
6671     void *stub=out;
6672     void *compiled_target_addr=check_addr(target_addr);
6673     emit_extjump_ds((int)branch_addr,target_addr);
6674     if(compiled_target_addr) {
6675       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6676       add_link(target_addr,stub);
6677     }
6678     else set_jump_target((int)branch_addr,(int)stub);
6679   }
6680 }
6681
6682 // Assemble the delay slot for the above
6683 static void pagespan_ds()
6684 {
6685   assem_debug("initial delay slot:\n");
6686   u_int vaddr=start+1;
6687   u_int page=get_page(vaddr);
6688   u_int vpage=get_vpage(vaddr);
6689   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6690   do_dirty_stub_ds();
6691   ll_add(jump_in+page,vaddr,(void *)out);
6692   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6693   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6694     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6695   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6696     emit_writeword(HOST_BTREG,(int)&branch_target);
6697   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6698   address_generation(0,&regs[0],regs[0].regmap_entry);
6699   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6700     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6701   cop1_usable=0;
6702   is_delayslot=0;
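  // Assemble the single delay-slot instruction with the same per-type
  // assemblers used by the main compile loop; a jump here would be a branch
  // in a delay slot, which the jump cases below report as a probable bug.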
6703   switch(itype[0]) {
6704     case ALU:
6705       alu_assemble(0,&regs[0]);break;
6706     case IMM16:
6707       imm16_assemble(0,&regs[0]);break;
6708     case SHIFT:
6709       shift_assemble(0,&regs[0]);break;
6710     case SHIFTIMM:
6711       shiftimm_assemble(0,&regs[0]);break;
6712     case LOAD:
6713       load_assemble(0,&regs[0]);break;
6714     case LOADLR:
6715       loadlr_assemble(0,&regs[0]);break;
6716     case STORE:
6717       store_assemble(0,&regs[0]);break;
6718     case STORELR:
6719       storelr_assemble(0,&regs[0]);break;
6720     case COP0:
6721       cop0_assemble(0,&regs[0]);break;
6722     case COP1:
6723       cop1_assemble(0,&regs[0]);break;
6724     case C1LS:
6725       c1ls_assemble(0,&regs[0]);break;
6726     case COP2:
6727       cop2_assemble(0,&regs[0]);break;
6728     case C2LS:
6729       c2ls_assemble(0,&regs[0]);break;
6730     case C2OP:
6731       c2op_assemble(0,&regs[0]);break;
6732     case FCONV:
6733       fconv_assemble(0,&regs[0]);break;
6734     case FLOAT:
6735       float_assemble(0,&regs[0]);break;
6736     case FCOMP:
6737       fcomp_assemble(0,&regs[0]);break;
6738     case MULTDIV:
6739       multdiv_assemble(0,&regs[0]);break;
6740     case MOV:
6741       mov_assemble(0,&regs[0]);break;
6742     case SYSCALL:
6743     case HLECALL:
6744     case INTCALL:
6745     case SPAN:
6746     case UJUMP:
6747     case RJUMP:
6748     case CJUMP:
6749     case SJUMP:
6750     case FJUMP:
6751       printf("Jump in the delay slot.  This is probably a bug.\n");
6752   }
6753   int btaddr=get_reg(regs[0].regmap,BTREG);
6754   if(btaddr<0) {
6755     btaddr=get_reg(regs[0].regmap,-1);
6756     emit_readword((int)&branch_target,btaddr);
6757   }
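  // If the branch target saved in BTREG is the next sequential address
  // (start+4), fall through into the normally compiled code; otherwise
  // dispatch through jump_vaddr_reg with the target in btaddr.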
6758   assert(btaddr!=HOST_CCREG);
6759   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6760 #ifdef HOST_IMM8
6761   emit_movimm(start+4,HOST_TEMPREG);
6762   emit_cmp(btaddr,HOST_TEMPREG);
6763 #else
6764   emit_cmpimm(btaddr,start+4);
6765 #endif
6766   int branch=(int)out;
6767   emit_jeq(0);
6768   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6769   emit_jmp(jump_vaddr_reg[btaddr]);
6770   set_jump_target(branch,(int)out);
6771   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6772   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6773 }
6774
6775 // Basic liveness analysis for MIPS registers
6776 void unneeded_registers(int istart,int iend,int r)
6777 {
6778   int i;
6779   uint64_t u,uu,gte_u,b,bu,gte_bu;
6780   uint64_t temp_u,temp_uu,temp_gte_u=0;
6781   uint64_t tdep;
6782   uint64_t gte_u_unknown=0;
6783   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
6784     gte_u_unknown=~0ll;
6785   if(iend==slen-1) {
6786     u=1;uu=1;
6787     gte_u=gte_u_unknown;
6788   }else{
6789     u=unneeded_reg[iend+1];
6790     uu=unneeded_reg_upper[iend+1];
6791     u=1;uu=1;
6792     gte_u=gte_unneeded[iend+1];
6793   }
6794
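  // Bit r of u marks MIPS register r as unneeded: its current value is never
  // read again before being overwritten.  uu is the same for the upper 32
  // bits and gte_u for the GTE data/control regs.  The seed at the end of
  // the range is conservative (only r0, bit 0, is assumed unneeded).
  // Example: scanning back past "jr ra" with a "lui a0,0x1f80" delay slot
  // leaves bit 4 (a0) set, since a0 is written there without being read.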
6795   for (i=iend;i>=istart;i--)
6796   {
6797     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6798     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6799     {
6800       // If subroutine call, flag return address as a possible branch target
6801       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6802       
6803       if(ba[i]<start || ba[i]>=(start+slen*4))
6804       {
6805         // Branch out of this block, flush all regs
6806         u=1;
6807         uu=1;
6808         gte_u=gte_u_unknown;
6809         /* Hexagon hack 
6810         if(itype[i]==UJUMP&&rt1[i]==31)
6811         {
6812           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6813         }
6814         if(itype[i]==RJUMP&&rs1[i]==31)
6815         {
6816           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6817         }
6818         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6819           if(itype[i]==UJUMP&&rt1[i]==31)
6820           {
6821             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6822             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6823           }
6824           if(itype[i]==RJUMP&&rs1[i]==31)
6825           {
6826             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6827             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6828           }
6829         }*/
6830         branch_unneeded_reg[i]=u;
6831         branch_unneeded_reg_upper[i]=uu;
6832         // Merge in delay slot
6833         tdep=(~uu>>rt1[i+1])&1;
6834         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6835         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6836         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6837         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6838         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6839         u|=1;uu|=1;
6840         gte_u|=gte_rt[i+1];
6841         gte_u&=~gte_rs[i+1];
6842         // If branch is "likely" (and conditional)
6843         // then we skip the delay slot on the fall-thru path
6844         if(likely[i]) {
6845           if(i<slen-1) {
6846             u&=unneeded_reg[i+2];
6847             uu&=unneeded_reg_upper[i+2];
6848             gte_u&=gte_unneeded[i+2];
6849           }
6850           else
6851           {
6852             u=1;
6853             uu=1;
6854             gte_u=gte_u_unknown;
6855           }
6856         }
6857       }
6858       else
6859       {
6860         // Internal branch, flag target
6861         bt[(ba[i]-start)>>2]=1;
6862         if(ba[i]<=start+i*4) {
6863           // Backward branch
6864           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6865           {
6866             // Unconditional branch
6867             temp_u=1;temp_uu=1;
6868             temp_gte_u=0;
6869           } else {
6870             // Conditional branch (not taken case)
6871             temp_u=unneeded_reg[i+2];
6872             temp_uu=unneeded_reg_upper[i+2];
6873             temp_gte_u&=gte_unneeded[i+2];
6874           }
6875           // Merge in delay slot
6876           tdep=(~temp_uu>>rt1[i+1])&1;
6877           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6878           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6879           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6880           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6881           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6882           temp_u|=1;temp_uu|=1;
6883           temp_gte_u|=gte_rt[i+1];
6884           temp_gte_u&=~gte_rs[i+1];
6885           // If branch is "likely" (and conditional)
6886           // then we skip the delay slot on the fall-thru path
6887           if(likely[i]) {
6888             if(i<slen-1) {
6889               temp_u&=unneeded_reg[i+2];
6890               temp_uu&=unneeded_reg_upper[i+2];
6891               temp_gte_u&=gte_unneeded[i+2];
6892             }
6893             else
6894             {
6895               temp_u=1;
6896               temp_uu=1;
6897               temp_gte_u=gte_u_unknown;
6898             }
6899           }
6900           tdep=(~temp_uu>>rt1[i])&1;
6901           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6902           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6903           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6904           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6905           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6906           temp_u|=1;temp_uu|=1;
6907           temp_gte_u|=gte_rt[i];
6908           temp_gte_u&=~gte_rs[i];
6909           unneeded_reg[i]=temp_u;
6910           unneeded_reg_upper[i]=temp_uu;
6911           gte_unneeded[i]=temp_gte_u;
6912           // Only go three levels deep.  This recursion can take an
6913           // excessive amount of time if there are a lot of nested loops.
6914           if(r<2) {
6915             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6916           }else{
6917             unneeded_reg[(ba[i]-start)>>2]=1;
6918             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6919             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
6920           }
6921         } /*else*/ if(1) {
6922           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6923           {
6924             // Unconditional branch
6925             u=unneeded_reg[(ba[i]-start)>>2];
6926             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6927             gte_u=gte_unneeded[(ba[i]-start)>>2];
6928             branch_unneeded_reg[i]=u;
6929             branch_unneeded_reg_upper[i]=uu;
6930         //u=1;
6931         //uu=1;
6932         //branch_unneeded_reg[i]=u;
6933         //branch_unneeded_reg_upper[i]=uu;
6934             // Merge in delay slot
6935             tdep=(~uu>>rt1[i+1])&1;
6936             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6937             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6938             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6939             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6940             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6941             u|=1;uu|=1;
6942             gte_u|=gte_rt[i+1];
6943             gte_u&=~gte_rs[i+1];
6944           } else {
6945             // Conditional branch
6946             b=unneeded_reg[(ba[i]-start)>>2];
6947             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6948             gte_bu=gte_unneeded[(ba[i]-start)>>2];
6949             branch_unneeded_reg[i]=b;
6950             branch_unneeded_reg_upper[i]=bu;
6951         //b=1;
6952         //bu=1;
6953         //branch_unneeded_reg[i]=b;
6954         //branch_unneeded_reg_upper[i]=bu;
6955             // Branch delay slot
6956             tdep=(~uu>>rt1[i+1])&1;
6957             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6958             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6959             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6960             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6961             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6962             b|=1;bu|=1;
6963             gte_bu|=gte_rt[i+1];
6964             gte_bu&=~gte_rs[i+1];
6965             // If branch is "likely" then we skip the
6966             // delay slot on the fall-thru path
6967             if(likely[i]) {
6968               u=b;
6969               uu=bu;
6970               gte_u=gte_bu;
6971               if(i<slen-1) {
6972                 u&=unneeded_reg[i+2];
6973                 uu&=unneeded_reg_upper[i+2];
6974                 gte_u&=gte_unneeded[i+2];
6975         //u=1;
6976         //uu=1;
6977               }
6978             } else {
6979               u&=b;
6980               uu&=bu;
6981               gte_u&=gte_bu;
6982         //u=1;
6983         //uu=1;
6984             }
6985             if(i<slen-1) {
6986               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6987               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6988         //branch_unneeded_reg[i]=1;
6989         //branch_unneeded_reg_upper[i]=1;
6990             } else {
6991               branch_unneeded_reg[i]=1;
6992               branch_unneeded_reg_upper[i]=1;
6993             }
6994           }
6995         }
6996       }
6997     }
6998     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6999     {
7000       // SYSCALL instruction (software interrupt)
7001       u=1;
7002       uu=1;
7003     }
7004     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7005     {
7006       // ERET instruction (return from interrupt)
7007       u=1;
7008       uu=1;
7009     }
7010     //u=uu=1; // DEBUG
7011     tdep=(~uu>>rt1[i])&1;
7012     // Written registers are unneeded
7013     u|=1LL<<rt1[i];
7014     u|=1LL<<rt2[i];
7015     uu|=1LL<<rt1[i];
7016     uu|=1LL<<rt2[i];
7017     gte_u|=gte_rt[i];
7018     // Accessed registers are needed
7019     u&=~(1LL<<rs1[i]);
7020     u&=~(1LL<<rs2[i]);
7021     uu&=~(1LL<<us1[i]);
7022     uu&=~(1LL<<us2[i]);
7023     gte_u&=~gte_rs[i];
7024     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
7025       gte_u|=gte_rs[i];  // MFC2/CFC2 to dead register, unneeded
7026     // Source-target dependencies
7027     uu&=~(tdep<<dep1[i]);
7028     uu&=~(tdep<<dep2[i]);
7029     // R0 is always unneeded
7030     u|=1;uu|=1;
7031     // Save it
7032     unneeded_reg[i]=u;
7033     unneeded_reg_upper[i]=uu;
7034     gte_unneeded[i]=gte_u;
7035     /*
7036     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
7037     printf("U:");
7038     int r;
7039     for(r=1;r<=CCREG;r++) {
7040       if((unneeded_reg[i]>>r)&1) {
7041         if(r==HIREG) printf(" HI");
7042         else if(r==LOREG) printf(" LO");
7043         else printf(" r%d",r);
7044       }
7045     }
7046     printf(" UU:");
7047     for(r=1;r<=CCREG;r++) {
7048       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
7049         if(r==HIREG) printf(" HI");
7050         else if(r==LOREG) printf(" LO");
7051         else printf(" r%d",r);
7052       }
7053     }
7054     printf("\n");*/
7055   }
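  // 32-bit-only build (FORCE32): upper-half liveness is meaningless, so mark
  // every upper half as unneeded (-1LL sets all bits).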
7056 #ifdef FORCE32
7057   for (i=iend;i>=istart;i--)
7058   {
7059     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
7060   }
7061 #endif
7062 }
7063
7064 // Identify registers which are likely to contain 32-bit values
7065 // This is used to predict whether any branches will jump to a
7066 // location with 64-bit values in registers.
7067 static void provisional_32bit()
7068 {
7069   int i,j;
7070   uint64_t is32=1;
7071   uint64_t lastbranch=1;
7072   
7073   for(i=0;i<slen;i++)
7074   {
7075     if(i>0) {
7076       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
7077         if(i>1) is32=lastbranch;
7078         else is32=1;
7079       }
7080     }
7081     if(i>1)
7082     {
7083       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
7084         if(likely[i-2]) {
7085           if(i>2) is32=lastbranch;
7086           else is32=1;
7087         }
7088       }
7089       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
7090       {
7091         if(rs1[i-2]==0||rs2[i-2]==0)
7092         {
7093           if(rs1[i-2]) {
7094             is32|=1LL<<rs1[i-2];
7095           }
7096           if(rs2[i-2]) {
7097             is32|=1LL<<rs2[i-2];
7098           }
7099         }
7100       }
7101     }
7102     // If something jumps here with 64-bit values
7103     // then promote those registers to 64 bits
7104     if(bt[i])
7105     {
7106       uint64_t temp_is32=is32;
7107       for(j=i-1;j>=0;j--)
7108       {
7109         if(ba[j]==start+i*4) 
7110           //temp_is32&=branch_regs[j].is32;
7111           temp_is32&=p32[j];
7112       }
7113       for(j=i;j<slen;j++)
7114       {
7115         if(ba[j]==start+i*4) 
7116           temp_is32=1;
7117       }
7118       is32=temp_is32;
7119     }
7120     int type=itype[i];
7121     int op=opcode[i];
7122     int op2=opcode2[i];
7123     int rt=rt1[i];
7124     int s1=rs1[i];
7125     int s2=rs2[i];
7126     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7127       // Branches don't write registers, consider the delay slot instead.
7128       type=itype[i+1];
7129       op=opcode[i+1];
7130       op2=opcode2[i+1];
7131       rt=rt1[i+1];
7132       s1=rs1[i+1];
7133       s2=rs2[i+1];
7134       lastbranch=is32;
7135     }
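    // Update is32 for the value written by this instruction (or by its delay
    // slot, substituted above): bit r set means register r holds a
    // sign-extended 32-bit value.  The per-instruction estimate is saved in
    // p32[] below.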
7136     switch(type) {
7137       case LOAD:
7138         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
7139            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
7140           is32&=~(1LL<<rt);
7141         else
7142           is32|=1LL<<rt;
7143         break;
7144       case STORE:
7145       case STORELR:
7146         break;
7147       case LOADLR:
7148         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
7149         if(op==0x22) is32|=1LL<<rt; // LWL
7150         break;
7151       case IMM16:
7152         if (op==0x08||op==0x09|| // ADDI/ADDIU
7153             op==0x0a||op==0x0b|| // SLTI/SLTIU
7154             op==0x0c|| // ANDI
7155             op==0x0f)  // LUI
7156         {
7157           is32|=1LL<<rt;
7158         }
7159         if(op==0x18||op==0x19) { // DADDI/DADDIU
7160           is32&=~(1LL<<rt);
7161           //if(imm[i]==0)
7162           //  is32|=((is32>>s1)&1LL)<<rt;
7163         }
7164         if(op==0x0d||op==0x0e) { // ORI/XORI
7165           uint64_t sr=((is32>>s1)&1LL);
7166           is32&=~(1LL<<rt);
7167           is32|=sr<<rt;
7168         }
7169         break;
7170       case UJUMP:
7171         break;
7172       case RJUMP:
7173         break;
7174       case CJUMP:
7175         break;
7176       case SJUMP:
7177         break;
7178       case FJUMP:
7179         break;
7180       case ALU:
7181         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
7182           is32|=1LL<<rt;
7183         }
7184         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7185           is32|=1LL<<rt;
7186         }
7187         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7188           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
7189           is32&=~(1LL<<rt);
7190           is32|=sr<<rt;
7191         }
7192         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
7193           if(s1==0&&s2==0) {
7194             is32|=1LL<<rt;
7195           }
7196           else if(s2==0) {
7197             uint64_t sr=((is32>>s1)&1LL);
7198             is32&=~(1LL<<rt);
7199             is32|=sr<<rt;
7200           }
7201           else if(s1==0) {
7202             uint64_t sr=((is32>>s2)&1LL);
7203             is32&=~(1LL<<rt);
7204             is32|=sr<<rt;
7205           }
7206           else {
7207             is32&=~(1LL<<rt);
7208           }
7209         }
7210         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
7211           if(s1==0&&s2==0) {
7212             is32|=1LL<<rt;
7213           }
7214           else if(s2==0) {
7215             uint64_t sr=((is32>>s1)&1LL);
7216             is32&=~(1LL<<rt);
7217             is32|=sr<<rt;
7218           }
7219           else {
7220             is32&=~(1LL<<rt);
7221           }
7222         }
7223         break;
7224       case MULTDIV:
7225         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7226           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
7227         }
7228         else {
7229           is32|=(1LL<<HIREG)|(1LL<<LOREG);
7230         }
7231         break;
7232       case MOV:
7233         {
7234           uint64_t sr=((is32>>s1)&1LL);
7235           is32&=~(1LL<<rt);
7236           is32|=sr<<rt;
7237         }
7238         break;
7239       case SHIFT:
7240         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
7241         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
7242         break;
7243       case SHIFTIMM:
7244         is32|=1LL<<rt;
7245         // DSLL/DSRL/DSRA/DSLL32/DSRL32 give 64-bit results; DSRA32 does not, since its result is always a sign-extended 32-bit value
7246         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
7247         break;
7248       case COP0:
7249         if(op2==0) is32|=1LL<<rt; // MFC0
7250         break;
7251       case COP1:
7252       case COP2:
7253         if(op2==0) is32|=1LL<<rt; // MFC1
7254         if(op2==1) is32&=~(1LL<<rt); // DMFC1
7255         if(op2==2) is32|=1LL<<rt; // CFC1
7256         break;
7257       case C1LS:
7258       case C2LS:
7259         break;
7260       case FLOAT:
7261       case FCONV:
7262         break;
7263       case FCOMP:
7264         break;
7265       case C2OP:
7266       case SYSCALL:
7267       case HLECALL:
7268         break;
7269       default:
7270         break;
7271     }
7272     is32|=1;
7273     p32[i]=is32;
7274
7275     if(i>0)
7276     {
7277       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
7278       {
7279         if(rt1[i-1]==31) // JAL/JALR
7280         {
7281           // Subroutine call will return here, don't alloc any registers
7282           is32=1;
7283         }
7284         else if(i+1<slen)
7285         {
7286           // Internal branch will jump here, match registers to caller
7287           is32=0x3FFFFFFFFLL;
7288         }
7289       }
7290     }
7291   }
7292 }
7293
7294 // Identify registers which may be assumed to contain 32-bit values
7295 // and where optimizations will rely on this.
7296 // This is used to determine whether backward branches can safely
7297 // jump to a location with 64-bit values in registers.
7298 static void provisional_r32()
7299 {
7300   u_int r32=0;
7301   int i;
7302   
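  // r32: bit r set means later code relies on MIPS register r holding a
  // 32-bit value at this point (the commented-out requires_32bit[] name);
  // computed in a backward pass and saved into pr32[] per instruction.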
7303   for (i=slen-1;i>=0;i--)
7304   {
7305     int hr;
7306     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7307     {
7308       if(ba[i]<start || ba[i]>=(start+slen*4))
7309       {
7310         // Branch out of this block, don't need anything
7311         r32=0;
7312       }
7313       else
7314       {
7315         // Internal branch
7316         // Need whatever matches the target
7317         // (and doesn't get overwritten by the delay slot instruction)
7318         r32=0;
7319         int t=(ba[i]-start)>>2;
7320         if(ba[i]>start+i*4) {
7321           // Forward branch
7322           //if(!(requires_32bit[t]&~regs[i].was32))
7323           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7324           if(!(pr32[t]&~regs[i].was32))
7325             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7326         }else{
7327           // Backward branch
7328           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
7329             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7330         }
7331       }
7332       // Conditional branch may need registers for following instructions
7333       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7334       {
7335         if(i<slen-2) {
7336           //r32|=requires_32bit[i+2];
7337           r32|=pr32[i+2];
7338           r32&=regs[i].was32;
7339           // Mark this address as a branch target since it may be called
7340           // upon return from interrupt
7341           //bt[i+2]=1;
7342         }
7343       }
7344       // Merge in delay slot
7345       if(!likely[i]) {
7346         // These are overwritten unless the branch is "likely"
7347         // and the delay slot is nullified if not taken
7348         r32&=~(1LL<<rt1[i+1]);
7349         r32&=~(1LL<<rt2[i+1]);
7350       }
7351       // Assume these are needed (delay slot)
7352       if(us1[i+1]>0)
7353       {
7354         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
7355       }
7356       if(us2[i+1]>0)
7357       {
7358         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
7359       }
7360       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
7361       {
7362         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
7363       }
7364       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
7365       {
7366         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
7367       }
7368     }
7369     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7370     {
7371       // SYSCALL instruction (software interrupt)
7372       r32=0;
7373     }
7374     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7375     {
7376       // ERET instruction (return from interrupt)
7377       r32=0;
7378     }
7379     // Check 32 bits
7380     r32&=~(1LL<<rt1[i]);
7381     r32&=~(1LL<<rt2[i]);
7382     if(us1[i]>0)
7383     {
7384       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
7385     }
7386     if(us2[i]>0)
7387     {
7388       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
7389     }
7390     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
7391     {
7392       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
7393     }
7394     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
7395     {
7396       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
7397     }
7398     //requires_32bit[i]=r32;
7399     pr32[i]=r32;
7400     
7401     // Dirty registers which are 32-bit, require 32-bit input
7402     // as they will be written as 32-bit values
7403     for(hr=0;hr<HOST_REGS;hr++)
7404     {
7405       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
7406         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
7407           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
7408             pr32[i]|=1LL<<regs[i].regmap_entry[hr];
7409           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
7410         }
7411       }
7412     }
7413   }
7414 }
7415
7416 // Write back dirty registers as soon as we will no longer modify them,
7417 // so that we don't end up with lots of writes at the branches.
7418 void clean_registers(int istart,int iend,int wr)
7419 {
7420   int i;
7421   int r;
7422   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
7423   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
7424   if(iend==slen-1) {
7425     will_dirty_i=will_dirty_next=0;
7426     wont_dirty_i=wont_dirty_next=0;
7427   }else{
7428     will_dirty_i=will_dirty_next=will_dirty[iend+1];
7429     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
7430   }
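  // Backward pass: will_dirty/wont_dirty are per-host-register bitmasks
  // seeded from the end of the range (or from a branch target) and merged
  // across branches and their delay slots.  On the wr pass they are folded
  // into regs[].dirty / regs[].wasdirty so dirty values get stored once,
  // near their last write, instead of at every branch.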
7431   for (i=iend;i>=istart;i--)
7432   {
7433     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7434     {
7435       if(ba[i]<start || ba[i]>=(start+slen*4))
7436       {
7437         // Branch out of this block, flush all regs
7438         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7439         {
7440           // Unconditional branch
7441           will_dirty_i=0;
7442           wont_dirty_i=0;
7443           // Merge in delay slot (will dirty)
7444           for(r=0;r<HOST_REGS;r++) {
7445             if(r!=EXCLUDE_REG) {
7446               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7447               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7448               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7449               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7450               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7451               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7452               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7453               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7454               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7455               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7456               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7457               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7458               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7459               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7460             }
7461           }
7462         }
7463         else
7464         {
7465           // Conditional branch
7466           will_dirty_i=0;
7467           wont_dirty_i=wont_dirty_next;
7468           // Merge in delay slot (will dirty)
7469           for(r=0;r<HOST_REGS;r++) {
7470             if(r!=EXCLUDE_REG) {
7471               if(!likely[i]) {
7472                 // Might not dirty if likely branch is not taken
7473                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7474                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7475                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7476                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7477                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7478                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
7479                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7480                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7481                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7482                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7483                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7484                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7485                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7486                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7487               }
7488             }
7489           }
7490         }
7491         // Merge in delay slot (wont dirty)
7492         for(r=0;r<HOST_REGS;r++) {
7493           if(r!=EXCLUDE_REG) {
7494             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7495             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7496             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7497             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7498             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7499             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7500             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7501             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7502             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7503             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7504           }
7505         }
7506         if(wr) {
7507           #ifndef DESTRUCTIVE_WRITEBACK
7508           branch_regs[i].dirty&=wont_dirty_i;
7509           #endif
7510           branch_regs[i].dirty|=will_dirty_i;
7511         }
7512       }
7513       else
7514       {
7515         // Internal branch
7516         if(ba[i]<=start+i*4) {
7517           // Backward branch
7518           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7519           {
7520             // Unconditional branch
7521             temp_will_dirty=0;
7522             temp_wont_dirty=0;
7523             // Merge in delay slot (will dirty)
7524             for(r=0;r<HOST_REGS;r++) {
7525               if(r!=EXCLUDE_REG) {
7526                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7527                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7528                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7529                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7530                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7531                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7532                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7533                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7534                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7535                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7536                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7537                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7538                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7539                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7540               }
7541             }
7542           } else {
7543             // Conditional branch (not taken case)
7544             temp_will_dirty=will_dirty_next;
7545             temp_wont_dirty=wont_dirty_next;
7546             // Merge in delay slot (will dirty)
7547             for(r=0;r<HOST_REGS;r++) {
7548               if(r!=EXCLUDE_REG) {
7549                 if(!likely[i]) {
7550                   // Will not dirty if likely branch is not taken
7551                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7552                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7553                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7554                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7555                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7556                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7557                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7558                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7559                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7560                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7561                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7562                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7563                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7564                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7565                 }
7566               }
7567             }
7568           }
7569           // Merge in delay slot (wont dirty)
7570           for(r=0;r<HOST_REGS;r++) {
7571             if(r!=EXCLUDE_REG) {
7572               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7573               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7574               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7575               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7576               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7577               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7578               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7579               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7580               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7581               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7582             }
7583           }
7584           // Deal with changed mappings
7585           if(i<iend) {
7586             for(r=0;r<HOST_REGS;r++) {
7587               if(r!=EXCLUDE_REG) {
7588                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7589                   temp_will_dirty&=~(1<<r);
7590                   temp_wont_dirty&=~(1<<r);
7591                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7592                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7593                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7594                   } else {
7595                     temp_will_dirty|=1<<r;
7596                     temp_wont_dirty|=1<<r;
7597                   }
7598                 }
7599               }
7600             }
7601           }
7602           if(wr) {
7603             will_dirty[i]=temp_will_dirty;
7604             wont_dirty[i]=temp_wont_dirty;
7605             clean_registers((ba[i]-start)>>2,i-1,0);
7606           }else{
7607             // Limit recursion.  It can take an excessive amount
7608             // of time if there are a lot of nested loops.
7609             will_dirty[(ba[i]-start)>>2]=0;
7610             wont_dirty[(ba[i]-start)>>2]=-1;
7611           }
7612         }
7613         /*else*/ if(1)
7614         {
7615           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7616           {
7617             // Unconditional branch
7618             will_dirty_i=0;
7619             wont_dirty_i=0;
7620           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7621             for(r=0;r<HOST_REGS;r++) {
7622               if(r!=EXCLUDE_REG) {
7623                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7624                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7625                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7626                 }
7627                 if(branch_regs[i].regmap[r]>=0) {
7628                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7629                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7630                 }
7631               }
7632             }
7633           //}
7634             // Merge in delay slot
7635             for(r=0;r<HOST_REGS;r++) {
7636               if(r!=EXCLUDE_REG) {
7637                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7638                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7639                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7640                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7641                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7642                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7643                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7644                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7645                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7646                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7647                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7648                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7649                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7650                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7651               }
7652             }
7653           } else {
7654             // Conditional branch
7655             will_dirty_i=will_dirty_next;
7656             wont_dirty_i=wont_dirty_next;
7657           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7658             for(r=0;r<HOST_REGS;r++) {
7659               if(r!=EXCLUDE_REG) {
7660                 signed char target_reg=branch_regs[i].regmap[r];
7661                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7662                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7663                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7664                 }
7665                 else if(target_reg>=0) {
7666                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7667                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7668                 }
7669                 // Treat delay slot as part of branch too
7670                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7671                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7672                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7673                 }
7674                 else
7675                 {
7676                   will_dirty[i+1]&=~(1<<r);
7677                 }*/
7678               }
7679             }
7680           //}
7681             // Merge in delay slot
7682             for(r=0;r<HOST_REGS;r++) {
7683               if(r!=EXCLUDE_REG) {
7684                 if(!likely[i]) {
7685                   // Might not dirty if likely branch is not taken
7686                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7687                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7688                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7689                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7690                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7691                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7692                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7693                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7694                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7695                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7696                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7697                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7698                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7699                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7700                 }
7701               }
7702             }
7703           }
7704           // Merge in delay slot (won't dirty)
7705           for(r=0;r<HOST_REGS;r++) {
7706             if(r!=EXCLUDE_REG) {
7707               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7708               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7709               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7710               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7711               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7712               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7713               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7714               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7715               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7716               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7717             }
7718           }
7719           if(wr) {
7720             #ifndef DESTRUCTIVE_WRITEBACK
7721             branch_regs[i].dirty&=wont_dirty_i;
7722             #endif
7723             branch_regs[i].dirty|=will_dirty_i;
7724           }
7725         }
7726       }
7727     }
7728     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7729     {
7730       // SYSCALL instruction (software interrupt)
7731       will_dirty_i=0;
7732       wont_dirty_i=0;
7733     }
7734     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7735     {
7736       // ERET instruction (return from interrupt)
7737       will_dirty_i=0;
7738       wont_dirty_i=0;
7739     }
7740     will_dirty_next=will_dirty_i;
7741     wont_dirty_next=wont_dirty_i;
7742     for(r=0;r<HOST_REGS;r++) {
7743       if(r!=EXCLUDE_REG) {
7744         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7745         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7746         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7747         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7748         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7749         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7750         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7751         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7752         if(i>istart) {
7753           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7754           {
7755             // Don't store a register immediately after writing it,
7756             // may prevent dual-issue.
7757             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7758             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7759           }
7760         }
7761       }
7762     }
7763     // Save it
7764     will_dirty[i]=will_dirty_i;
7765     wont_dirty[i]=wont_dirty_i;
7766     // Mark registers that won't be dirtied as not dirty
7767     if(wr) {
7768       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7769       for(r=0;r<HOST_REGS;r++) {
7770         if((will_dirty_i>>r)&1) {
7771           printf(" r%d",r);
7772         }
7773       }
7774       printf("\n");*/
7775
7776       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7777         regs[i].dirty|=will_dirty_i;
7778         #ifndef DESTRUCTIVE_WRITEBACK
7779         regs[i].dirty&=wont_dirty_i;
7780         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7781         {
7782           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7783             for(r=0;r<HOST_REGS;r++) {
7784               if(r!=EXCLUDE_REG) {
7785                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7786                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7787                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7788               }
7789             }
7790           }
7791         }
7792         else
7793         {
7794           if(i<iend) {
7795             for(r=0;r<HOST_REGS;r++) {
7796               if(r!=EXCLUDE_REG) {
7797                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7798                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7799                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7800               }
7801             }
7802           }
7803         }
7804         #endif
7805       //}
7806     }
7807     // Deal with changed mappings
7808     temp_will_dirty=will_dirty_i;
7809     temp_wont_dirty=wont_dirty_i;
7810     for(r=0;r<HOST_REGS;r++) {
7811       if(r!=EXCLUDE_REG) {
7812         int nr;
7813         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7814           if(wr) {
7815             #ifndef DESTRUCTIVE_WRITEBACK
7816             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7817             #endif
7818             regs[i].wasdirty|=will_dirty_i&(1<<r);
7819           }
7820         }
7821         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7822           // Register moved to a different register
7823           will_dirty_i&=~(1<<r);
7824           wont_dirty_i&=~(1<<r);
7825           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7826           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7827           if(wr) {
7828             #ifndef DESTRUCTIVE_WRITEBACK
7829             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7830             #endif
7831             regs[i].wasdirty|=will_dirty_i&(1<<r);
7832           }
7833         }
7834         else {
7835           will_dirty_i&=~(1<<r);
7836           wont_dirty_i&=~(1<<r);
7837           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7838             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7839             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7840           } else {
7841             wont_dirty_i|=1<<r;
7842             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);*/ /*assert(!((will_dirty>>r)&1));*/
7843           }
7844         }
7845       }
7846     }
7847   }
7848 }
7849
7850 #ifdef DISASM
7851   /* disassembly */
7852 void disassemble_inst(int i)
7853 {
7854     if (bt[i]) printf("*"); else printf(" ");
7855     switch(itype[i]) {
7856       case UJUMP:
7857         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7858       case CJUMP:
7859         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7860       case SJUMP:
7861         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7862       case FJUMP:
7863         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7864       case RJUMP:
7865         if (opcode[i]==0x9&&rt1[i]!=31)
7866           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
7867         else
7868           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7869         break;
7870       case SPAN:
7871         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7872       case IMM16:
7873         if(opcode[i]==0xf) //LUI
7874           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7875         else
7876           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7877         break;
7878       case LOAD:
7879       case LOADLR:
7880         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7881         break;
7882       case STORE:
7883       case STORELR:
7884         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7885         break;
7886       case ALU:
7887       case SHIFT:
7888         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7889         break;
7890       case MULTDIV:
7891         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7892         break;
7893       case SHIFTIMM:
7894         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7895         break;
7896       case MOV:
7897         if((opcode2[i]&0x1d)==0x10)
7898           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7899         else if((opcode2[i]&0x1d)==0x11)
7900           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7901         else
7902           printf (" %x: %s\n",start+i*4,insn[i]);
7903         break;
7904       case COP0:
7905         if(opcode2[i]==0)
7906           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7907         else if(opcode2[i]==4)
7908           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7909         else printf (" %x: %s\n",start+i*4,insn[i]);
7910         break;
7911       case COP1:
7912         if(opcode2[i]<3)
7913           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7914         else if(opcode2[i]>3)
7915           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7916         else printf (" %x: %s\n",start+i*4,insn[i]);
7917         break;
7918       case COP2:
7919         if(opcode2[i]<3)
7920           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7921         else if(opcode2[i]>3)
7922           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7923         else printf (" %x: %s\n",start+i*4,insn[i]);
7924         break;
7925       case C1LS:
7926         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7927         break;
7928       case C2LS:
7929         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7930         break;
7931       case INTCALL:
7932         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7933         break;
7934       default:
7935         //printf (" %s %8x\n",insn[i],source[i]);
7936         printf (" %x: %s\n",start+i*4,insn[i]);
7937     }
7938 }
7939 #else
7940 static void disassemble_inst(int i) {}
7941 #endif // DISASM
7942
7943 // clear the state completely, instead of just marking
7944 // things invalid like invalidate_all_pages() does
7945 void new_dynarec_clear_full()
7946 {
7947   int n;
7948   out=(u_char *)BASE_ADDR;
7949   memset(invalid_code,1,sizeof(invalid_code));
7950   memset(hash_table,0xff,sizeof(hash_table));
7951   memset(mini_ht,-1,sizeof(mini_ht));
7952   memset(restore_candidate,0,sizeof(restore_candidate));
7953   memset(shadow,0,sizeof(shadow));
7954   copy=shadow;
7955   expirep=16384; // Expiry pointer, +2 blocks
7956   pending_exception=0;
7957   literalcount=0;
7958   stop_after_jal=0;
7959   inv_code_start=inv_code_end=~0;
7960   // TLB
7961 #ifndef DISABLE_TLB
7962   using_tlb=0;
7963   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7964     memory_map[n]=-1;
7965   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7966     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7967   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7968     memory_map[n]=-1;
7969 #endif
7970   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7971   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7972   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7973 }
7974
7975 void new_dynarec_init()
7976 {
7977   printf("Init new dynarec\n");
7978   out=(u_char *)BASE_ADDR;
7979   if (mmap (out, 1<<TARGET_SIZE_2,
7980             PROT_READ | PROT_WRITE | PROT_EXEC,
7981             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7982             -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
7983 #ifdef MUPEN64
7984   rdword=&readmem_dword;
7985   fake_pc.f.r.rs=&readmem_dword;
7986   fake_pc.f.r.rt=&readmem_dword;
7987   fake_pc.f.r.rd=&readmem_dword;
7988 #endif
7989   int n;
7990   cycle_multiplier=200;
7991   new_dynarec_clear_full();
7992 #ifdef HOST_IMM8
7993   // Copy this into local area so we don't have to put it in every literal pool
7994   invc_ptr=invalid_code;
7995 #endif
7996 #ifdef MUPEN64
7997   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
7998     writemem[n] = write_nomem_new;
7999     writememb[n] = write_nomemb_new;
8000     writememh[n] = write_nomemh_new;
8001 #ifndef FORCE32
8002     writememd[n] = write_nomemd_new;
8003 #endif
8004     readmem[n] = read_nomem_new;
8005     readmemb[n] = read_nomemb_new;
8006     readmemh[n] = read_nomemh_new;
8007 #ifndef FORCE32
8008     readmemd[n] = read_nomemd_new;
8009 #endif
8010   }
8011   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
8012     writemem[n] = write_rdram_new;
8013     writememb[n] = write_rdramb_new;
8014     writememh[n] = write_rdramh_new;
8015 #ifndef FORCE32
8016     writememd[n] = write_rdramd_new;
8017 #endif
8018   }
8019   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
8020     writemem[n] = write_nomem_new;
8021     writememb[n] = write_nomemb_new;
8022     writememh[n] = write_nomemh_new;
8023 #ifndef FORCE32
8024     writememd[n] = write_nomemd_new;
8025 #endif
8026     readmem[n] = read_nomem_new;
8027     readmemb[n] = read_nomemb_new;
8028     readmemh[n] = read_nomemh_new;
8029 #ifndef FORCE32
8030     readmemd[n] = read_nomemd_new;
8031 #endif
8032   }
8033 #endif
8034   tlb_hacks();
8035   arch_init();
8036 }
8037
8038 void new_dynarec_cleanup()
8039 {
8040   int n;
8041   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
8042   for(n=0;n<4096;n++) ll_clear(jump_in+n);
8043   for(n=0;n<4096;n++) ll_clear(jump_out+n);
8044   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
8045   #ifdef ROM_COPY
8046   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
8047   #endif
8048 }
8049
8050 int new_recompile_block(int addr)
8051 {
8052 /*
8053   if(addr==0x800cd050) {
8054     int block;
8055     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
8056     int n;
8057     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
8058   }
8059 */
8060   //if(Count==365117028) tracedebug=1;
8061   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
8062   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
8063   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
8064   //if(debug) 
8065   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
8066   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
8067   /*if(Count>=312978186) {
8068     rlist();
8069   }*/
8070   //rlist();
8071   start = (u_int)addr&~3;
8072   //assert(((u_int)addr&1)==0);
8073   new_dynarec_did_compile=1;
8074 #ifdef PCSX
8075   if (Config.HLE && start == 0x80001000) // hlecall
8076   {
8077     // XXX: is this enough? Maybe check hleSoftCall?
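    // Emit a stub that stores the block address to pcaddr and jumps back to
    // the dispatcher, presumably so the HLE BIOS call is handled outside
    // generated code.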
8078     u_int beginning=(u_int)out;
8079     u_int page=get_page(start);
8080     invalid_code[start>>12]=0;
8081     emit_movimm(start,0);
8082     emit_writeword(0,(int)&pcaddr);
8083     emit_jmp((int)new_dyna_leave);
8084     literal_pool(0);
8085 #ifdef __arm__
8086     __clear_cache((void *)beginning,out);
8087 #endif
8088     ll_add(jump_in+page,start,(void *)beginning);
8089     return 0;
8090   }
8091   else if ((u_int)addr < 0x00200000 ||
8092     (0xa0000000 <= addr && addr < 0xa0200000)) {
8093     // used for BIOS calls mostly?
8094     source = (u_int *)((u_int)rdram+(start&0x1fffff));
8095     pagelimit = (addr&0xa0000000)|0x00200000;
8096   }
8097   else if (!Config.HLE && (
8098 /*    (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
8099     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
8100     // BIOS
8101     source = (u_int *)((u_int)psxR+(start&0x7ffff));
8102     pagelimit = (addr&0xfff00000)|0x80000;
8103   }
8104   else
8105 #endif
8106 #ifdef MUPEN64
8107   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
8108     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
8109     pagelimit = 0xa4001000;
8110   }
8111   else
8112 #endif
8113   if ((int)addr >= 0x80000000 && (int)addr < 0x80000000+RAM_SIZE) {
8114     source = (u_int *)((u_int)rdram+start-0x80000000);
8115     pagelimit = 0x80000000+RAM_SIZE;
8116   }
8117 #ifndef DISABLE_TLB
8118   else if ((signed int)addr >= (signed int)0xC0000000) {
8119     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
8120     //if(tlb_LUT_r[start>>12])
8121       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
8122     if((signed int)memory_map[start>>12]>=0) {
8123       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
8124       pagelimit=(start+4096)&0xFFFFF000;
8125       int map=memory_map[start>>12];
8126       int i;
8127       for(i=0;i<5;i++) {
8128         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
8129         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
8130       }
8131       assem_debug("pagelimit=%x\n",pagelimit);
8132       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
8133     }
8134     else {
8135       assem_debug("Compile at unmapped memory address: %x\n", (int)addr);
8136       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
8137       return -1; // Caller will invoke exception handler
8138     }
8139     //printf("source= %x\n",(int)source);
8140   }
8141 #endif
8142   else {
8143     printf("Compile at bogus memory address: %x\n", (int)addr);
8144     exit(1);
8145   }
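  // At this point source points at the host-side copy of the guest code and
  // pagelimit is the first guest address the block must not reach.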
8146
8147   /* Pass 1: disassemble */
8148   /* Pass 2: register dependencies, branch targets */
8149   /* Pass 3: register allocation */
8150   /* Pass 4: branch dependencies */
8151   /* Pass 5: pre-alloc */
8152   /* Pass 6: optimize clean/dirty state */
8153   /* Pass 7: flag 32-bit registers */
8154   /* Pass 8: assembly */
8155   /* Pass 9: linker */
8156   /* Pass 10: garbage collection / free memory */
8157
8158   int i,j;
8159   int done=0;
8160   unsigned int type,op,op2;
8161
8162   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
8163   
8164   /* Pass 1 disassembly */
8165
8166   for(i=0;!done;i++) {
8167     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
8168     minimum_free_regs[i]=0;
8169     opcode[i]=op=source[i]>>26;
8170     switch(op)
8171     {
8172       case 0x00: strcpy(insn[i],"special"); type=NI;
8173         op2=source[i]&0x3f;
8174         switch(op2)
8175         {
8176           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
8177           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
8178           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
8179           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
8180           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
8181           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
8182           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
8183           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
8184           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
8185           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
8186           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
8187           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
8188           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
8189           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
8190           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
8191           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
8192           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
8193           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
8194           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
8195           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
8196           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
8197           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
8198           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
8199           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
8200           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
8201           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
8202           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
8203           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
8204           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
8205           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
8206           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
8207           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
8208           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
8209           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
8210           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
8211 #ifndef FORCE32
8212           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
8213           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
8214           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
8215           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
8216           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
8217           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
8218           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
8219           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
8220           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
8221           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
8222           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
8223           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
8224           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
8225           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
8226           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
8227           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
8228           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
8229 #endif
8230         }
8231         break;
8232       case 0x01: strcpy(insn[i],"regimm"); type=NI;
8233         op2=(source[i]>>16)&0x1f;
8234         switch(op2)
8235         {
8236           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
8237           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
8238           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
8239           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
8240           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
8241           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
8242           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
8243           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
8244           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
8245           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
8246           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
8247           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
8248           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
8249           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
8250         }
8251         break;
8252       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
8253       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
8254       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
8255       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
8256       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
8257       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
8258       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
8259       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
8260       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
8261       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
8262       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
8263       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
8264       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
8265       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
8266       case 0x10: strcpy(insn[i],"cop0"); type=NI;
8267         op2=(source[i]>>21)&0x1f;
8268         switch(op2)
8269         {
8270           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
8271           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
8272           case 0x10: strcpy(insn[i],"tlb"); type=NI;
8273           switch(source[i]&0x3f)
8274           {
8275             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
8276             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
8277             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
8278             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
8279 #ifdef PCSX
8280             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
8281 #else
8282             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
8283 #endif
8284           }
8285         }
8286         break;
8287       case 0x11: strcpy(insn[i],"cop1"); type=NI;
8288         op2=(source[i]>>21)&0x1f;
8289         switch(op2)
8290         {
8291           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
8292           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
8293           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
8294           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
8295           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
8296           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
8297           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
8298           switch((source[i]>>16)&0x3)
8299           {
8300             case 0x00: strcpy(insn[i],"BC1F"); break;
8301             case 0x01: strcpy(insn[i],"BC1T"); break;
8302             case 0x02: strcpy(insn[i],"BC1FL"); break;
8303             case 0x03: strcpy(insn[i],"BC1TL"); break;
8304           }
8305           break;
8306           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
8307           switch(source[i]&0x3f)
8308           {
8309             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
8310             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
8311             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
8312             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
8313             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
8314             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
8315             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
8316             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
8317             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
8318             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
8319             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
8320             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
8321             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
8322             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
8323             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
8324             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
8325             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
8326             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
8327             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
8328             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
8329             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
8330             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
8331             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
8332             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
8333             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
8334             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
8335             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
8336             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
8337             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
8338             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
8339             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
8340             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
8341             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
8342             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
8343             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
8344           }
8345           break;
8346           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
8347           switch(source[i]&0x3f)
8348           {
8349             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
8350             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
8351             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
8352             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
8353             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
8354             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
8355             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
8356             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
8357             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
8358             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
8359             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
8360             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
8361             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
8362             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
8363             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
8364             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
8365             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
8366             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
8367             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
8368             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
8369             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
8370             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
8371             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
8372             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
8373             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
8374             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
8375             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
8376             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
8377             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
8378             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
8379             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
8380             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
8381             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
8382             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
8383             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
8384           }
8385           break;
8386           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
8387           switch(source[i]&0x3f)
8388           {
8389             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
8390             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
8391           }
8392           break;
8393           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
8394           switch(source[i]&0x3f)
8395           {
8396             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
8397             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
8398           }
8399           break;
8400         }
8401         break;
8402 #ifndef FORCE32
8403       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
8404       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
8405       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
8406       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
8407       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
8408       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
8409       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
8410       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
8411 #endif
8412       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
8413       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
8414       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
8415       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
8416       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
8417       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
8418       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
8419 #ifndef FORCE32
8420       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
8421 #endif
8422       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
8423       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
8424       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
8425       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
8426 #ifndef FORCE32
8427       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
8428       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
8429 #endif
8430       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
8431       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
8432       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
8433       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
8434 #ifndef FORCE32
8435       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
8436       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
8437       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
8438 #endif
8439       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
8440       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
8441 #ifndef FORCE32
8442       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
8443       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
8444       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
8445 #endif
8446 #ifdef PCSX
8447       case 0x12: strcpy(insn[i],"COP2"); type=NI;
8448         op2=(source[i]>>21)&0x1f;
8449         //if (op2 & 0x10) {
8450         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
8451           if (gte_handlers[source[i]&0x3f]!=NULL) {
8452             if (gte_regnames[source[i]&0x3f]!=NULL)
8453               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
8454             else
8455               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
8456             type=C2OP;
8457           }
8458         }
8459         else switch(op2)
8460         {
8461           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
8462           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
8463           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
8464           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
8465         }
8466         break;
8467       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
8468       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
8469       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
8470 #endif
8471       default: strcpy(insn[i],"???"); type=NI;
8472         printf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
8473         break;
8474     }
8475     itype[i]=type;
8476     opcode2[i]=op2;
8477     /* Get registers/immediates */
8478     lt1[i]=0;
8479     us1[i]=0;
8480     us2[i]=0;
8481     dep1[i]=0;
8482     dep2[i]=0;
8483     gte_rs[i]=gte_rt[i]=0;
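    // Decode the register fields for this instruction class.  Roughly:
    // rs1/rs2 are sources, rt1/rt2 results, us1/us2 sources that are needed
    // as full 64-bit values, dep1/dep2 registers the result's upper half
    // depends on, and imm the (sign- or zero-extended) immediate.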
8484     switch(type) {
8485       case LOAD:
8486         rs1[i]=(source[i]>>21)&0x1f;
8487         rs2[i]=0;
8488         rt1[i]=(source[i]>>16)&0x1f;
8489         rt2[i]=0;
8490         imm[i]=(short)source[i];
8491         break;
8492       case STORE:
8493       case STORELR:
8494         rs1[i]=(source[i]>>21)&0x1f;
8495         rs2[i]=(source[i]>>16)&0x1f;
8496         rt1[i]=0;
8497         rt2[i]=0;
8498         imm[i]=(short)source[i];
8499         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
8500         break;
8501       case LOADLR:
8502         // LWL/LWR only load part of the register,
8503         // therefore the target register must be treated as a source too
8504         rs1[i]=(source[i]>>21)&0x1f;
8505         rs2[i]=(source[i]>>16)&0x1f;
8506         rt1[i]=(source[i]>>16)&0x1f;
8507         rt2[i]=0;
8508         imm[i]=(short)source[i];
8509         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
8510         if(op==0x26) dep1[i]=rt1[i]; // LWR
8511         break;
8512       case IMM16:
8513         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
8514         else rs1[i]=(source[i]>>21)&0x1f;
8515         rs2[i]=0;
8516         rt1[i]=(source[i]>>16)&0x1f;
8517         rt2[i]=0;
8518         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
8519           imm[i]=(unsigned short)source[i];
8520         }else{
8521           imm[i]=(short)source[i];
8522         }
8523         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
8524         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
8525         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
8526         break;
8527       case UJUMP:
8528         rs1[i]=0;
8529         rs2[i]=0;
8530         rt1[i]=0;
8531         rt2[i]=0;
8532         // The JAL instruction writes to r31.
8533         if (op&1) {
8534           rt1[i]=31;
8535         }
8536         rs2[i]=CCREG;
8537         break;
8538       case RJUMP:
8539         rs1[i]=(source[i]>>21)&0x1f;
8540         rs2[i]=0;
8541         rt1[i]=0;
8542         rt2[i]=0;
8543         // The JALR instruction writes to rd.
8544         if (op2&1) {
8545           rt1[i]=(source[i]>>11)&0x1f;
8546         }
8547         rs2[i]=CCREG;
8548         break;
8549       case CJUMP:
8550         rs1[i]=(source[i]>>21)&0x1f;
8551         rs2[i]=(source[i]>>16)&0x1f;
8552         rt1[i]=0;
8553         rt2[i]=0;
8554         if(op&2) { // BGTZ/BLEZ
8555           rs2[i]=0;
8556         }
8557         us1[i]=rs1[i];
8558         us2[i]=rs2[i];
8559         likely[i]=op>>4;
8560         break;
8561       case SJUMP:
8562         rs1[i]=(source[i]>>21)&0x1f;
8563         rs2[i]=CCREG;
8564         rt1[i]=0;
8565         rt2[i]=0;
8566         us1[i]=rs1[i];
8567         if(op2&0x10) { // BxxAL
8568           rt1[i]=31;
8569           // NOTE: If the branch is not taken, r31 is still overwritten
8570         }
8571         likely[i]=(op2&2)>>1;
8572         break;
8573       case FJUMP:
8574         rs1[i]=FSREG;
8575         rs2[i]=CSREG;
8576         rt1[i]=0;
8577         rt2[i]=0;
8578         likely[i]=((source[i])>>17)&1;
8579         break;
8580       case ALU:
8581         rs1[i]=(source[i]>>21)&0x1f; // source
8582         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
8583         rt1[i]=(source[i]>>11)&0x1f; // destination
8584         rt2[i]=0;
8585         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
8586           us1[i]=rs1[i];us2[i]=rs2[i];
8587         }
8588         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
8589           dep1[i]=rs1[i];dep2[i]=rs2[i];
8590         }
8591         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
8592           dep1[i]=rs1[i];dep2[i]=rs2[i];
8593         }
8594         break;
8595       case MULTDIV:
8596         rs1[i]=(source[i]>>21)&0x1f; // source
8597         rs2[i]=(source[i]>>16)&0x1f; // divisor
8598         rt1[i]=HIREG;
8599         rt2[i]=LOREG;
8600         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
8601           us1[i]=rs1[i];us2[i]=rs2[i];
8602         }
8603         break;
8604       case MOV:
8605         rs1[i]=0;
8606         rs2[i]=0;
8607         rt1[i]=0;
8608         rt2[i]=0;
8609         if(op2==0x10) rs1[i]=HIREG; // MFHI
8610         if(op2==0x11) rt1[i]=HIREG; // MTHI
8611         if(op2==0x12) rs1[i]=LOREG; // MFLO
8612         if(op2==0x13) rt1[i]=LOREG; // MTLO
8613         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
8614         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
8615         dep1[i]=rs1[i];
8616         break;
8617       case SHIFT:
8618         rs1[i]=(source[i]>>16)&0x1f; // target of shift
8619         rs2[i]=(source[i]>>21)&0x1f; // shift amount
8620         rt1[i]=(source[i]>>11)&0x1f; // destination
8621         rt2[i]=0;
8622         // DSLLV/DSRLV/DSRAV are 64-bit
8623         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
8624         break;
8625       case SHIFTIMM:
8626         rs1[i]=(source[i]>>16)&0x1f;
8627         rs2[i]=0;
8628         rt1[i]=(source[i]>>11)&0x1f;
8629         rt2[i]=0;
8630         imm[i]=(source[i]>>6)&0x1f;
8631         // DSxx32 instructions
8632         if(op2>=0x3c) imm[i]|=0x20;
8633         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
8634         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
8635         break;
8636       case COP0:
8637         rs1[i]=0;
8638         rs2[i]=0;
8639         rt1[i]=0;
8640         rt2[i]=0;
8641         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
8642         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
8643         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8644         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8645         break;
8646       case COP1:
8647         rs1[i]=0;
8648         rs2[i]=0;
8649         rt1[i]=0;
8650         rt2[i]=0;
8651         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8652         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8653         if(op2==5) us1[i]=rs1[i]; // DMTC1
8654         rs2[i]=CSREG;
8655         break;
8656       case COP2:
8657         rs1[i]=0;
8658         rs2[i]=0;
8659         rt1[i]=0;
8660         rt2[i]=0;
8661         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
8662         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
8663         rs2[i]=CSREG;
8664         int gr=(source[i]>>11)&0x1F;
8665         switch(op2)
8666         {
8667           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
8668           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
8669           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
8670           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
8671         }
8672         break;
8673       case C1LS:
8674         rs1[i]=(source[i]>>21)&0x1F;
8675         rs2[i]=CSREG;
8676         rt1[i]=0;
8677         rt2[i]=0;
8678         imm[i]=(short)source[i];
8679         break;
8680       case C2LS:
8681         rs1[i]=(source[i]>>21)&0x1F;
8682         rs2[i]=0;
8683         rt1[i]=0;
8684         rt2[i]=0;
8685         imm[i]=(short)source[i];
8686         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
8687         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
8688         break;
8689       case C2OP:
8690         rs1[i]=0;
8691         rs2[i]=0;
8692         rt1[i]=0;
8693         rt2[i]=0;
8694         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
8695         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
8696         gte_rt[i]|=1ll<<63; // every op changes flags
8697         break;
8698       case FLOAT:
8699       case FCONV:
8700         rs1[i]=0;
8701         rs2[i]=CSREG;
8702         rt1[i]=0;
8703         rt2[i]=0;
8704         break;
8705       case FCOMP:
8706         rs1[i]=FSREG;
8707         rs2[i]=CSREG;
8708         rt1[i]=FSREG;
8709         rt2[i]=0;
8710         break;
8711       case SYSCALL:
8712       case HLECALL:
8713       case INTCALL:
8714         rs1[i]=CCREG;
8715         rs2[i]=0;
8716         rt1[i]=0;
8717         rt2[i]=0;
8718         break;
8719       default:
8720         rs1[i]=0;
8721         rs2[i]=0;
8722         rt1[i]=0;
8723         rt2[i]=0;
8724     }
8725     /* Calculate branch target addresses */
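    // MIPS encodings: J/JAL carry a 26-bit instruction index, shifted left 2
    // and merged with the top 4 bits of PC+4; conditional branches carry a
    // signed 16-bit word offset relative to the delay slot (PC+4).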
8726     if(type==UJUMP)
8727       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8728     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8729       ba[i]=start+i*4+8; // Ignore never taken branch
8730     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8731       ba[i]=start+i*4+8; // Ignore never taken branch
8732     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8733       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8734     else ba[i]=-1;
8735 #ifdef PCSX
8736     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
8737       int do_in_intrp=0;
8738       // branch in delay slot?
8739       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
8740         // don't compile the first branch; call the interpreter instead when it's hit
8741         printf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
8742         do_in_intrp=1;
8743       }
8744       // basic load delay detection
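      // (the R3000A has a one-instruction load delay; if the branch target
      //  reads the register loaded in the delay slot, it may rely on still
      //  seeing the old value)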
8745       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
8746         int t=(ba[i-1]-start)/4;
8747         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
8748           // jump target wants DS result - potential load delay effect
8749           printf("load delay @%08x (%08x)\n", addr + i*4, addr);
8750           do_in_intrp=1;
8751           bt[t+1]=1; // expected return from interpreter
8752         }
8753         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
8754               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
8755           // v0 overwrite like this is a sign of trouble, bail out
8756           printf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
8757           do_in_intrp=1;
8758         }
8759       }
8760       if(do_in_intrp) {
8761         rs1[i-1]=CCREG;
8762         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
8763         ba[i-1]=-1;
8764         itype[i-1]=INTCALL;
8765         done=2;
8766         i--; // don't compile the DS
8767       }
8768     }
8769 #endif
8770     /* Is this the end of the block? */
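    // (source>>16)==0x1000 matches "beq $zero,$zero", an unconditional branch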
8771     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8772       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
8773         done=2;
8774       }
8775       else {
8776         if(stop_after_jal) done=1;
8777         // Stop on BREAK
8778         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8779       }
8780       // Don't recompile stuff that's already compiled
8781       if(check_addr(start+i*4+4)) done=1;
8782       // Don't get too close to the limit
8783       if(i>MAXBLOCK/2) done=1;
8784     }
8785     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
8786     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
8787     if(done==2) {
8788       // Does the block continue due to a branch?
8789       for(j=i-1;j>=0;j--)
8790       {
8791         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
8792         if(ba[j]==start+i*4+4) done=j=0;
8793         if(ba[j]==start+i*4+8) done=j=0;
8794       }
8795     }
8796     //assert(i<MAXBLOCK-1);
8797     if(start+i*4==pagelimit-4) done=1;
8798     assert(start+i*4<pagelimit);
8799     if (i==MAXBLOCK-1) done=1;
8800     // Stop if we're compiling junk
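    // (an unrecognized cop1 encoding is a strong hint that we are decoding
    //  data rather than code)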
8801     if(itype[i]==NI&&opcode[i]==0x11) {
8802       done=stop_after_jal=1;
8803       printf("Disabled speculative precompilation\n");
8804     }
8805   }
8806   slen=i;
8807   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8808     if(start+i*4==pagelimit) {
8809       itype[i-1]=SPAN;
8810     }
8811   }
8812   assert(slen>0);
8813
8814   /* Pass 2 - Register dependencies and branch targets */
8815
8816   unneeded_registers(0,slen-1,0);
8817   
8818   /* Pass 3 - Register allocation */
8819
8820   struct regstat current; // Current register allocations/status
8821   current.is32=1;
8822   current.dirty=0;
8823   current.u=unneeded_reg[0];
8824   current.uu=unneeded_reg_upper[0];
8825   clear_all_regs(current.regmap);
8826   alloc_reg(&current,0,CCREG);
8827   dirty_reg(&current,CCREG);
8828   current.isconst=0;
8829   current.wasconst=0;
8830   current.waswritten=0;
8831   int ds=0;
8832   int cc=0;
8833   int hr=-1;
8834
8835 #ifndef FORCE32
8836   provisional_32bit();
8837 #endif
8838   if((u_int)addr&1) {
8839     // First instruction is delay slot
8840     cc=-1;
8841     bt[1]=1;
8842     ds=1;
8843     unneeded_reg[0]=1;
8844     unneeded_reg_upper[0]=1;
8845     current.regmap[HOST_BTREG]=BTREG;
8846   }
8847   
8848   for(i=0;i<slen;i++)
8849   {
8850     if(bt[i])
8851     {
8852       int hr;
8853       for(hr=0;hr<HOST_REGS;hr++)
8854       {
8855         // Is this really necessary?
8856         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8857       }
8858       current.isconst=0;
8859       current.waswritten=0;
8860     }
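    // If the instruction two slots back was BNE/BNEL with $zero as one
    // operand, then on this fall-through path the other operand compared
    // equal to zero, so it can safely be treated as a 32-bit value.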
8861     if(i>1)
8862     {
8863       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8864       {
8865         if(rs1[i-2]==0||rs2[i-2]==0)
8866         {
8867           if(rs1[i-2]) {
8868             current.is32|=1LL<<rs1[i-2];
8869             int hr=get_reg(current.regmap,rs1[i-2]|64);
8870             if(hr>=0) current.regmap[hr]=-1;
8871           }
8872           if(rs2[i-2]) {
8873             current.is32|=1LL<<rs2[i-2];
8874             int hr=get_reg(current.regmap,rs2[i-2]|64);
8875             if(hr>=0) current.regmap[hr]=-1;
8876           }
8877         }
8878       }
8879     }
8880 #ifndef FORCE32
8881     // If something jumps here with 64-bit values
8882     // then promote those registers to 64 bits
8883     if(bt[i])
8884     {
8885       uint64_t temp_is32=current.is32;
8886       for(j=i-1;j>=0;j--)
8887       {
8888         if(ba[j]==start+i*4) 
8889           temp_is32&=branch_regs[j].is32;
8890       }
8891       for(j=i;j<slen;j++)
8892       {
8893         if(ba[j]==start+i*4) 
8894           //temp_is32=1;
8895           temp_is32&=p32[j];
8896       }
8897       if(temp_is32!=current.is32) {
8898         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8899         #ifndef DESTRUCTIVE_WRITEBACK
8900         if(ds)
8901         #endif
8902         for(hr=0;hr<HOST_REGS;hr++)
8903         {
8904           int r=current.regmap[hr];
8905           if(r>0&&r<64)
8906           {
8907             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8908               temp_is32|=1LL<<r;
8909               //printf("restore %d\n",r);
8910             }
8911           }
8912         }
8913         current.is32=temp_is32;
8914       }
8915     }
8916 #else
8917     current.is32=-1LL;
8918 #endif
8919
8920     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8921     regs[i].wasconst=current.isconst;
8922     regs[i].was32=current.is32;
8923     regs[i].wasdirty=current.dirty;
8924     regs[i].loadedconst=0;
8925     #if defined(DESTRUCTIVE_WRITEBACK) && !defined(FORCE32)
8926     // To change a dirty register from 32 to 64 bits, we must write
8927     // it out during the previous cycle (for branches, 2 cycles)
8928     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8929     {
8930       uint64_t temp_is32=current.is32;
8931       for(j=i-1;j>=0;j--)
8932       {
8933         if(ba[j]==start+i*4+4) 
8934           temp_is32&=branch_regs[j].is32;
8935       }
8936       for(j=i;j<slen;j++)
8937       {
8938         if(ba[j]==start+i*4+4) 
8939           //temp_is32=1;
8940           temp_is32&=p32[j];
8941       }
8942       if(temp_is32!=current.is32) {
8943         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8944         for(hr=0;hr<HOST_REGS;hr++)
8945         {
8946           int r=current.regmap[hr];
8947           if(r>0)
8948           {
8949             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8950               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
8951               {
8952                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
8953                 {
8954                   //printf("dump %d/r%d\n",hr,r);
8955                   current.regmap[hr]=-1;
8956                   if(get_reg(current.regmap,r|64)>=0) 
8957                     current.regmap[get_reg(current.regmap,r|64)]=-1;
8958                 }
8959               }
8960             }
8961           }
8962         }
8963       }
8964     }
8965     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
8966     {
8967       uint64_t temp_is32=current.is32;
8968       for(j=i-1;j>=0;j--)
8969       {
8970         if(ba[j]==start+i*4+8) 
8971           temp_is32&=branch_regs[j].is32;
8972       }
8973       for(j=i;j<slen;j++)
8974       {
8975         if(ba[j]==start+i*4+8) 
8976           //temp_is32=1;
8977           temp_is32&=p32[j];
8978       }
8979       if(temp_is32!=current.is32) {
8980         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8981         for(hr=0;hr<HOST_REGS;hr++)
8982         {
8983           int r=current.regmap[hr];
8984           if(r>0)
8985           {
8986             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8987               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
8988               {
8989                 //printf("dump %d/r%d\n",hr,r);
8990                 current.regmap[hr]=-1;
8991                 if(get_reg(current.regmap,r|64)>=0) 
8992                   current.regmap[get_reg(current.regmap,r|64)]=-1;
8993               }
8994             }
8995           }
8996         }
8997       }
8998     }
8999     #endif
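    // Propagate unneeded-register info: u/uu track which guest registers
    // (and upper halves) are dead after this instruction; sources of the
    // current instruction (and of its delay slot, for branches) can never
    // be unneeded.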
9000     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9001       if(i+1<slen) {
9002         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9003         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9004         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9005         current.u|=1;
9006         current.uu|=1;
9007       } else {
9008         current.u=1;
9009         current.uu=1;
9010       }
9011     } else {
9012       if(i+1<slen) {
9013         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
9014         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9015         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9016         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
9017         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9018         current.u|=1;
9019         current.uu|=1;
9020       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
9021     }
9022     is_ds[i]=ds;
9023     if(ds) {
9024       ds=0; // Skip delay slot, already allocated as part of branch
9025       // ...but we need to alloc it in case something jumps here
9026       if(i+1<slen) {
9027         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
9028         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
9029       }else{
9030         current.u=branch_unneeded_reg[i-1];
9031         current.uu=branch_unneeded_reg_upper[i-1];
9032       }
9033       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
9034       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9035       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9036       current.u|=1;
9037       current.uu|=1;
9038       struct regstat temp;
9039       memcpy(&temp,&current,sizeof(current));
9040       temp.wasdirty=temp.dirty;
9041       temp.was32=temp.is32;
9042       // TODO: Take into account unconditional branches, as below
9043       delayslot_alloc(&temp,i);
9044       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
9045       regs[i].wasdirty=temp.wasdirty;
9046       regs[i].was32=temp.was32;
9047       regs[i].dirty=temp.dirty;
9048       regs[i].is32=temp.is32;
9049       regs[i].isconst=0;
9050       regs[i].wasconst=0;
9051       current.isconst=0;
9052       // Create entry (branch target) regmap
9053       for(hr=0;hr<HOST_REGS;hr++)
9054       {
9055         int r=temp.regmap[hr];
9056         if(r>=0) {
9057           if(r!=regmap_pre[i][hr]) {
9058             regs[i].regmap_entry[hr]=-1;
9059           }
9060           else
9061           {
9062             if(r<64){
9063               if((current.u>>r)&1) {
9064                 regs[i].regmap_entry[hr]=-1;
9065                 regs[i].regmap[hr]=-1;
9066                 //Don't clear regs in the delay slot as the branch might need them
9067                 //current.regmap[hr]=-1;
9068               }else
9069                 regs[i].regmap_entry[hr]=r;
9070             }
9071             else {
9072               if((current.uu>>(r&63))&1) {
9073                 regs[i].regmap_entry[hr]=-1;
9074                 regs[i].regmap[hr]=-1;
9075                 //Don't clear regs in the delay slot as the branch might need them
9076                 //current.regmap[hr]=-1;
9077               }else
9078                 regs[i].regmap_entry[hr]=r;
9079             }
9080           }
9081         } else {
9082           // First instruction expects CCREG to be allocated
9083           if(i==0&&hr==HOST_CCREG) 
9084             regs[i].regmap_entry[hr]=CCREG;
9085           else
9086             regs[i].regmap_entry[hr]=-1;
9087         }
9088       }
9089     }
9090     else { // Not delay slot
9091       switch(itype[i]) {
9092         case UJUMP:
9093           //current.isconst=0; // DEBUG
9094           //current.wasconst=0; // DEBUG
9095           //regs[i].wasconst=0; // DEBUG
9096           clear_const(&current,rt1[i]);
9097           alloc_cc(&current,i);
9098           dirty_reg(&current,CCREG);
9099           if (rt1[i]==31) {
9100             alloc_reg(&current,i,31);
9101             dirty_reg(&current,31);
9102             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
9103             //assert(rt1[i+1]!=rt1[i]);
9104             #ifdef REG_PREFETCH
9105             alloc_reg(&current,i,PTEMP);
9106             #endif
9107             //current.is32|=1LL<<rt1[i];
9108           }
9109           ooo[i]=1;
9110           delayslot_alloc(&current,i+1);
9111           //current.isconst=0; // DEBUG
9112           ds=1;
9113           //printf("i=%d, isconst=%x\n",i,current.isconst);
9114           break;
9115         case RJUMP:
9116           //current.isconst=0;
9117           //current.wasconst=0;
9118           //regs[i].wasconst=0;
9119           clear_const(&current,rs1[i]);
9120           clear_const(&current,rt1[i]);
9121           alloc_cc(&current,i);
9122           dirty_reg(&current,CCREG);
9123           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
9124             alloc_reg(&current,i,rs1[i]);
9125             if (rt1[i]!=0) {
9126               alloc_reg(&current,i,rt1[i]);
9127               dirty_reg(&current,rt1[i]);
9128               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
9129               assert(rt1[i+1]!=rt1[i]);
9130               #ifdef REG_PREFETCH
9131               alloc_reg(&current,i,PTEMP);
9132               #endif
9133             }
9134             #ifdef USE_MINI_HT
9135             if(rs1[i]==31) { // JALR
9136               alloc_reg(&current,i,RHASH);
9137               #ifndef HOST_IMM_ADDR32
9138               alloc_reg(&current,i,RHTBL);
9139               #endif
9140             }
9141             #endif
9142             delayslot_alloc(&current,i+1);
9143           } else {
9144             // The delay slot overwrites our source register,
9145             // so allocate a temporary register to hold the old value.
9146             current.isconst=0;
9147             current.wasconst=0;
9148             regs[i].wasconst=0;
9149             delayslot_alloc(&current,i+1);
9150             current.isconst=0;
9151             alloc_reg(&current,i,RTEMP);
9152           }
9153           //current.isconst=0; // DEBUG
9154           ooo[i]=1;
9155           ds=1;
9156           break;
9157         case CJUMP:
9158           //current.isconst=0;
9159           //current.wasconst=0;
9160           //regs[i].wasconst=0;
9161           clear_const(&current,rs1[i]);
9162           clear_const(&current,rs2[i]);
9163           if((opcode[i]&0x3E)==4) // BEQ/BNE
9164           {
9165             alloc_cc(&current,i);
9166             dirty_reg(&current,CCREG);
9167             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9168             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
9169             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9170             {
9171               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9172               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
9173             }
9174             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
9175                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
9176               // The delay slot overwrites one of our conditions.
9177               // Allocate the branch condition registers instead.
9178               current.isconst=0;
9179               current.wasconst=0;
9180               regs[i].wasconst=0;
9181               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9182               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
9183               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9184               {
9185                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9186                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
9187               }
9188             }
9189             else
9190             {
9191               ooo[i]=1;
9192               delayslot_alloc(&current,i+1);
9193             }
9194           }
9195           else
9196           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
9197           {
9198             alloc_cc(&current,i);
9199             dirty_reg(&current,CCREG);
9200             alloc_reg(&current,i,rs1[i]);
9201             if(!(current.is32>>rs1[i]&1))
9202             {
9203               alloc_reg64(&current,i,rs1[i]);
9204             }
9205             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
9206               // The delay slot overwrites one of our conditions.
9207               // Allocate the branch condition registers instead.
9208               current.isconst=0;
9209               current.wasconst=0;
9210               regs[i].wasconst=0;
9211               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9212               if(!((current.is32>>rs1[i])&1))
9213               {
9214                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9215               }
9216             }
9217             else
9218             {
9219               ooo[i]=1;
9220               delayslot_alloc(&current,i+1);
9221             }
9222           }
9223           else
9224           // Don't alloc the delay slot yet because we might not execute it
9225           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
9226           {
9227             current.isconst=0;
9228             current.wasconst=0;
9229             regs[i].wasconst=0;
9230             alloc_cc(&current,i);
9231             dirty_reg(&current,CCREG);
9232             alloc_reg(&current,i,rs1[i]);
9233             alloc_reg(&current,i,rs2[i]);
9234             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9235             {
9236               alloc_reg64(&current,i,rs1[i]);
9237               alloc_reg64(&current,i,rs2[i]);
9238             }
9239           }
9240           else
9241           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
9242           {
9243             current.isconst=0;
9244             current.wasconst=0;
9245             regs[i].wasconst=0;
9246             alloc_cc(&current,i);
9247             dirty_reg(&current,CCREG);
9248             alloc_reg(&current,i,rs1[i]);
9249             if(!(current.is32>>rs1[i]&1))
9250             {
9251               alloc_reg64(&current,i,rs1[i]);
9252             }
9253           }
9254           ds=1;
9255           //current.isconst=0;
9256           break;
9257         case SJUMP:
9258           //current.isconst=0;
9259           //current.wasconst=0;
9260           //regs[i].wasconst=0;
9261           clear_const(&current,rs1[i]);
9262           clear_const(&current,rt1[i]);
9263           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
9264           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
9265           {
9266             alloc_cc(&current,i);
9267             dirty_reg(&current,CCREG);
9268             alloc_reg(&current,i,rs1[i]);
9269             if(!(current.is32>>rs1[i]&1))
9270             {
9271               alloc_reg64(&current,i,rs1[i]);
9272             }
9273             if (rt1[i]==31) { // BLTZAL/BGEZAL
9274               alloc_reg(&current,i,31);
9275               dirty_reg(&current,31);
9276               //#ifdef REG_PREFETCH
9277               //alloc_reg(&current,i,PTEMP);
9278               //#endif
9279               //current.is32|=1LL<<rt1[i];
9280             }
9281             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
9282                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
9283               // Allocate the branch condition registers instead.
9284               current.isconst=0;
9285               current.wasconst=0;
9286               regs[i].wasconst=0;
9287               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9288               if(!((current.is32>>rs1[i])&1))
9289               {
9290                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9291               }
9292             }
9293             else
9294             {
9295               ooo[i]=1;
9296               delayslot_alloc(&current,i+1);
9297             }
9298           }
9299           else
9300           // Don't alloc the delay slot yet because we might not execute it
9301           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
9302           {
9303             current.isconst=0;
9304             current.wasconst=0;
9305             regs[i].wasconst=0;
9306             alloc_cc(&current,i);
9307             dirty_reg(&current,CCREG);
9308             alloc_reg(&current,i,rs1[i]);
9309             if(!(current.is32>>rs1[i]&1))
9310             {
9311               alloc_reg64(&current,i,rs1[i]);
9312             }
9313           }
9314           ds=1;
9315           //current.isconst=0;
9316           break;
9317         case FJUMP:
9318           current.isconst=0;
9319           current.wasconst=0;
9320           regs[i].wasconst=0;
9321           if(likely[i]==0) // BC1F/BC1T
9322           {
9323             // TODO: Theoretically we can run out of registers here on x86.
9324             // The delay slot can allocate up to six, and we need to check
9325             // CSREG before executing the delay slot.  Possibly we could drop
9326             // the cycle count and reload it after checking that the FPU is in
9327             // a usable state, or simply not do out-of-order execution here.
9328             alloc_cc(&current,i);
9329             dirty_reg(&current,CCREG);
9330             alloc_reg(&current,i,FSREG);
9331             alloc_reg(&current,i,CSREG);
9332             if(itype[i+1]==FCOMP) {
9333               // The delay slot overwrites the branch condition.
9334               // Allocate the branch condition registers instead.
9335               alloc_cc(&current,i);
9336               dirty_reg(&current,CCREG);
9337               alloc_reg(&current,i,CSREG);
9338               alloc_reg(&current,i,FSREG);
9339             }
9340             else {
9341               ooo[i]=1;
9342               delayslot_alloc(&current,i+1);
9343               alloc_reg(&current,i+1,CSREG);
9344             }
9345           }
9346           else
9347           // Don't alloc the delay slot yet because we might not execute it
9348           if(likely[i]) // BC1FL/BC1TL
9349           {
9350             alloc_cc(&current,i);
9351             dirty_reg(&current,CCREG);
9352             alloc_reg(&current,i,CSREG);
9353             alloc_reg(&current,i,FSREG);
9354           }
9355           ds=1;
9356           current.isconst=0;
9357           break;
9358         case IMM16:
9359           imm16_alloc(&current,i);
9360           break;
9361         case LOAD:
9362         case LOADLR:
9363           load_alloc(&current,i);
9364           break;
9365         case STORE:
9366         case STORELR:
9367           store_alloc(&current,i);
9368           break;
9369         case ALU:
9370           alu_alloc(&current,i);
9371           break;
9372         case SHIFT:
9373           shift_alloc(&current,i);
9374           break;
9375         case MULTDIV:
9376           multdiv_alloc(&current,i);
9377           break;
9378         case SHIFTIMM:
9379           shiftimm_alloc(&current,i);
9380           break;
9381         case MOV:
9382           mov_alloc(&current,i);
9383           break;
9384         case COP0:
9385           cop0_alloc(&current,i);
9386           break;
9387         case COP1:
9388         case COP2:
9389           cop1_alloc(&current,i);
9390           break;
9391         case C1LS:
9392           c1ls_alloc(&current,i);
9393           break;
9394         case C2LS:
9395           c2ls_alloc(&current,i);
9396           break;
9397         case C2OP:
9398           c2op_alloc(&current,i);
9399           break;
9400         case FCONV:
9401           fconv_alloc(&current,i);
9402           break;
9403         case FLOAT:
9404           float_alloc(&current,i);
9405           break;
9406         case FCOMP:
9407           fcomp_alloc(&current,i);
9408           break;
9409         case SYSCALL:
9410         case HLECALL:
9411         case INTCALL:
9412           syscall_alloc(&current,i);
9413           break;
9414         case SPAN:
9415           pagespan_alloc(&current,i);
9416           break;
9417       }
9418       
9419       // Drop the upper half of registers that have become 32-bit
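           // current.uu is the "upper half unneeded" mask: a set bit for a 32-bit
           // destination allows its upper 32 bits to be dropped.  For branches the
           // delay slot (i+1) sources and targets are folded in as well, since the
           // slot is handled together with the branch.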
9420       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
9421       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9422         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9423         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9424         current.uu|=1;
9425       } else {
9426         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
9427         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9428         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9429         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9430         current.uu|=1;
9431       }
9432
9433       // Create entry (branch target) regmap
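           // regmap_entry[] records which MIPS register must already be present in a
           // host register when this instruction is entered (i.e. what a branch that
           // targets this point has to provide); -1 means nothing is expected and the
           // value is (re)loaded here if needed.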
9434       for(hr=0;hr<HOST_REGS;hr++)
9435       {
9436         int r,or,er;
9437         r=current.regmap[hr];
9438         if(r>=0) {
9439           if(r!=regmap_pre[i][hr]) {
9440             // TODO: delay slot (?)
9441             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
9442             if(or<0||(r&63)>=TEMPREG){
9443               regs[i].regmap_entry[hr]=-1;
9444             }
9445             else
9446             {
9447               // Just move it to a different register
9448               regs[i].regmap_entry[hr]=r;
9449               // If it was dirty before, it's still dirty
9450               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
9451             }
9452           }
9453           else
9454           {
9455             // Unneeded
9456             if(r==0){
9457               regs[i].regmap_entry[hr]=0;
9458             }
9459             else
9460             if(r<64){
9461               if((current.u>>r)&1) {
9462                 regs[i].regmap_entry[hr]=-1;
9463                 //regs[i].regmap[hr]=-1;
9464                 current.regmap[hr]=-1;
9465               }else
9466                 regs[i].regmap_entry[hr]=r;
9467             }
9468             else {
9469               if((current.uu>>(r&63))&1) {
9470                 regs[i].regmap_entry[hr]=-1;
9471                 //regs[i].regmap[hr]=-1;
9472                 current.regmap[hr]=-1;
9473               }else
9474                 regs[i].regmap_entry[hr]=r;
9475             }
9476           }
9477         } else {
9478           // Branches expect CCREG to be allocated at the target
9479           if(regmap_pre[i][hr]==CCREG) 
9480             regs[i].regmap_entry[hr]=CCREG;
9481           else
9482             regs[i].regmap_entry[hr]=-1;
9483         }
9484       }
9485       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
9486     }
9487
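         // Track MIPS registers that were just used as a store base with a small
         // offset (waswritten); writing the register or storing through it with a
         // large offset clears the flag again.  The flag is used later to skip
         // redundant invalidation (SMC) checks for back-to-back stores through the
         // same base register.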
9488     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
9489       current.waswritten|=1<<rs1[i-1];
9490     current.waswritten&=~(1<<rt1[i]);
9491     current.waswritten&=~(1<<rt2[i]);
9492     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
9493       current.waswritten&=~(1<<rs1[i]);
9494
9495     /* Branch post-alloc */
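         // branch_regs[i-1] describes the register state on the taken path of the
         // branch at i-1 (instruction i is its delay slot).  For the "likely"
         // variants the delay slot is allocated into branch_regs here, since it
         // only executes when the branch is taken.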
9496     if(i>0)
9497     {
9498       current.was32=current.is32;
9499       current.wasdirty=current.dirty;
9500       switch(itype[i-1]) {
9501         case UJUMP:
9502           memcpy(&branch_regs[i-1],&current,sizeof(current));
9503           branch_regs[i-1].isconst=0;
9504           branch_regs[i-1].wasconst=0;
9505           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9506           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9507           alloc_cc(&branch_regs[i-1],i-1);
9508           dirty_reg(&branch_regs[i-1],CCREG);
9509           if(rt1[i-1]==31) { // JAL
9510             alloc_reg(&branch_regs[i-1],i-1,31);
9511             dirty_reg(&branch_regs[i-1],31);
9512             branch_regs[i-1].is32|=1LL<<31;
9513           }
9514           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9515           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9516           break;
9517         case RJUMP:
9518           memcpy(&branch_regs[i-1],&current,sizeof(current));
9519           branch_regs[i-1].isconst=0;
9520           branch_regs[i-1].wasconst=0;
9521           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9522           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9523           alloc_cc(&branch_regs[i-1],i-1);
9524           dirty_reg(&branch_regs[i-1],CCREG);
9525           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
9526           if(rt1[i-1]!=0) { // JALR
9527             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
9528             dirty_reg(&branch_regs[i-1],rt1[i-1]);
9529             branch_regs[i-1].is32|=1LL<<rt1[i-1];
9530           }
9531           #ifdef USE_MINI_HT
9532           if(rs1[i-1]==31) { // JALR
9533             alloc_reg(&branch_regs[i-1],i-1,RHASH);
9534             #ifndef HOST_IMM_ADDR32
9535             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
9536             #endif
9537           }
9538           #endif
9539           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9540           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9541           break;
9542         case CJUMP:
9543           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
9544           {
9545             alloc_cc(&current,i-1);
9546             dirty_reg(&current,CCREG);
9547             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
9548                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
9549               // The delay slot overwrote one of our conditions
9550               // Delay slot goes after the test (in order)
9551               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9552               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9553               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9554               current.u|=1;
9555               current.uu|=1;
9556               delayslot_alloc(&current,i);
9557               current.isconst=0;
9558             }
9559             else
9560             {
9561               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9562               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9563               // Alloc the branch condition registers
9564               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
9565               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
9566               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
9567               {
9568                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
9569                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
9570               }
9571             }
9572             memcpy(&branch_regs[i-1],&current,sizeof(current));
9573             branch_regs[i-1].isconst=0;
9574             branch_regs[i-1].wasconst=0;
9575             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9576             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9577           }
9578           else
9579           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
9580           {
9581             alloc_cc(&current,i-1);
9582             dirty_reg(&current,CCREG);
9583             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9584               // The delay slot overwrote the branch condition
9585               // Delay slot goes after the test (in order)
9586               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9587               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9588               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9589               current.u|=1;
9590               current.uu|=1;
9591               delayslot_alloc(&current,i);
9592               current.isconst=0;
9593             }
9594             else
9595             {
9596               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9597               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9598               // Alloc the branch condition register
9599               alloc_reg(&current,i-1,rs1[i-1]);
9600               if(!(current.is32>>rs1[i-1]&1))
9601               {
9602                 alloc_reg64(&current,i-1,rs1[i-1]);
9603               }
9604             }
9605             memcpy(&branch_regs[i-1],&current,sizeof(current));
9606             branch_regs[i-1].isconst=0;
9607             branch_regs[i-1].wasconst=0;
9608             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9609             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9610           }
9611           else
9612           // Alloc the delay slot in case the branch is taken
9613           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
9614           {
9615             memcpy(&branch_regs[i-1],&current,sizeof(current));
9616             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9617             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9618             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9619             alloc_cc(&branch_regs[i-1],i);
9620             dirty_reg(&branch_regs[i-1],CCREG);
9621             delayslot_alloc(&branch_regs[i-1],i);
9622             branch_regs[i-1].isconst=0;
9623             alloc_reg(&current,i,CCREG); // Not taken path
9624             dirty_reg(&current,CCREG);
9625             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9626           }
9627           else
9628           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
9629           {
9630             memcpy(&branch_regs[i-1],&current,sizeof(current));
9631             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9632             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9633             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9634             alloc_cc(&branch_regs[i-1],i);
9635             dirty_reg(&branch_regs[i-1],CCREG);
9636             delayslot_alloc(&branch_regs[i-1],i);
9637             branch_regs[i-1].isconst=0;
9638             alloc_reg(&current,i,CCREG); // Not taken path
9639             dirty_reg(&current,CCREG);
9640             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9641           }
9642           break;
9643         case SJUMP:
9644           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
9645           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ (mask also matches BLTZAL/BGEZAL)
9646           {
9647             alloc_cc(&current,i-1);
9648             dirty_reg(&current,CCREG);
9649             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9650               // The delay slot overwrote the branch condition
9651               // Delay slot goes after the test (in order)
9652               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9653               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9654               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9655               current.u|=1;
9656               current.uu|=1;
9657               delayslot_alloc(&current,i);
9658               current.isconst=0;
9659             }
9660             else
9661             {
9662               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9663               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9664               // Alloc the branch condition register
9665               alloc_reg(&current,i-1,rs1[i-1]);
9666               if(!(current.is32>>rs1[i-1]&1))
9667               {
9668                 alloc_reg64(&current,i-1,rs1[i-1]);
9669               }
9670             }
9671             memcpy(&branch_regs[i-1],&current,sizeof(current));
9672             branch_regs[i-1].isconst=0;
9673             branch_regs[i-1].wasconst=0;
9674             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9675             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9676           }
9677           else
9678           // Alloc the delay slot in case the branch is taken
9679           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
9680           {
9681             memcpy(&branch_regs[i-1],&current,sizeof(current));
9682             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9683             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9684             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9685             alloc_cc(&branch_regs[i-1],i);
9686             dirty_reg(&branch_regs[i-1],CCREG);
9687             delayslot_alloc(&branch_regs[i-1],i);
9688             branch_regs[i-1].isconst=0;
9689             alloc_reg(&current,i,CCREG); // Not taken path
9690             dirty_reg(&current,CCREG);
9691             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9692           }
9693           // FIXME: BLTZAL/BGEZAL
9694           if(opcode2[i-1]&0x10) { // BxxZAL
9695             alloc_reg(&branch_regs[i-1],i-1,31);
9696             dirty_reg(&branch_regs[i-1],31);
9697             branch_regs[i-1].is32|=1LL<<31;
9698           }
9699           break;
9700         case FJUMP:
9701           if(likely[i-1]==0) // BC1F/BC1T
9702           {
9703             alloc_cc(&current,i-1);
9704             dirty_reg(&current,CCREG);
9705             if(itype[i]==FCOMP) {
9706               // The delay slot overwrote the branch condition
9707               // Delay slot goes after the test (in order)
9708               delayslot_alloc(&current,i);
9709               current.isconst=0;
9710             }
9711             else
9712             {
9713               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9714               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9715               // Alloc the branch condition register
9716               alloc_reg(&current,i-1,FSREG);
9717             }
9718             memcpy(&branch_regs[i-1],&current,sizeof(current));
9719             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9720           }
9721           else // BC1FL/BC1TL
9722           {
9723             // Alloc the delay slot in case the branch is taken
9724             memcpy(&branch_regs[i-1],&current,sizeof(current));
9725             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9726             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9727             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9728             alloc_cc(&branch_regs[i-1],i);
9729             dirty_reg(&branch_regs[i-1],CCREG);
9730             delayslot_alloc(&branch_regs[i-1],i);
9731             branch_regs[i-1].isconst=0;
9732             alloc_reg(&current,i,CCREG); // Not taken path
9733             dirty_reg(&current,CCREG);
9734             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9735           }
9736           break;
9737       }
9738
9739       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
9740       {
9741         if(rt1[i-1]==31) // JAL/JALR
9742         {
9743           // Subroutine call will return here, don't alloc any registers
9744           current.is32=1;
9745           current.dirty=0;
9746           clear_all_regs(current.regmap);
9747           alloc_reg(&current,i,CCREG);
9748           dirty_reg(&current,CCREG);
9749         }
9750         else if(i+1<slen)
9751         {
9752           // Internal branch will jump here, match registers to the branches that target this point
9753           current.is32=0x3FFFFFFFFLL;
9754           current.dirty=0;
9755           clear_all_regs(current.regmap);
9756           alloc_reg(&current,i,CCREG);
9757           dirty_reg(&current,CCREG);
9758           for(j=i-1;j>=0;j--)
9759           {
9760             if(ba[j]==start+i*4+4) {
9761               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9762               current.is32=branch_regs[j].is32;
9763               current.dirty=branch_regs[j].dirty;
9764               break;
9765             }
9766           }
9767           while(j>=0) {
9768             if(ba[j]==start+i*4+4) {
9769               for(hr=0;hr<HOST_REGS;hr++) {
9770                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9771                   current.regmap[hr]=-1;
9772                 }
9773                 current.is32&=branch_regs[j].is32;
9774                 current.dirty&=branch_regs[j].dirty;
9775               }
9776             }
9777             j--;
9778           }
9779         }
9780       }
9781     }
9782
9783     // Count cycles in between branches
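         // ccadj[i] holds the cycles accumulated since the last branch; cc is reset
         // after branches and at syscalls so each stretch of straight-line code is
         // charged separately.  On PCSX builds, rough extra costs are added for GTE
         // operations and memory accesses.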
9784     ccadj[i]=cc;
9785     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
9786     {
9787       cc=0;
9788     }
9789 #if defined(PCSX) && !defined(DRC_DBG)
9790     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
9791     {
9792       // GTE runs in parallel until accessed, divide by 2 for a rough guess
9793       cc+=gte_cycletab[source[i]&0x3f]/2;
9794     }
9795     else if(/*itype[i]==LOAD||*/itype[i]==STORE||itype[i]==C1LS) // load causes weird timing issues
9796     {
9797       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
9798     }
9799     else if(itype[i]==C2LS)
9800     {
9801       cc+=4;
9802     }
9803 #endif
9804     else
9805     {
9806       cc++;
9807     }
9808
9809     flush_dirty_uppers(&current);
9810     if(!is_ds[i]) {
9811       regs[i].is32=current.is32;
9812       regs[i].dirty=current.dirty;
9813       regs[i].isconst=current.isconst;
9814       memcpy(constmap[i],current.constmap,sizeof(current.constmap));
9815     }
9816     for(hr=0;hr<HOST_REGS;hr++) {
9817       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9818         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9819           regs[i].wasconst&=~(1<<hr);
9820         }
9821       }
9822     }
9823     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9824     regs[i].waswritten=current.waswritten;
9825   }
9826   
9827   /* Pass 4 - Cull unused host registers */
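       // Walk the block backwards maintaining nr, a bitmask of host registers whose
       // contents are still needed (branch targets, delay-slot and source operands,
       // INVCP, the cycle count).  Mappings not covered by nr are dropped below so
       // later passes can reuse those host registers.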
9828   
9829   uint64_t nr=0;
9830   
9831   for (i=slen-1;i>=0;i--)
9832   {
9833     int hr;
9834     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9835     {
9836       if(ba[i]<start || ba[i]>=(start+slen*4))
9837       {
9838         // Branch out of this block, don't need anything
9839         nr=0;
9840       }
9841       else
9842       {
9843         // Internal branch
9844         // Need whatever matches the target
9845         nr=0;
9846         int t=(ba[i]-start)>>2;
9847         for(hr=0;hr<HOST_REGS;hr++)
9848         {
9849           if(regs[i].regmap_entry[hr]>=0) {
9850             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9851           }
9852         }
9853       }
9854       // Conditional branch may need registers for following instructions
9855       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9856       {
9857         if(i<slen-2) {
9858           nr|=needed_reg[i+2];
9859           for(hr=0;hr<HOST_REGS;hr++)
9860           {
9861             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9862             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9863           }
9864         }
9865       }
9866       // Don't need stuff which is overwritten
9867       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9868       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9869       // Merge in delay slot
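           // The delay slot's source registers must stay live across the branch; its
           // destinations only make a host register unneeded when the slot executes
           // unconditionally, i.e. when the branch is not a "likely" variant.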
9870       for(hr=0;hr<HOST_REGS;hr++)
9871       {
9872         if(!likely[i]) {
9873           // These are overwritten unless the branch is "likely"
9874           // and the delay slot is nullified if not taken
9875           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9876           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9877         }
9878         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9879         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9880         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9881         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9882         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9883         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9884         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9885         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9886         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9887           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9888           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9889         }
9890         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9891           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9892           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9893         }
9894         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
9895           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9896           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9897         }
9898       }
9899     }
9900     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
9901     {
9902       // SYSCALL instruction (software interrupt)
9903       nr=0;
9904     }
9905     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9906     {
9907       // ERET instruction (return from interrupt)
9908       nr=0;
9909     }
9910     else // Non-branch
9911     {
9912       if(i<slen-1) {
9913         for(hr=0;hr<HOST_REGS;hr++) {
9914           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9915           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9916           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9917           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9918         }
9919       }
9920     }
9921     for(hr=0;hr<HOST_REGS;hr++)
9922     {
9923       // Overwritten registers are not needed
9924       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9925       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9926       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9927       // Source registers are needed
9928       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9929       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9930       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9931       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9932       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9933       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9934       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9935       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9936       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9937         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9938         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9939       }
9940       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9941         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9942         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9943       }
9944       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
9945         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9946         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9947       }
9948       // Don't store a register immediately after writing it,
9949       // as that may prevent dual-issue.
9950       // But do so if this is a branch target, otherwise we
9951       // might have to load the register before the branch.
9952       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9953         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9954            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9955           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9956           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9957         }
9958         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9959            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9960           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9961           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9962         }
9963       }
9964     }
9965     // Cycle count is needed at branches.  Assume it is needed at the target too.
9966     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9967       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9968       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9969     }
9970     // Save it
9971     needed_reg[i]=nr;
9972     
9973     // Deallocate unneeded registers
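         // Even when a register is not in nr, the mapping is only dropped if it is
         // not a source/destination of this instruction (or of the delay slot, for
         // branches) and not one of the temporaries the generated code still needs
         // (FTEMP, PTEMP, RHASH/RHTBL/RTEMP, INVCP/TLREG, CCREG).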
9974     for(hr=0;hr<HOST_REGS;hr++)
9975     {
9976       if(!((nr>>hr)&1)) {
9977         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9978         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9979            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9980            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9981         {
9982           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9983           {
9984             if(likely[i]) {
9985               regs[i].regmap[hr]=-1;
9986               regs[i].isconst&=~(1<<hr);
9987               if(i<slen-2) {
9988                 regmap_pre[i+2][hr]=-1;
9989                 regs[i+2].wasconst&=~(1<<hr);
9990               }
9991             }
9992           }
9993         }
9994         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9995         {
9996           int d1=0,d2=0,map=0,temp=0;
9997           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9998           {
9999             d1=dep1[i+1];
10000             d2=dep2[i+1];
10001           }
10002           if(using_tlb) {
10003             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
10004                itype[i+1]==STORE || itype[i+1]==STORELR ||
10005                itype[i+1]==C1LS || itype[i+1]==C2LS)
10006             map=TLREG;
10007           } else
10008           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
10009              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
10010             map=INVCP;
10011           }
10012           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
10013              itype[i+1]==C1LS || itype[i+1]==C2LS)
10014             temp=FTEMP;
10015           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
10016              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
10017              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
10018              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
10019              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
10020              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
10021              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
10022              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
10023              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
10024              regs[i].regmap[hr]!=map )
10025           {
10026             regs[i].regmap[hr]=-1;
10027             regs[i].isconst&=~(1<<hr);
10028             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
10029                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
10030                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
10031                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
10032                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
10033                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
10034                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
10035                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
10036                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
10037                branch_regs[i].regmap[hr]!=map)
10038             {
10039               branch_regs[i].regmap[hr]=-1;
10040               branch_regs[i].regmap_entry[hr]=-1;
10041               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10042               {
10043                 if(!likely[i]&&i<slen-2) {
10044                   regmap_pre[i+2][hr]=-1;
10045                   regs[i+2].wasconst&=~(1<<hr);
10046                 }
10047               }
10048             }
10049           }
10050         }
10051         else
10052         {
10053           // Non-branch
10054           if(i>0)
10055           {
10056             int d1=0,d2=0,map=-1,temp=-1;
10057             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
10058             {
10059               d1=dep1[i];
10060               d2=dep2[i];
10061             }
10062             if(using_tlb) {
10063               if(itype[i]==LOAD || itype[i]==LOADLR ||
10064                  itype[i]==STORE || itype[i]==STORELR ||
10065                  itype[i]==C1LS || itype[i]==C2LS)
10066               map=TLREG;
10067             } else if(itype[i]==STORE || itype[i]==STORELR ||
10068                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
10069               map=INVCP;
10070             }
10071             if(itype[i]==LOADLR || itype[i]==STORELR ||
10072                itype[i]==C1LS || itype[i]==C2LS)
10073               temp=FTEMP;
10074             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
10075                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
10076                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
10077                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
10078                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
10079                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
10080             {
10081               if(i<slen-1&&!is_ds[i]) {
10082                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
10083                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
10084                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
10085                 {
10086                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
10087                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
10088                 }
10089                 regmap_pre[i+1][hr]=-1;
10090                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
10091                 regs[i+1].wasconst&=~(1<<hr);
10092               }
10093               regs[i].regmap[hr]=-1;
10094               regs[i].isconst&=~(1<<hr);
10095             }
10096           }
10097         }
10098       }
10099     }
10100   }
10101   
10102   /* Pass 5 - Pre-allocate registers */
10103   
10104   // If a register is allocated during a loop, try to allocate it for the
10105   // entire loop, if possible.  This avoids loading/storing registers
10106   // inside of the loop.
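        // f_regmap holds the candidate block-wide mapping.  When a backward (loop)
        // branch is found, the allocation is extended across the loop body, from the
        // loop head up to the branch, so the value stays in the same host register
        // across iterations.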
10107   
10108   signed char f_regmap[HOST_REGS];
10109   clear_all_regs(f_regmap);
10110   for(i=0;i<slen-1;i++)
10111   {
10112     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10113     {
10114       if(ba[i]>=start && ba[i]<(start+i*4)) 
10115       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
10116       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
10117       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
10118       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
10119       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
10120       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
10121       {
10122         int t=(ba[i]-start)>>2;
10123         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
10124         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
10125         for(hr=0;hr<HOST_REGS;hr++)
10126         {
10127           if(regs[i].regmap[hr]>64) {
10128             if(!((regs[i].dirty>>hr)&1))
10129               f_regmap[hr]=regs[i].regmap[hr];
10130             else f_regmap[hr]=-1;
10131           }
10132           else if(regs[i].regmap[hr]>=0) {
10133             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10134               // dealloc old register
10135               int n;
10136               for(n=0;n<HOST_REGS;n++)
10137               {
10138                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10139               }
10140               // and alloc new one
10141               f_regmap[hr]=regs[i].regmap[hr];
10142             }
10143           }
10144           if(branch_regs[i].regmap[hr]>64) {
10145             if(!((branch_regs[i].dirty>>hr)&1))
10146               f_regmap[hr]=branch_regs[i].regmap[hr];
10147             else f_regmap[hr]=-1;
10148           }
10149           else if(branch_regs[i].regmap[hr]>=0) {
10150             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
10151               // dealloc old register
10152               int n;
10153               for(n=0;n<HOST_REGS;n++)
10154               {
10155                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
10156               }
10157               // and alloc new one
10158               f_regmap[hr]=branch_regs[i].regmap[hr];
10159             }
10160           }
10161           if(ooo[i]) {
10162             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) 
10163               f_regmap[hr]=branch_regs[i].regmap[hr];
10164           }else{
10165             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) 
10166               f_regmap[hr]=branch_regs[i].regmap[hr];
10167           }
10168           // Avoid dirty->clean transition
10169           #ifdef DESTRUCTIVE_WRITEBACK
10170           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
10171           #endif
10172           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
10173           // case above, but it's always a good idea.  We can't hoist the
10174           // load if the register was already allocated, so there's no point
10175           // wasting time analyzing most of these cases.  It only "succeeds"
10176           // when the mapping was different and the load can be replaced with
10177           // a mov, which is of negligible benefit.  So such cases are
10178           // skipped below.
10179           if(f_regmap[hr]>0) {
10180             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
10181               int r=f_regmap[hr];
10182               for(j=t;j<=i;j++)
10183               {
10184                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
10185                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
10186                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
10187                 if(r>63) {
10188                   // NB This can exclude the case where the upper-half
10189                   // register is lower numbered than the lower-half
10190                   // register.  Not sure if it's worth fixing...
10191                   if(get_reg(regs[j].regmap,r&63)<0) break;
10192                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
10193                   if(regs[j].is32&(1LL<<(r&63))) break;
10194                 }
10195                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
10196                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
10197                   int k;
10198                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
10199                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
10200                     if(r>63) {
10201                       if(get_reg(regs[i].regmap,r&63)<0) break;
10202                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
10203                     }
10204                     k=i;
10205                     while(k>1&&regs[k-1].regmap[hr]==-1) {
10206                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10207                         //printf("no free regs for store %x\n",start+(k-1)*4);
10208                         break;
10209                       }
10210                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
10211                         //printf("no-match due to different register\n");
10212                         break;
10213                       }
10214                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
10215                         //printf("no-match due to branch\n");
10216                         break;
10217                       }
10218                       // call/ret fast path assumes no registers allocated
10219                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
10220                         break;
10221                       }
10222                       if(r>63) {
10223                         // NB This can exclude the case where the upper-half
10224                         // register is lower numbered than the lower-half
10225                         // register.  Not sure if it's worth fixing...
10226                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
10227                         if(regs[k-1].is32&(1LL<<(r&63))) break;
10228                       }
10229                       k--;
10230                     }
10231                     if(i<slen-1) {
10232                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
10233                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
10234                         //printf("bad match after branch\n");
10235                         break;
10236                       }
10237                     }
10238                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
10239                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
10240                       while(k<i) {
10241                         regs[k].regmap_entry[hr]=f_regmap[hr];
10242                         regs[k].regmap[hr]=f_regmap[hr];
10243                         regmap_pre[k+1][hr]=f_regmap[hr];
10244                         regs[k].wasdirty&=~(1<<hr);
10245                         regs[k].dirty&=~(1<<hr);
10246                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
10247                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
10248                         regs[k].wasconst&=~(1<<hr);
10249                         regs[k].isconst&=~(1<<hr);
10250                         k++;
10251                       }
10252                     }
10253                     else {
10254                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
10255                       break;
10256                     }
10257                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
10258                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
10259                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
10260                       regs[i].regmap_entry[hr]=f_regmap[hr];
10261                       regs[i].regmap[hr]=f_regmap[hr];
10262                       regs[i].wasdirty&=~(1<<hr);
10263                       regs[i].dirty&=~(1<<hr);
10264                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
10265                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
10266                       regs[i].wasconst&=~(1<<hr);
10267                       regs[i].isconst&=~(1<<hr);
10268                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
10269                       branch_regs[i].wasdirty&=~(1<<hr);
10270                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
10271                       branch_regs[i].regmap[hr]=f_regmap[hr];
10272                       branch_regs[i].dirty&=~(1<<hr);
10273                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
10274                       branch_regs[i].wasconst&=~(1<<hr);
10275                       branch_regs[i].isconst&=~(1<<hr);
10276                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
10277                         regmap_pre[i+2][hr]=f_regmap[hr];
10278                         regs[i+2].wasdirty&=~(1<<hr);
10279                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
10280                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
10281                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
10282                       }
10283                     }
10284                   }
10285                   for(k=t;k<j;k++) {
10286                     // Alloc register clean at beginning of loop,
10287                     // but may dirty it in pass 6
10288                     regs[k].regmap_entry[hr]=f_regmap[hr];
10289                     regs[k].regmap[hr]=f_regmap[hr];
10290                     regs[k].dirty&=~(1<<hr);
10291                     regs[k].wasconst&=~(1<<hr);
10292                     regs[k].isconst&=~(1<<hr);
10293                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
10294                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
10295                       branch_regs[k].regmap[hr]=f_regmap[hr];
10296                       branch_regs[k].dirty&=~(1<<hr);
10297                       branch_regs[k].wasconst&=~(1<<hr);
10298                       branch_regs[k].isconst&=~(1<<hr);
10299                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
10300                         regmap_pre[k+2][hr]=f_regmap[hr];
10301                         regs[k+2].wasdirty&=~(1<<hr);
10302                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
10303                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
10304                       }
10305                     }
10306                     else
10307                     {
10308                       regmap_pre[k+1][hr]=f_regmap[hr];
10309                       regs[k+1].wasdirty&=~(1<<hr);
10310                     }
10311                   }
10312                   if(regs[j].regmap[hr]==f_regmap[hr])
10313                     regs[j].regmap_entry[hr]=f_regmap[hr];
10314                   break;
10315                 }
10316                 if(j==i) break;
10317                 if(regs[j].regmap[hr]>=0)
10318                   break;
10319                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
10320                   //printf("no-match due to different register\n");
10321                   break;
10322                 }
10323                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
10324                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
10325                   break;
10326                 }
10327                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10328                 {
10329                   // Stop on unconditional branch
10330                   break;
10331                 }
10332                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
10333                 {
10334                   if(ooo[j]) {
10335                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) 
10336                       break;
10337                   }else{
10338                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) 
10339                       break;
10340                   }
10341                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
10342                     //printf("no-match due to different register (branch)\n");
10343                     break;
10344                   }
10345                 }
10346                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10347                   //printf("No free regs for store %x\n",start+j*4);
10348                   break;
10349                 }
10350                 if(f_regmap[hr]>=64) {
10351                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
10352                     break;
10353                   }
10354                   else
10355                   {
10356                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
10357                       break;
10358                     }
10359                   }
10360                 }
10361               }
10362             }
10363           }
10364         }
10365       }
10366     }else{
10367       // Non branch or undetermined branch target
10368       for(hr=0;hr<HOST_REGS;hr++)
10369       {
10370         if(hr!=EXCLUDE_REG) {
10371           if(regs[i].regmap[hr]>64) {
10372             if(!((regs[i].dirty>>hr)&1))
10373               f_regmap[hr]=regs[i].regmap[hr];
10374           }
10375           else if(regs[i].regmap[hr]>=0) {
10376             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10377               // dealloc old register
10378               int n;
10379               for(n=0;n<HOST_REGS;n++)
10380               {
10381                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10382               }
10383               // and alloc new one
10384               f_regmap[hr]=regs[i].regmap[hr];
10385             }
10386           }
10387         }
10388       }
10389       // Try to restore cycle count at branch targets
10390       if(bt[i]) {
10391         for(j=i;j<slen-1;j++) {
10392           if(regs[j].regmap[HOST_CCREG]!=-1) break;
10393           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10394             //printf("no free regs for store %x\n",start+j*4);
10395             break;
10396           }
10397         }
10398         if(regs[j].regmap[HOST_CCREG]==CCREG) {
10399           int k=i;
10400           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
10401           while(k<j) {
10402             regs[k].regmap_entry[HOST_CCREG]=CCREG;
10403             regs[k].regmap[HOST_CCREG]=CCREG;
10404             regmap_pre[k+1][HOST_CCREG]=CCREG;
10405             regs[k+1].wasdirty|=1<<HOST_CCREG;
10406             regs[k].dirty|=1<<HOST_CCREG;
10407             regs[k].wasconst&=~(1<<HOST_CCREG);
10408             regs[k].isconst&=~(1<<HOST_CCREG);
10409             k++;
10410           }
10411           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
10412         }
10413         // Work backwards from the branch target
10414         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
10415         {
10416           //printf("Extend backwards\n");
10417           int k;
10418           k=i;
10419           while(regs[k-1].regmap[HOST_CCREG]==-1) {
10420             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10421               //printf("no free regs for store %x\n",start+(k-1)*4);
10422               break;
10423             }
10424             k--;
10425           }
10426           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
10427             //printf("Extend CC, %x ->\n",start+k*4);
10428             while(k<=i) {
10429               regs[k].regmap_entry[HOST_CCREG]=CCREG;
10430               regs[k].regmap[HOST_CCREG]=CCREG;
10431               regmap_pre[k+1][HOST_CCREG]=CCREG;
10432               regs[k+1].wasdirty|=1<<HOST_CCREG;
10433               regs[k].dirty|=1<<HOST_CCREG;
10434               regs[k].wasconst&=~(1<<HOST_CCREG);
10435               regs[k].isconst&=~(1<<HOST_CCREG);
10436               k++;
10437             }
10438           }
10439           else {
10440             //printf("Fail Extend CC, %x ->\n",start+k*4);
10441           }
10442         }
10443       }
10444       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
10445          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
10446          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
10447          itype[i]!=FCONV&&itype[i]!=FCOMP)
10448       {
10449         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
10450       }
10451     }
10452   }
10453   
10454   // Cache memory offset or tlb map pointer if a register is available
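        // Scan forward from each run of loads/stores and score every free host
        // register by how many memory accesses (and loop-back branch targets) could
        // reuse a cached RAM offset / TLB map pointer kept in it, then allocate the
        // best-scoring register over that whole range.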
10455   #ifndef HOST_IMM_ADDR32
10456   #ifndef RAM_OFFSET
10457   if(using_tlb)
10458   #endif
10459   {
10460     int earliest_available[HOST_REGS];
10461     int loop_start[HOST_REGS];
10462     int score[HOST_REGS];
10463     int end[HOST_REGS];
10464     int reg=using_tlb?MMREG:ROREG;
10465
10466     // Init
10467     for(hr=0;hr<HOST_REGS;hr++) {
10468       score[hr]=0;earliest_available[hr]=0;
10469       loop_start[hr]=MAXBLOCK;
10470     }
10471     for(i=0;i<slen-1;i++)
10472     {
10473       // Can't do anything if no registers are available
10474       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
10475         for(hr=0;hr<HOST_REGS;hr++) {
10476           score[hr]=0;earliest_available[hr]=i+1;
10477           loop_start[hr]=MAXBLOCK;
10478         }
10479       }
10480       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10481         if(!ooo[i]) {
10482           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
10483             for(hr=0;hr<HOST_REGS;hr++) {
10484               score[hr]=0;earliest_available[hr]=i+1;
10485               loop_start[hr]=MAXBLOCK;
10486             }
10487           }
10488         }else{
10489           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
10490             for(hr=0;hr<HOST_REGS;hr++) {
10491               score[hr]=0;earliest_available[hr]=i+1;
10492               loop_start[hr]=MAXBLOCK;
10493             }
10494           }
10495         }
10496       }
10497       // Mark unavailable registers
10498       for(hr=0;hr<HOST_REGS;hr++) {
10499         if(regs[i].regmap[hr]>=0) {
10500           score[hr]=0;earliest_available[hr]=i+1;
10501           loop_start[hr]=MAXBLOCK;
10502         }
10503         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10504           if(branch_regs[i].regmap[hr]>=0) {
10505             score[hr]=0;earliest_available[hr]=i+2;
10506             loop_start[hr]=MAXBLOCK;
10507           }
10508         }
10509       }
10510       // No register allocations after unconditional jumps
10511       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10512       {
10513         for(hr=0;hr<HOST_REGS;hr++) {
10514           score[hr]=0;earliest_available[hr]=i+2;
10515           loop_start[hr]=MAXBLOCK;
10516         }
10517         i++; // Skip delay slot too
10518         //printf("skip delay slot: %x\n",start+i*4);
10519       }
10520       else
10521       // Possible match
10522       if(itype[i]==LOAD||itype[i]==LOADLR||
10523          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
10524         for(hr=0;hr<HOST_REGS;hr++) {
10525           if(hr!=EXCLUDE_REG) {
10526             end[hr]=i-1;
10527             for(j=i;j<slen-1;j++) {
10528               if(regs[j].regmap[hr]>=0) break;
10529               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10530                 if(branch_regs[j].regmap[hr]>=0) break;
10531                 if(ooo[j]) {
10532                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
10533                 }else{
10534                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
10535                 }
10536               }
10537               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
10538               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10539                 int t=(ba[j]-start)>>2;
10540                 if(t<j&&t>=earliest_available[hr]) {
10541                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
10542                     // Score a point for hoisting loop invariant
10543                     if(t<loop_start[hr]) loop_start[hr]=t;
10544                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
10545                     score[hr]++;
10546                     end[hr]=j;
10547                   }
10548                 }
10549                 else if(t<j) {
10550                   if(regs[t].regmap[hr]==reg) {
10551                     // Score a point if the branch target matches this register
10552                     score[hr]++;
10553                     end[hr]=j;
10554                   }
10555                 }
10556                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
10557                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
10558                   score[hr]++;
10559                   end[hr]=j;
10560                 }
10561               }
10562               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10563               {
10564                 // Stop on unconditional branch
10565                 break;
10566               }
10567               else
10568               if(itype[j]==LOAD||itype[j]==LOADLR||
10569                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
10570                 score[hr]++;
10571                 end[hr]=j;
10572               }
10573             }
10574           }
10575         }
10576         // Find highest score and allocate that register
10577         int maxscore=0;
10578         for(hr=0;hr<HOST_REGS;hr++) {
10579           if(hr!=EXCLUDE_REG) {
10580             if(score[hr]>score[maxscore]) {
10581               maxscore=hr;
10582               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
10583             }
10584           }
10585         }
10586         if(score[maxscore]>1)
10587         {
10588           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
10589           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
10590             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
10591             assert(regs[j].regmap[maxscore]<0);
10592             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
10593             regs[j].regmap[maxscore]=reg;
10594             regs[j].dirty&=~(1<<maxscore);
10595             regs[j].wasconst&=~(1<<maxscore);
10596             regs[j].isconst&=~(1<<maxscore);
10597             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10598               branch_regs[j].regmap[maxscore]=reg;
10599               branch_regs[j].wasdirty&=~(1<<maxscore);
10600               branch_regs[j].dirty&=~(1<<maxscore);
10601               branch_regs[j].wasconst&=~(1<<maxscore);
10602               branch_regs[j].isconst&=~(1<<maxscore);
10603               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
10604                 regmap_pre[j+2][maxscore]=reg;
10605                 regs[j+2].wasdirty&=~(1<<maxscore);
10606               }
10607               // loop optimization (loop_preload)
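                    // (Roughly: when the branch target is the start of the
                    // loop, its entry map is set here so the value is loaded
                    // once before the loop is entered instead of being
                    // reloaded on every iteration.)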
10608               int t=(ba[j]-start)>>2;
10609               if(t==loop_start[maxscore]) {
10610                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
10611                   regs[t].regmap_entry[maxscore]=reg;
10612               }
10613             }
10614             else
10615             {
10616               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
10617                 regmap_pre[j+1][maxscore]=reg;
10618                 regs[j+1].wasdirty&=~(1<<maxscore);
10619               }
10620             }
10621           }
10622           i=j-1;
10623           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
10624           for(hr=0;hr<HOST_REGS;hr++) {
10625             score[hr]=0;earliest_available[hr]=i+1;
10626             loop_start[hr]=MAXBLOCK;
10627           }
10628         }
10629       }
10630     }
10631   }
10632   #endif
10633   
10634   // This allocates registers (if possible) one instruction prior
10635   // to use, which can avoid a load-use penalty on certain CPUs.
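        // Roughly (illustrative only): if instruction i+1 is "addu $v0,$a0,$a1"
        // and $a0 is not yet cached in a host register, extending i+1's
        // mapping back to instruction i lets the host-register fill of $a0
        // be emitted one slot earlier, so the fill is not immediately
        // followed by its first use.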
10636   for(i=0;i<slen-1;i++)
10637   {
10638     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
10639     {
10640       if(!bt[i+1])
10641       {
10642         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
10643            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
10644         {
10645           if(rs1[i+1]) {
10646             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
10647             {
10648               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10649               {
10650                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10651                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10652                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10653                 regs[i].isconst&=~(1<<hr);
10654                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10655                 constmap[i][hr]=constmap[i+1][hr];
10656                 regs[i+1].wasdirty&=~(1<<hr);
10657                 regs[i].dirty&=~(1<<hr);
10658               }
10659             }
10660           }
10661           if(rs2[i+1]) {
10662             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
10663             {
10664               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10665               {
10666                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10667                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10668                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10669                 regs[i].isconst&=~(1<<hr);
10670                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10671                 constmap[i][hr]=constmap[i+1][hr];
10672                 regs[i+1].wasdirty&=~(1<<hr);
10673                 regs[i].dirty&=~(1<<hr);
10674               }
10675             }
10676           }
10677           // Preload target address for load instruction (non-constant)
10678           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10679             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10680             {
10681               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10682               {
10683                 regs[i].regmap[hr]=rs1[i+1];
10684                 regmap_pre[i+1][hr]=rs1[i+1];
10685                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10686                 regs[i].isconst&=~(1<<hr);
10687                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10688                 constmap[i][hr]=constmap[i+1][hr];
10689                 regs[i+1].wasdirty&=~(1<<hr);
10690                 regs[i].dirty&=~(1<<hr);
10691               }
10692             }
10693           }
10694           // Load source into target register 
10695           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10696             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10697             {
10698               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10699               {
10700                 regs[i].regmap[hr]=rs1[i+1];
10701                 regmap_pre[i+1][hr]=rs1[i+1];
10702                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10703                 regs[i].isconst&=~(1<<hr);
10704                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10705                 constmap[i][hr]=constmap[i+1][hr];
10706                 regs[i+1].wasdirty&=~(1<<hr);
10707                 regs[i].dirty&=~(1<<hr);
10708               }
10709             }
10710           }
10711           // Preload map address
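                // (TLREG appears to hold the memory-map pointer used for
                // address translation, and MGEN1/MGEN2 look like per-slot
                // scratch names, so the translated value can be generated
                // one instruction early when the base register is already
                // known to be constant.)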
10712           #ifndef HOST_IMM_ADDR32
10713           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
10714             hr=get_reg(regs[i+1].regmap,TLREG);
10715             if(hr>=0) {
10716               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
10717               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
10718                 int nr;
10719                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10720                 {
10721                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
10722                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
10723                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
10724                   regs[i].isconst&=~(1<<hr);
10725                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10726                   constmap[i][hr]=constmap[i+1][hr];
10727                   regs[i+1].wasdirty&=~(1<<hr);
10728                   regs[i].dirty&=~(1<<hr);
10729                 }
10730                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10731                 {
10732                   // move it to another register
10733                   regs[i+1].regmap[hr]=-1;
10734                   regmap_pre[i+2][hr]=-1;
10735                   regs[i+1].regmap[nr]=TLREG;
10736                   regmap_pre[i+2][nr]=TLREG;
10737                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
10738                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
10739                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
10740                   regs[i].isconst&=~(1<<nr);
10741                   regs[i+1].isconst&=~(1<<nr);
10742                   regs[i].dirty&=~(1<<nr);
10743                   regs[i+1].wasdirty&=~(1<<nr);
10744                   regs[i+1].dirty&=~(1<<nr);
10745                   regs[i+2].wasdirty&=~(1<<nr);
10746                 }
10747               }
10748             }
10749           }
10750           #endif
10751           // Address for store instruction (non-constant)
10752           if(itype[i+1]==STORE||itype[i+1]==STORELR
10753              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
10754             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10755               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
10756               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10757               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
10758               assert(hr>=0);
10759               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10760               {
10761                 regs[i].regmap[hr]=rs1[i+1];
10762                 regmap_pre[i+1][hr]=rs1[i+1];
10763                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10764                 regs[i].isconst&=~(1<<hr);
10765                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10766                 constmap[i][hr]=constmap[i+1][hr];
10767                 regs[i+1].wasdirty&=~(1<<hr);
10768                 regs[i].dirty&=~(1<<hr);
10769               }
10770             }
10771           }
10772           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
10773             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10774               int nr;
10775               hr=get_reg(regs[i+1].regmap,FTEMP);
10776               assert(hr>=0);
10777               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10778               {
10779                 regs[i].regmap[hr]=rs1[i+1];
10780                 regmap_pre[i+1][hr]=rs1[i+1];
10781                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10782                 regs[i].isconst&=~(1<<hr);
10783                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10784                 constmap[i][hr]=constmap[i+1][hr];
10785                 regs[i+1].wasdirty&=~(1<<hr);
10786                 regs[i].dirty&=~(1<<hr);
10787               }
10788               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10789               {
10790                 // move it to another register
10791                 regs[i+1].regmap[hr]=-1;
10792                 regmap_pre[i+2][hr]=-1;
10793                 regs[i+1].regmap[nr]=FTEMP;
10794                 regmap_pre[i+2][nr]=FTEMP;
10795                 regs[i].regmap[nr]=rs1[i+1];
10796                 regmap_pre[i+1][nr]=rs1[i+1];
10797                 regs[i+1].regmap_entry[nr]=rs1[i+1];
10798                 regs[i].isconst&=~(1<<nr);
10799                 regs[i+1].isconst&=~(1<<nr);
10800                 regs[i].dirty&=~(1<<nr);
10801                 regs[i+1].wasdirty&=~(1<<nr);
10802                 regs[i+1].dirty&=~(1<<nr);
10803                 regs[i+2].wasdirty&=~(1<<nr);
10804               }
10805             }
10806           }
10807           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
10808             if(itype[i+1]==LOAD) 
10809               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
10810             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
10811               hr=get_reg(regs[i+1].regmap,FTEMP);
10812             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
10813               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
10814               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10815             }
10816             if(hr>=0&&regs[i].regmap[hr]<0) {
10817               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
10818               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
10819                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
10820                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
10821                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
10822                 regs[i].isconst&=~(1<<hr);
10823                 regs[i+1].wasdirty&=~(1<<hr);
10824                 regs[i].dirty&=~(1<<hr);
10825               }
10826             }
10827           }
10828         }
10829       }
10830     }
10831   }
10832   
10833   /* Pass 6 - Optimize clean/dirty state */
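        // (Roughly: clean_registers() works out which cached guest registers
        // can stay "clean" across each instruction, so writebacks are only
        // emitted where the dirty state really has to change.)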
10834   clean_registers(0,slen-1,1);
10835   
10836   /* Pass 7 - Identify 32-bit registers */
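        // (Walked backwards: requires_32bit[i] collects the guest registers
        // whose upper halves must be valid at instruction i, merging in
        // branch targets and delay slots; with FORCE32 everything is 32-bit
        // and only the branch-target marking in the #else branch is kept.)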
10837 #ifndef FORCE32
10838   provisional_r32();
10839
10840   u_int r32=0;
10841   
10842   for (i=slen-1;i>=0;i--)
10843   {
10844     int hr;
10845     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10846     {
10847       if(ba[i]<start || ba[i]>=(start+slen*4))
10848       {
10849         // Branch out of this block, don't need anything
10850         r32=0;
10851       }
10852       else
10853       {
10854         // Internal branch
10855         // Need whatever matches the target
10856         // (and doesn't get overwritten by the delay slot instruction)
10857         r32=0;
10858         int t=(ba[i]-start)>>2;
10859         if(ba[i]>start+i*4) {
10860           // Forward branch
10861           if(!(requires_32bit[t]&~regs[i].was32))
10862             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10863         }else{
10864           // Backward branch
10865           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
10866           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10867           if(!(pr32[t]&~regs[i].was32))
10868             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10869         }
10870       }
10871       // Conditional branch may need registers for following instructions
10872       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10873       {
10874         if(i<slen-2) {
10875           r32|=requires_32bit[i+2];
10876           r32&=regs[i].was32;
10877           // Mark this address as a branch target since it may be called
10878           // upon return from interrupt
10879           bt[i+2]=1;
10880         }
10881       }
10882       // Merge in delay slot
10883       if(!likely[i]) {
10884         // These are overwritten unless the branch is "likely"
10885         // and the delay slot is nullified if not taken
10886         r32&=~(1LL<<rt1[i+1]);
10887         r32&=~(1LL<<rt2[i+1]);
10888       }
10889       // Assume these are needed (delay slot)
10890       if(us1[i+1]>0)
10891       {
10892         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
10893       }
10894       if(us2[i+1]>0)
10895       {
10896         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
10897       }
10898       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
10899       {
10900         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
10901       }
10902       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
10903       {
10904         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
10905       }
10906     }
10907     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
10908     {
10909       // SYSCALL instruction (software interrupt)
10910       r32=0;
10911     }
10912     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
10913     {
10914       // ERET instruction (return from interrupt)
10915       r32=0;
10916     }
10917     // Check 32 bits
10918     r32&=~(1LL<<rt1[i]);
10919     r32&=~(1LL<<rt2[i]);
10920     if(us1[i]>0)
10921     {
10922       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
10923     }
10924     if(us2[i]>0)
10925     {
10926       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
10927     }
10928     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
10929     {
10930       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
10931     }
10932     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
10933     {
10934       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
10935     }
10936     requires_32bit[i]=r32;
10937     
10938     // Dirty registers which are 32-bit require 32-bit input,
10939     // as they will be written back as 32-bit values
10940     for(hr=0;hr<HOST_REGS;hr++)
10941     {
10942       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
10943         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
10944           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
10945           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
10946         }
10947       }
10948     }
10949     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
10950   }
10951 #else
10952   for (i=slen-1;i>=0;i--)
10953   {
10954     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10955     {
10956       // Conditional branch
10957       if((source[i]>>16)!=0x1000&&i<slen-2) {
10958         // Mark this address as a branch target since it may be called
10959         // upon return from interrupt
10960         bt[i+2]=1;
10961       }
10962     }
10963   }
10964 #endif
10965
10966   if(itype[slen-1]==SPAN) {
10967     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
10968   }
10969
10970 #ifdef DISASM
10971   /* Debug/disassembly */
10972   for(i=0;i<slen;i++)
10973   {
10974     printf("U:");
10975     int r;
10976     for(r=1;r<=CCREG;r++) {
10977       if((unneeded_reg[i]>>r)&1) {
10978         if(r==HIREG) printf(" HI");
10979         else if(r==LOREG) printf(" LO");
10980         else printf(" r%d",r);
10981       }
10982     }
10983 #ifndef FORCE32
10984     printf(" UU:");
10985     for(r=1;r<=CCREG;r++) {
10986       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
10987         if(r==HIREG) printf(" HI");
10988         else if(r==LOREG) printf(" LO");
10989         else printf(" r%d",r);
10990       }
10991     }
10992     printf(" 32:");
10993     for(r=0;r<=CCREG;r++) {
10994       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10995       if((regs[i].was32>>r)&1) {
10996         if(r==CCREG) printf(" CC");
10997         else if(r==HIREG) printf(" HI");
10998         else if(r==LOREG) printf(" LO");
10999         else printf(" r%d",r);
11000       }
11001     }
11002 #endif
11003     printf("\n");
11004     #if defined(__i386__) || defined(__x86_64__)
11005     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
11006     #endif
11007     #ifdef __arm__
11008     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
11009     #endif
11010     printf("needs: ");
11011     if(needed_reg[i]&1) printf("eax ");
11012     if((needed_reg[i]>>1)&1) printf("ecx ");
11013     if((needed_reg[i]>>2)&1) printf("edx ");
11014     if((needed_reg[i]>>3)&1) printf("ebx ");
11015     if((needed_reg[i]>>5)&1) printf("ebp ");
11016     if((needed_reg[i]>>6)&1) printf("esi ");
11017     if((needed_reg[i]>>7)&1) printf("edi ");
11018     printf("r:");
11019     for(r=0;r<=CCREG;r++) {
11020       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
11021       if((requires_32bit[i]>>r)&1) {
11022         if(r==CCREG) printf(" CC");
11023         else if(r==HIREG) printf(" HI");
11024         else if(r==LOREG) printf(" LO");
11025         else printf(" r%d",r);
11026       }
11027     }
11028     printf("\n");
11029     /*printf("pr:");
11030     for(r=0;r<=CCREG;r++) {
11031       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
11032       if((pr32[i]>>r)&1) {
11033         if(r==CCREG) printf(" CC");
11034         else if(r==HIREG) printf(" HI");
11035         else if(r==LOREG) printf(" LO");
11036         else printf(" r%d",r);
11037       }
11038     }
11039     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
11040     printf("\n");*/
11041     #if defined(__i386__) || defined(__x86_64__)
11042     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
11043     printf("dirty: ");
11044     if(regs[i].wasdirty&1) printf("eax ");
11045     if((regs[i].wasdirty>>1)&1) printf("ecx ");
11046     if((regs[i].wasdirty>>2)&1) printf("edx ");
11047     if((regs[i].wasdirty>>3)&1) printf("ebx ");
11048     if((regs[i].wasdirty>>5)&1) printf("ebp ");
11049     if((regs[i].wasdirty>>6)&1) printf("esi ");
11050     if((regs[i].wasdirty>>7)&1) printf("edi ");
11051     #endif
11052     #ifdef __arm__
11053     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
11054     printf("dirty: ");
11055     if(regs[i].wasdirty&1) printf("r0 ");
11056     if((regs[i].wasdirty>>1)&1) printf("r1 ");
11057     if((regs[i].wasdirty>>2)&1) printf("r2 ");
11058     if((regs[i].wasdirty>>3)&1) printf("r3 ");
11059     if((regs[i].wasdirty>>4)&1) printf("r4 ");
11060     if((regs[i].wasdirty>>5)&1) printf("r5 ");
11061     if((regs[i].wasdirty>>6)&1) printf("r6 ");
11062     if((regs[i].wasdirty>>7)&1) printf("r7 ");
11063     if((regs[i].wasdirty>>8)&1) printf("r8 ");
11064     if((regs[i].wasdirty>>9)&1) printf("r9 ");
11065     if((regs[i].wasdirty>>10)&1) printf("r10 ");
11066     if((regs[i].wasdirty>>12)&1) printf("r12 ");
11067     #endif
11068     printf("\n");
11069     disassemble_inst(i);
11070     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
11071     #if defined(__i386__) || defined(__x86_64__)
11072     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
11073     if(regs[i].dirty&1) printf("eax ");
11074     if((regs[i].dirty>>1)&1) printf("ecx ");
11075     if((regs[i].dirty>>2)&1) printf("edx ");
11076     if((regs[i].dirty>>3)&1) printf("ebx ");
11077     if((regs[i].dirty>>5)&1) printf("ebp ");
11078     if((regs[i].dirty>>6)&1) printf("esi ");
11079     if((regs[i].dirty>>7)&1) printf("edi ");
11080     #endif
11081     #ifdef __arm__
11082     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
11083     if(regs[i].dirty&1) printf("r0 ");
11084     if((regs[i].dirty>>1)&1) printf("r1 ");
11085     if((regs[i].dirty>>2)&1) printf("r2 ");
11086     if((regs[i].dirty>>3)&1) printf("r3 ");
11087     if((regs[i].dirty>>4)&1) printf("r4 ");
11088     if((regs[i].dirty>>5)&1) printf("r5 ");
11089     if((regs[i].dirty>>6)&1) printf("r6 ");
11090     if((regs[i].dirty>>7)&1) printf("r7 ");
11091     if((regs[i].dirty>>8)&1) printf("r8 ");
11092     if((regs[i].dirty>>9)&1) printf("r9 ");
11093     if((regs[i].dirty>>10)&1) printf("r10 ");
11094     if((regs[i].dirty>>12)&1) printf("r12 ");
11095     #endif
11096     printf("\n");
11097     if(regs[i].isconst) {
11098       printf("constants: ");
11099       #if defined(__i386__) || defined(__x86_64__)
11100       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
11101       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
11102       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
11103       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
11104       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
11105       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
11106       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
11107       #endif
11108       #ifdef __arm__
11109       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
11110       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
11111       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
11112       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
11113       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
11114       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
11115       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
11116       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
11117       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
11118       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
11119       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
11120       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
11121       #endif
11122       printf("\n");
11123     }
11124 #ifndef FORCE32
11125     printf(" 32:");
11126     for(r=0;r<=CCREG;r++) {
11127       if((regs[i].is32>>r)&1) {
11128         if(r==CCREG) printf(" CC");
11129         else if(r==HIREG) printf(" HI");
11130         else if(r==LOREG) printf(" LO");
11131         else printf(" r%d",r);
11132       }
11133     }
11134     printf("\n");
11135 #endif
11136     /*printf(" p32:");
11137     for(r=0;r<=CCREG;r++) {
11138       if((p32[i]>>r)&1) {
11139         if(r==CCREG) printf(" CC");
11140         else if(r==HIREG) printf(" HI");
11141         else if(r==LOREG) printf(" LO");
11142         else printf(" r%d",r);
11143       }
11144     }
11145     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
11146     else printf("\n");*/
11147     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
11148       #if defined(__i386__) || defined(__x86_64__)
11149       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
11150       if(branch_regs[i].dirty&1) printf("eax ");
11151       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
11152       if((branch_regs[i].dirty>>2)&1) printf("edx ");
11153       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
11154       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
11155       if((branch_regs[i].dirty>>6)&1) printf("esi ");
11156       if((branch_regs[i].dirty>>7)&1) printf("edi ");
11157       #endif
11158       #ifdef __arm__
11159       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
11160       if(branch_regs[i].dirty&1) printf("r0 ");
11161       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
11162       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
11163       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
11164       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
11165       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
11166       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
11167       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
11168       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
11169       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
11170       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
11171       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
11172       #endif
11173 #ifndef FORCE32
11174       printf(" 32:");
11175       for(r=0;r<=CCREG;r++) {
11176         if((branch_regs[i].is32>>r)&1) {
11177           if(r==CCREG) printf(" CC");
11178           else if(r==HIREG) printf(" HI");
11179           else if(r==LOREG) printf(" LO");
11180           else printf(" r%d",r);
11181         }
11182       }
11183       printf("\n");
11184 #endif
11185     }
11186   }
11187 #endif // DISASM
11188
11189   /* Pass 8 - Assembly */
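        // (Each instruction loads its entry registers, is assembled by the
        // per-type *_assemble() helper below, and records its output address
        // in instr_addr[] so branches can be linked to it in pass 9.)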
11190   linkcount=0;stubcount=0;
11191   ds=0;is_delayslot=0;
11192   cop1_usable=0;
11193   uint64_t is32_pre=0;
11194   u_int dirty_pre=0;
11195   u_int beginning=(u_int)out;
11196   if((u_int)addr&1) {
11197     ds=1;
11198     pagespan_ds();
11199   }
11200   u_int instr_addr0_override=0;
11201
11202 #ifdef PCSX
11203   if (start == 0x80030000) {
11204     // nasty hack for the fastbios option:
11205     // override the block entry so this code runs first
11206     instr_addr0_override=(u_int)out;
11207     emit_movimm(start,0);
11208     // abuse the io address var as a flag indicating
11209     // that we have already returned here once
11210     emit_readword((int)&address,1);
11211     emit_writeword(0,(int)&pcaddr);
11212     emit_writeword(0,(int)&address);
11213     emit_cmp(0,1);
11214     emit_jne((int)new_dyna_leave);
11215   }
11216 #endif
11217   for(i=0;i<slen;i++)
11218   {
11219     //if(ds) printf("ds: ");
11220     disassemble_inst(i);
11221     if(ds) {
11222       ds=0; // Skip delay slot
11223       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
11224       instr_addr[i]=0;
11225     } else {
11226       speculate_register_values(i);
11227       #ifndef DESTRUCTIVE_WRITEBACK
11228       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11229       {
11230         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
11231               unneeded_reg[i],unneeded_reg_upper[i]);
11232         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
11233               unneeded_reg[i],unneeded_reg_upper[i]);
11234       }
11235       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
11236         is32_pre=branch_regs[i].is32;
11237         dirty_pre=branch_regs[i].dirty;
11238       }else{
11239         is32_pre=regs[i].is32;
11240         dirty_pre=regs[i].dirty;
11241       }
11242       #endif
11243       // write back
11244       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11245       {
11246         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
11247                       unneeded_reg[i],unneeded_reg_upper[i]);
11248         loop_preload(regmap_pre[i],regs[i].regmap_entry);
11249       }
11250       // branch target entry point
11251       instr_addr[i]=(u_int)out;
11252       assem_debug("<->\n");
11253       // load regs
11254       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
11255         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
11256       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
11257       address_generation(i,&regs[i],regs[i].regmap_entry);
11258       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
11259       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
11260       {
11261         // Load the delay slot registers if necessary
11262         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
11263           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11264         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
11265           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11266         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
11267           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11268       }
11269       else if(i+1<slen)
11270       {
11271         // Preload registers for following instruction
11272         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
11273           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
11274             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11275         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
11276           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
11277             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11278       }
11279       // TODO: if(is_ooo(i)) address_generation(i+1);
11280       if(itype[i]==CJUMP||itype[i]==FJUMP)
11281         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
11282       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
11283         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11284       if(bt[i]) cop1_usable=0;
11285       // assemble
11286       switch(itype[i]) {
11287         case ALU:
11288           alu_assemble(i,&regs[i]);break;
11289         case IMM16:
11290           imm16_assemble(i,&regs[i]);break;
11291         case SHIFT:
11292           shift_assemble(i,&regs[i]);break;
11293         case SHIFTIMM:
11294           shiftimm_assemble(i,&regs[i]);break;
11295         case LOAD:
11296           load_assemble(i,&regs[i]);break;
11297         case LOADLR:
11298           loadlr_assemble(i,&regs[i]);break;
11299         case STORE:
11300           store_assemble(i,&regs[i]);break;
11301         case STORELR:
11302           storelr_assemble(i,&regs[i]);break;
11303         case COP0:
11304           cop0_assemble(i,&regs[i]);break;
11305         case COP1:
11306           cop1_assemble(i,&regs[i]);break;
11307         case C1LS:
11308           c1ls_assemble(i,&regs[i]);break;
11309         case COP2:
11310           cop2_assemble(i,&regs[i]);break;
11311         case C2LS:
11312           c2ls_assemble(i,&regs[i]);break;
11313         case C2OP:
11314           c2op_assemble(i,&regs[i]);break;
11315         case FCONV:
11316           fconv_assemble(i,&regs[i]);break;
11317         case FLOAT:
11318           float_assemble(i,&regs[i]);break;
11319         case FCOMP:
11320           fcomp_assemble(i,&regs[i]);break;
11321         case MULTDIV:
11322           multdiv_assemble(i,&regs[i]);break;
11323         case MOV:
11324           mov_assemble(i,&regs[i]);break;
11325         case SYSCALL:
11326           syscall_assemble(i,&regs[i]);break;
11327         case HLECALL:
11328           hlecall_assemble(i,&regs[i]);break;
11329         case INTCALL:
11330           intcall_assemble(i,&regs[i]);break;
11331         case UJUMP:
11332           ujump_assemble(i,&regs[i]);ds=1;break;
11333         case RJUMP:
11334           rjump_assemble(i,&regs[i]);ds=1;break;
11335         case CJUMP:
11336           cjump_assemble(i,&regs[i]);ds=1;break;
11337         case SJUMP:
11338           sjump_assemble(i,&regs[i]);ds=1;break;
11339         case FJUMP:
11340           fjump_assemble(i,&regs[i]);ds=1;break;
11341         case SPAN:
11342           pagespan_assemble(i,&regs[i]);break;
11343       }
11344       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
11345         literal_pool(1024);
11346       else
11347         literal_pool_jumpover(256);
11348     }
11349   }
11350   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
11351   // If the block did not end with an unconditional branch,
11352   // add a jump to the next instruction.
11353   if(i>1) {
11354     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
11355       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11356       assert(i==slen);
11357       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
11358         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11359         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11360           emit_loadreg(CCREG,HOST_CCREG);
11361         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
11362       }
11363       else if(!likely[i-2])
11364       {
11365         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
11366         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
11367       }
11368       else
11369       {
11370         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
11371         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
11372       }
11373       add_to_linker((int)out,start+i*4,0);
11374       emit_jmp(0);
11375     }
11376   }
11377   else
11378   {
11379     assert(i>0);
11380     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11381     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11382     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11383       emit_loadreg(CCREG,HOST_CCREG);
11384     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
11385     add_to_linker((int)out,start+i*4,0);
11386     emit_jmp(0);
11387   }
11388
11389   // TODO: delay slot stubs?
11390   // Stubs
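        // (Stubs are the out-of-line slow paths recorded while assembling the
        // block, e.g. memory accesses that fall off the fast path and the
        // cycle-count checks; emitting them here keeps the common path
        // straight-line.)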
11391   for(i=0;i<stubcount;i++)
11392   {
11393     switch(stubs[i][0])
11394     {
11395       case LOADB_STUB:
11396       case LOADH_STUB:
11397       case LOADW_STUB:
11398       case LOADD_STUB:
11399       case LOADBU_STUB:
11400       case LOADHU_STUB:
11401         do_readstub(i);break;
11402       case STOREB_STUB:
11403       case STOREH_STUB:
11404       case STOREW_STUB:
11405       case STORED_STUB:
11406         do_writestub(i);break;
11407       case CC_STUB:
11408         do_ccstub(i);break;
11409       case INVCODE_STUB:
11410         do_invstub(i);break;
11411       case FP_STUB:
11412         do_cop1stub(i);break;
11413       case STORELR_STUB:
11414         do_unalignedwritestub(i);break;
11415     }
11416   }
11417
11418   if (instr_addr0_override)
11419     instr_addr[0] = instr_addr0_override;
11420
11421   /* Pass 9 - Linker */
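        // (External branches get an extjump stub and, if the target block is
        // already compiled per check_addr(), are patched to jump straight to
        // it; internal branches are resolved against instr_addr[] from
        // pass 8.)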
11422   for(i=0;i<linkcount;i++)
11423   {
11424     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
11425     literal_pool(64);
11426     if(!link_addr[i][2])
11427     {
11428       void *stub=out;
11429       void *addr=check_addr(link_addr[i][1]);
11430       emit_extjump(link_addr[i][0],link_addr[i][1]);
11431       if(addr) {
11432         set_jump_target(link_addr[i][0],(int)addr);
11433         add_link(link_addr[i][1],stub);
11434       }
11435       else set_jump_target(link_addr[i][0],(int)stub);
11436     }
11437     else
11438     {
11439       // Internal branch
11440       int target=(link_addr[i][1]-start)>>2;
11441       assert(target>=0&&target<slen);
11442       assert(instr_addr[target]);
11443       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11444       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
11445       //#else
11446       set_jump_target(link_addr[i][0],instr_addr[target]);
11447       //#endif
11448     }
11449   }
11450   // External Branch Targets (jump_in)
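        // (Each branch target in this block gets a dirty-check stub added to
        // jump_dirty and an entry point added to jump_in; an existing hash
        // table entry for the address is updated in place rather than adding
        // a new one.)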
11451   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
11452   for(i=0;i<slen;i++)
11453   {
11454     if(bt[i]||i==0)
11455     {
11456       if(instr_addr[i]) // TODO - delay slots (=null)
11457       {
11458         u_int vaddr=start+i*4;
11459         u_int page=get_page(vaddr);
11460         u_int vpage=get_vpage(vaddr);
11461         literal_pool(256);
11462         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
11463 #ifndef FORCE32
11464         if(!requires_32bit[i])
11465 #else
11466         if(1)
11467 #endif
11468         {
11469           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11470           assem_debug("jump_in: %x\n",start+i*4);
11471           ll_add(jump_dirty+vpage,vaddr,(void *)out);
11472           int entry_point=do_dirty_stub(i);
11473           ll_add(jump_in+page,vaddr,(void *)entry_point);
11474           // If there was an existing entry in the hash table,
11475           // replace it with the new address.
11476           // Don't add new entries.  We'll insert the
11477           // ones that actually get used in check_addr().
11478           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
11479           if(ht_bin[0]==vaddr) {
11480             ht_bin[1]=entry_point;
11481           }
11482           if(ht_bin[2]==vaddr) {
11483             ht_bin[3]=entry_point;
11484           }
11485         }
11486         else
11487         {
11488           u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
11489           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11490           assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
11491           //int entry_point=(int)out;
11492           ////assem_debug("entry_point: %x\n",entry_point);
11493           //load_regs_entry(i);
11494           //if(entry_point==(int)out)
11495           //  entry_point=instr_addr[i];
11496           //else
11497           //  emit_jmp(instr_addr[i]);
11498           //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11499           ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
11500           int entry_point=do_dirty_stub(i);
11501           ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11502         }
11503       }
11504     }
11505   }
11506   // Write out the literal pool if necessary
11507   literal_pool(0);
11508   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11509   // Align code
11510   if(((u_int)out)&7) emit_addnop(13);
11511   #endif
11512   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
11513   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
11514   memcpy(copy,source,slen*4);
11515   copy+=slen*4;
11516   
11517   #ifdef __arm__
11518   __clear_cache((void *)beginning,out);
11519   #endif
11520   
11521   // If we're within 256K of the end of the buffer,
11522   // start over from the beginning. (Is 256K enough?)
11523   if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
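        // (MAX_OUTPUT_BLOCK_SIZE is 262144 bytes, i.e. 256K, so wrapping while
        // at least that much space remains should keep a single block from
        // running off the end of the translation cache.)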
11524   
11525   // Trap writes to any of the pages we compiled
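        // (Clearing invalid_code re-arms the write checks for these pages;
        // the 0x40000000 bit in memory_map appears to mark the page as
        // write-protected so stores take the slow path that can invalidate
        // this block.)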
11526   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
11527     invalid_code[i]=0;
11528 #ifndef DISABLE_TLB
11529     memory_map[i]|=0x40000000;
11530     if((signed int)start>=(signed int)0xC0000000) {
11531       assert(using_tlb);
11532       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
11533       invalid_code[j]=0;
11534       memory_map[j]|=0x40000000;
11535       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
11536     }
11537 #endif
11538   }
11539   inv_code_start=inv_code_end=~0;
11540 #ifdef PCSX
11541   // for PCSX we need to mark all mirrors too
11542   if(get_page(start)<(RAM_SIZE>>12))
11543     for(i=start>>12;i<=(start+slen*4)>>12;i++)
11544       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
11545       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
11546       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
11547 #endif
11548   
11549   /* Pass 10 - Free memory by expiring oldest blocks */
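        // (Roughly: the translation cache is treated as a ring; expirep sweeps
        // a 16-bit phase counter ahead of the output pointer and, in four
        // sub-phases, drops jump_in/jump_dirty entries, stale jump_out
        // pointers, hash table entries, and jump_out lists for the region
        // about to be overwritten.)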
11550   
11551   int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
11552   while(expirep!=end)
11553   {
11554     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
11555     int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
11556     inv_debug("EXP: Phase %d\n",expirep);
11557     switch((expirep>>11)&3)
11558     {
11559       case 0:
11560         // Clear jump_in and jump_dirty
11561         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
11562         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
11563         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
11564         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
11565         break;
11566       case 1:
11567         // Clear pointers
11568         ll_kill_pointers(jump_out[expirep&2047],base,shift);
11569         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
11570         break;
11571       case 2:
11572         // Clear hash table
11573         for(i=0;i<32;i++) {
11574           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
11575           if((ht_bin[3]>>shift)==(base>>shift) ||
11576              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11577             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
11578             ht_bin[2]=ht_bin[3]=-1;
11579           }
11580           if((ht_bin[1]>>shift)==(base>>shift) ||
11581              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11582             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
11583             ht_bin[0]=ht_bin[2];
11584             ht_bin[1]=ht_bin[3];
11585             ht_bin[2]=ht_bin[3]=-1;
11586           }
11587         }
11588         break;
11589       case 3:
11590         // Clear jump_out
11591         #ifdef __arm__
11592         if((expirep&2047)==0) 
11593           do_clear_cache();
11594         #endif
11595         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
11596         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
11597         break;
11598     }
11599     expirep=(expirep+1)&65535;
11600   }
11601   return 0;
11602 }
11603
11604 // vim:shiftwidth=2:expandtab