pcsx_rearmed: libpcsxcore/new_dynarec/new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24
25 #include "emu_if.h" //emulator interface
26
27 #include <sys/mman.h>
28
29 #ifdef __i386__
30 #include "assem_x86.h"
31 #endif
32 #ifdef __x86_64__
33 #include "assem_x64.h"
34 #endif
35 #ifdef __arm__
36 #include "assem_arm.h"
37 #endif
38
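// MAXBLOCK: maximum number of guest MIPS instructions in one compiled block;
// MAX_OUTPUT_BLOCK_SIZE: upper bound on the native code emitted for a single block.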
39 #define MAXBLOCK 4096
40 #define MAX_OUTPUT_BLOCK_SIZE 262144
41 #define CLOCK_DIVIDER 2
42
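// Per-instruction register-allocation state (roughly; see regs[]/branch_regs[] below):
//   regmap_entry[hr]  guest reg expected in host reg hr on entry to the instruction
//   regmap[hr]        guest reg held in host reg hr afterwards (-1 = free)
//   was32/is32        per-guest-reg bits: value is a sign-extended 32-bit value (before/after)
//   wasdirty/dirty    per-host-reg bits: holds a value not yet written back (before/after)
//   u/uu              per-guest-reg bits: value / its upper 32 bits unneeded afterwards
//   wasconst/isconst  per-host-reg bits: holds a known constant (before/after)
//   constmap[hr]      the constant value for host reg hr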
43 struct regstat
44 {
45   signed char regmap_entry[HOST_REGS];
46   signed char regmap[HOST_REGS];
47   uint64_t was32;
48   uint64_t is32;
49   uint64_t wasdirty;
50   uint64_t dirty;
51   uint64_t u;
52   uint64_t uu;
53   u_int wasconst;
54   u_int isconst;
55   uint64_t constmap[HOST_REGS];
56 };
57
58 struct ll_entry
59 {
60   u_int vaddr;
61   u_int reg32;
62   void *addr;
63   struct ll_entry *next;
64 };
65
66   u_int start;
67   u_int *source;
68   u_int pagelimit;
69   char insn[MAXBLOCK][10];
70   u_char itype[MAXBLOCK];
71   u_char opcode[MAXBLOCK];
72   u_char opcode2[MAXBLOCK];
73   u_char bt[MAXBLOCK];
74   u_char rs1[MAXBLOCK];
75   u_char rs2[MAXBLOCK];
76   u_char rt1[MAXBLOCK];
77   u_char rt2[MAXBLOCK];
78   u_char us1[MAXBLOCK];
79   u_char us2[MAXBLOCK];
80   u_char dep1[MAXBLOCK];
81   u_char dep2[MAXBLOCK];
82   u_char lt1[MAXBLOCK];
83   int imm[MAXBLOCK];
84   u_int ba[MAXBLOCK];
85   char likely[MAXBLOCK];
86   char is_ds[MAXBLOCK];
87   char ooo[MAXBLOCK];
88   uint64_t unneeded_reg[MAXBLOCK];
89   uint64_t unneeded_reg_upper[MAXBLOCK];
90   uint64_t branch_unneeded_reg[MAXBLOCK];
91   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
92   uint64_t p32[MAXBLOCK];
93   uint64_t pr32[MAXBLOCK];
94   signed char regmap_pre[MAXBLOCK][HOST_REGS];
95   signed char regmap[MAXBLOCK][HOST_REGS];
96   signed char regmap_entry[MAXBLOCK][HOST_REGS];
97   uint64_t constmap[MAXBLOCK][HOST_REGS];
98   struct regstat regs[MAXBLOCK];
99   struct regstat branch_regs[MAXBLOCK];
100   signed char minimum_free_regs[MAXBLOCK];
101   u_int needed_reg[MAXBLOCK];
102   uint64_t requires_32bit[MAXBLOCK];
103   u_int wont_dirty[MAXBLOCK];
104   u_int will_dirty[MAXBLOCK];
105   int ccadj[MAXBLOCK];
106   int slen;
107   u_int instr_addr[MAXBLOCK];
108   u_int link_addr[MAXBLOCK][3];
109   int linkcount;
110   u_int stubs[MAXBLOCK*3][8];
111   int stubcount;
112   u_int literals[1024][2];
113   int literalcount;
114   int is_delayslot;
115   int cop1_usable;
116   u_char *out;
117   struct ll_entry *jump_in[4096];
118   struct ll_entry *jump_out[4096];
119   struct ll_entry *jump_dirty[4096];
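  // Two-entry lookup cache: hash_table[h] = {vaddr0, codeptr0, vaddr1, codeptr1}.
  // get_addr_ht() probes both pairs; get_addr() inserts new entries at slot 0 and
  // pushes the previous pair down to slot 1.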
120   u_int hash_table[65536][4]  __attribute__((aligned(16)));
121   char shadow[1048576]  __attribute__((aligned(16)));
122   void *copy;
123   int expirep;
124 #ifndef PCSX
125   u_int using_tlb;
126 #else
127   static const u_int using_tlb=0;
128 #endif
129   static u_int sp_in_mirror;
130   u_int stop_after_jal;
131   extern u_char restore_candidate[512];
132   extern int cycle_count;
133
134   /* registers that may be allocated */
135   /* 1-31 gpr */
136 #define HIREG 32 // hi
137 #define LOREG 33 // lo
138 #define FSREG 34 // FPU status (FCSR)
139 #define CSREG 35 // Coprocessor status
140 #define CCREG 36 // Cycle count
141 #define INVCP 37 // Pointer to invalid_code
142 #define MMREG 38 // Pointer to memory_map
143 #define ROREG 39 // ram offset (if rdram!=0x80000000)
144 #define TEMPREG 40
145 #define FTEMP 40 // FPU temporary register
146 #define PTEMP 41 // Prefetch temporary register
147 #define TLREG 42 // TLB mapping offset
148 #define RHASH 43 // Return address hash
149 #define RHTBL 44 // Return address hash table address
150 #define RTEMP 45 // JR/JALR address register
151 #define MAXREG 45
152 #define AGEN1 46 // Address generation temporary register
153 #define AGEN2 47 // Address generation temporary register
154 #define MGEN1 48 // Maptable address generation temporary register
155 #define MGEN2 49 // Maptable address generation temporary register
156 #define BTREG 50 // Branch target temporary register
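// Note: a register number with bit 6 set (r|64, i.e. 64..127) refers to the upper
// 32 bits of 64-bit guest register r; regmap entries are masked with &63 to get
// the base register.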
157
158   /* instruction types */
159 #define NOP 0     // No operation
160 #define LOAD 1    // Load
161 #define STORE 2   // Store
162 #define LOADLR 3  // Unaligned load
163 #define STORELR 4 // Unaligned store
164 #define MOV 5     // Move 
165 #define ALU 6     // Arithmetic/logic
166 #define MULTDIV 7 // Multiply/divide
167 #define SHIFT 8   // Shift by register
168 #define SHIFTIMM 9// Shift by immediate
169 #define IMM16 10  // 16-bit immediate
170 #define RJUMP 11  // Unconditional jump to register
171 #define UJUMP 12  // Unconditional jump
172 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
173 #define SJUMP 14  // Conditional branch (regimm format)
174 #define COP0 15   // Coprocessor 0
175 #define COP1 16   // Coprocessor 1
176 #define C1LS 17   // Coprocessor 1 load/store
177 #define FJUMP 18  // Conditional branch (floating point)
178 #define FLOAT 19  // Floating point unit
179 #define FCONV 20  // Convert between integer and float
180 #define FCOMP 21  // Floating point compare (sets FSREG)
181 #define SYSCALL 22// SYSCALL
182 #define OTHER 23  // Other
183 #define SPAN 24   // Branch/delay slot spans 2 pages
184 #define NI 25     // Not implemented
185 #define HLECALL 26// PCSX fake opcodes for HLE
186 #define COP2 27   // Coprocessor 2 move
187 #define C2LS 28   // Coprocessor 2 load/store
188 #define C2OP 29   // Coprocessor 2 operation
189 #define INTCALL 30// Call interpreter to handle rare corner cases
190
191   /* stubs */
192 #define CC_STUB 1
193 #define FP_STUB 2
194 #define LOADB_STUB 3
195 #define LOADH_STUB 4
196 #define LOADW_STUB 5
197 #define LOADD_STUB 6
198 #define LOADBU_STUB 7
199 #define LOADHU_STUB 8
200 #define STOREB_STUB 9
201 #define STOREH_STUB 10
202 #define STOREW_STUB 11
203 #define STORED_STUB 12
204 #define STORELR_STUB 13
205 #define INVCODE_STUB 14
206
207   /* branch codes */
208 #define TAKEN 1
209 #define NOTTAKEN 2
210 #define NULLDS 3
211
212 // asm linkage
213 int new_recompile_block(int addr);
214 void *get_addr_ht(u_int vaddr);
215 void invalidate_block(u_int block);
216 void invalidate_addr(u_int addr);
217 void remove_hash(int vaddr);
218 void jump_vaddr();
219 void dyna_linker();
220 void dyna_linker_ds();
221 void verify_code();
222 void verify_code_vm();
223 void verify_code_ds();
224 void cc_interrupt();
225 void fp_exception();
226 void fp_exception_ds();
227 void jump_syscall();
228 void jump_syscall_hle();
229 void jump_eret();
230 void jump_hlecall();
231 void jump_intcall();
232 void new_dyna_leave();
233
234 // TLB
235 void TLBWI_new();
236 void TLBWR_new();
237 void read_nomem_new();
238 void read_nomemb_new();
239 void read_nomemh_new();
240 void read_nomemd_new();
241 void write_nomem_new();
242 void write_nomemb_new();
243 void write_nomemh_new();
244 void write_nomemd_new();
245 void write_rdram_new();
246 void write_rdramb_new();
247 void write_rdramh_new();
248 void write_rdramd_new();
249 extern u_int memory_map[1048576];
250
251 // Needed by assembler
252 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
253 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
254 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
255 void load_all_regs(signed char i_regmap[]);
256 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
257 void load_regs_entry(int t);
258 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
259
260 int tracedebug=0;
261
262 //#define DEBUG_CYCLE_COUNT 1
263
264 void nullf() {}
265 //#define assem_debug printf
266 //#define inv_debug printf
267 #define assem_debug nullf
268 #define inv_debug nullf
269
270 static void tlb_hacks()
271 {
272 #ifndef DISABLE_TLB
273   // Goldeneye hack
274   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
275   {
276     u_int addr;
277     int n;
278     switch (ROM_HEADER->Country_code&0xFF) 
279     {
280       case 0x45: // U
281         addr=0x34b30;
282         break;                   
283       case 0x4A: // J 
284         addr=0x34b70;    
285         break;    
286       case 0x50: // E 
287         addr=0x329f0;
288         break;                        
289       default: 
290         // Unknown country code
291         addr=0;
292         break;
293     }
294     u_int rom_addr=(u_int)rom;
295     #ifdef ROM_COPY
296     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
297     // in the lower 4G of memory to use this hack.  Copy it if necessary.
298     if((void *)rom>(void *)0xffffffff) {
299       munmap(ROM_COPY, 67108864);
300       if(mmap(ROM_COPY, 12582912,
301               PROT_READ | PROT_WRITE,
302               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
303               -1, 0) <= 0) {printf("mmap() failed\n");}
304       memcpy(ROM_COPY,rom,12582912);
305       rom_addr=(u_int)ROM_COPY;
306     }
307     #endif
308     if(addr) {
309       for(n=0x7F000;n<0x80000;n++) {
310         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
311       }
312     }
313   }
314 #endif
315 }
316
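// Map a guest virtual address to one of the 4096 pages used to index
// jump_in/jump_out/jump_dirty.  RAM gets one page per 4K (with mirrors folded
// together on PCSX); all other addresses are folded into pages 2048..4095, so
// unrelated blocks may share a page there.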
317 static u_int get_page(u_int vaddr)
318 {
319 #ifndef PCSX
320   u_int page=(vaddr^0x80000000)>>12;
321 #else
322   u_int page=vaddr&~0xe0000000;
323   if (page < 0x1000000)
324     page &= ~0x0e00000; // RAM mirrors
325   page>>=12;
326 #endif
327 #ifndef DISABLE_TLB
328   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
329 #endif
330   if(page>2048) page=2048+(page&2047);
331   return page;
332 }
333
334 static u_int get_vpage(u_int vaddr)
335 {
336   u_int vpage=(vaddr^0x80000000)>>12;
337 #ifndef DISABLE_TLB
338   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
339 #endif
340   if(vpage>2048) vpage=2048+(vpage&2047);
341   return vpage;
342 }
343
344 // Get address from virtual address
345 // This is called from the recompiled JR/JALR instructions
346 void *get_addr(u_int vaddr)
347 {
348   u_int page=get_page(vaddr);
349   u_int vpage=get_vpage(vaddr);
350   struct ll_entry *head;
351   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
352   head=jump_in[page];
353   while(head!=NULL) {
354     if(head->vaddr==vaddr&&head->reg32==0) {
355   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
356       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
357       ht_bin[3]=ht_bin[1];
358       ht_bin[2]=ht_bin[0];
359       ht_bin[1]=(int)head->addr;
360       ht_bin[0]=vaddr;
361       return head->addr;
362     }
363     head=head->next;
364   }
365   head=jump_dirty[vpage];
366   while(head!=NULL) {
367     if(head->vaddr==vaddr&&head->reg32==0) {
368       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
369       // Don't restore blocks which are about to expire from the cache
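      // (roughly: the block's offset from the output pointer, wrapped to the
      //  translation cache size, must be large enough that the expiry sweep
      //  won't reach it soon)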
370       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
371       if(verify_dirty(head->addr)) {
372         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
373         invalid_code[vaddr>>12]=0;
374         memory_map[vaddr>>12]|=0x40000000;
375         if(vpage<2048) {
376 #ifndef DISABLE_TLB
377           if(tlb_LUT_r[vaddr>>12]) {
378             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
379             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
380           }
381 #endif
382           restore_candidate[vpage>>3]|=1<<(vpage&7);
383         }
384         else restore_candidate[page>>3]|=1<<(page&7);
385         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
386         if(ht_bin[0]==vaddr) {
387           ht_bin[1]=(int)head->addr; // Replace existing entry
388         }
389         else
390         {
391           ht_bin[3]=ht_bin[1];
392           ht_bin[2]=ht_bin[0];
393           ht_bin[1]=(int)head->addr;
394           ht_bin[0]=vaddr;
395         }
396         return head->addr;
397       }
398     }
399     head=head->next;
400   }
401   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
402   int r=new_recompile_block(vaddr);
403   if(r==0) return get_addr(vaddr);
404   // Execute in unmapped page, generate pagefault exception
405   Status|=2;
406   Cause=(vaddr<<31)|0x8;
407   EPC=(vaddr&1)?vaddr-5:vaddr;
408   BadVAddr=(vaddr&~1);
409   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
410   EntryHi=BadVAddr&0xFFFFE000;
411   return get_addr_ht(0x80000000);
412 }
413 // Look up address in hash table first
414 void *get_addr_ht(u_int vaddr)
415 {
416   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
417   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
418   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
419   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
420   return get_addr(vaddr);
421 }
422
423 void *get_addr_32(u_int vaddr,u_int flags)
424 {
425 #ifdef FORCE32
426   return get_addr(vaddr);
427 #else
428   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
429   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
430   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
431   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
432   u_int page=get_page(vaddr);
433   u_int vpage=get_vpage(vaddr);
434   struct ll_entry *head;
435   head=jump_in[page];
436   while(head!=NULL) {
437     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
438       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
439       if(head->reg32==0) {
440         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
441         if(ht_bin[0]==-1) {
442           ht_bin[1]=(int)head->addr;
443           ht_bin[0]=vaddr;
444         }else if(ht_bin[2]==-1) {
445           ht_bin[3]=(int)head->addr;
446           ht_bin[2]=vaddr;
447         }
448         //ht_bin[3]=ht_bin[1];
449         //ht_bin[2]=ht_bin[0];
450         //ht_bin[1]=(int)head->addr;
451         //ht_bin[0]=vaddr;
452       }
453       return head->addr;
454     }
455     head=head->next;
456   }
457   head=jump_dirty[vpage];
458   while(head!=NULL) {
459     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
460       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
461       // Don't restore blocks which are about to expire from the cache
462       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
463       if(verify_dirty(head->addr)) {
464         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
465         invalid_code[vaddr>>12]=0;
466         memory_map[vaddr>>12]|=0x40000000;
467         if(vpage<2048) {
468 #ifndef DISABLE_TLB
469           if(tlb_LUT_r[vaddr>>12]) {
470             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
471             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
472           }
473 #endif
474           restore_candidate[vpage>>3]|=1<<(vpage&7);
475         }
476         else restore_candidate[page>>3]|=1<<(page&7);
477         if(head->reg32==0) {
478           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
479           if(ht_bin[0]==-1) {
480             ht_bin[1]=(int)head->addr;
481             ht_bin[0]=vaddr;
482           }else if(ht_bin[2]==-1) {
483             ht_bin[3]=(int)head->addr;
484             ht_bin[2]=vaddr;
485           }
486           //ht_bin[3]=ht_bin[1];
487           //ht_bin[2]=ht_bin[0];
488           //ht_bin[1]=(int)head->addr;
489           //ht_bin[0]=vaddr;
490         }
491         return head->addr;
492       }
493     }
494     head=head->next;
495   }
496   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
497   int r=new_recompile_block(vaddr);
498   if(r==0) return get_addr(vaddr);
499   // Execute in unmapped page, generate pagefault exception
500   Status|=2;
501   Cause=(vaddr<<31)|0x8;
502   EPC=(vaddr&1)?vaddr-5:vaddr;
503   BadVAddr=(vaddr&~1);
504   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
505   EntryHi=BadVAddr&0xFFFFE000;
506   return get_addr_ht(0x80000000);
507 #endif
508 }
509
510 void clear_all_regs(signed char regmap[])
511 {
512   int hr;
513   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
514 }
515
516 signed char get_reg(signed char regmap[],int r)
517 {
518   int hr;
519   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
520   return -1;
521 }
522
523 // Find a register that is available for two consecutive cycles
524 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
525 {
526   int hr;
527   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
528   return -1;
529 }
530
531 int count_free_regs(signed char regmap[])
532 {
533   int count=0;
534   int hr;
535   for(hr=0;hr<HOST_REGS;hr++)
536   {
537     if(hr!=EXCLUDE_REG) {
538       if(regmap[hr]<0) count++;
539     }
540   }
541   return count;
542 }
543
544 void dirty_reg(struct regstat *cur,signed char reg)
545 {
546   int hr;
547   if(!reg) return;
548   for (hr=0;hr<HOST_REGS;hr++) {
549     if((cur->regmap[hr]&63)==reg) {
550       cur->dirty|=1<<hr;
551     }
552   }
553 }
554
555 // If we dirty the lower half of a 64 bit register which is now being
556 // sign-extended, we need to dump the upper half.
557 // Note: Do this only after completion of the instruction, because
558 // some instructions may need to read the full 64-bit value even if
559 // overwriting it (eg SLTI, DSRA32).
560 static void flush_dirty_uppers(struct regstat *cur)
561 {
562   int hr,reg;
563   for (hr=0;hr<HOST_REGS;hr++) {
564     if((cur->dirty>>hr)&1) {
565       reg=cur->regmap[hr];
566       if(reg>=64) 
567         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
568     }
569   }
570 }
571
572 void set_const(struct regstat *cur,signed char reg,uint64_t value)
573 {
574   int hr;
575   if(!reg) return;
576   for (hr=0;hr<HOST_REGS;hr++) {
577     if(cur->regmap[hr]==reg) {
578       cur->isconst|=1<<hr;
579       cur->constmap[hr]=value;
580     }
581     else if((cur->regmap[hr]^64)==reg) {
582       cur->isconst|=1<<hr;
583       cur->constmap[hr]=value>>32;
584     }
585   }
586 }
587
588 void clear_const(struct regstat *cur,signed char reg)
589 {
590   int hr;
591   if(!reg) return;
592   for (hr=0;hr<HOST_REGS;hr++) {
593     if((cur->regmap[hr]&63)==reg) {
594       cur->isconst&=~(1<<hr);
595     }
596   }
597 }
598
599 int is_const(struct regstat *cur,signed char reg)
600 {
601   int hr;
602   if(reg<0) return 0;
603   if(!reg) return 1;
604   for (hr=0;hr<HOST_REGS;hr++) {
605     if((cur->regmap[hr]&63)==reg) {
606       return (cur->isconst>>hr)&1;
607     }
608   }
609   return 0;
610 }
611 uint64_t get_const(struct regstat *cur,signed char reg)
612 {
613   int hr;
614   if(!reg) return 0;
615   for (hr=0;hr<HOST_REGS;hr++) {
616     if(cur->regmap[hr]==reg) {
617       return cur->constmap[hr];
618     }
619   }
620   printf("Unknown constant in r%d\n",reg);
621   exit(1);
622 }
623
624 // Least soon needed registers
625 // Look at the next ten instructions and see which registers
626 // will be used.  Try not to reallocate these.
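// On return, hsn[r] holds (approximately) the distance in instructions from i to the
// next use of register r; smaller values mean the register is needed sooner.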
627 void lsn(u_char hsn[], int i, int *preferred_reg)
628 {
629   int j;
630   int b=-1;
631   for(j=0;j<9;j++)
632   {
633     if(i+j>=slen) {
634       j=slen-i-1;
635       break;
636     }
637     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
638     {
639         // Don't go past an unconditional jump
640       j++;
641       break;
642     }
643   }
644   for(;j>=0;j--)
645   {
646     if(rs1[i+j]) hsn[rs1[i+j]]=j;
647     if(rs2[i+j]) hsn[rs2[i+j]]=j;
648     if(rt1[i+j]) hsn[rt1[i+j]]=j;
649     if(rt2[i+j]) hsn[rt2[i+j]]=j;
650     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
651       // Stores can allocate zero
652       hsn[rs1[i+j]]=j;
653       hsn[rs2[i+j]]=j;
654     }
655     // On some architectures stores need invc_ptr
656     #if defined(HOST_IMM8)
657     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
658       hsn[INVCP]=j;
659     }
660     #endif
661     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
662     {
663       hsn[CCREG]=j;
664       b=j;
665     }
666   }
667   if(b>=0)
668   {
669     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
670     {
671       // Follow first branch
672       int t=(ba[i+b]-start)>>2;
673       j=7-b;if(t+j>=slen) j=slen-t-1;
674       for(;j>=0;j--)
675       {
676         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
677         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
678         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
679         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
680       }
681     }
682     // TODO: preferred register based on backward branch
683   }
684   // Delay slot should preferably not overwrite branch conditions or cycle count
685   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
686     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
687     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
688     hsn[CCREG]=1;
689     // ...or hash tables
690     hsn[RHASH]=1;
691     hsn[RHTBL]=1;
692   }
693   // Coprocessor load/store needs FTEMP, even if not declared
694   if(itype[i]==C1LS||itype[i]==C2LS) {
695     hsn[FTEMP]=0;
696   }
697   // Load L/R also uses FTEMP as a temporary register
698   if(itype[i]==LOADLR) {
699     hsn[FTEMP]=0;
700   }
701   // Also SWL/SWR/SDL/SDR
702   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
703     hsn[FTEMP]=0;
704   }
705   // Don't remove the TLB registers either
706   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS || itype[i]==C2LS) {
707     hsn[TLREG]=0;
708   }
709   // Don't remove the miniht registers
710   if(itype[i]==UJUMP||itype[i]==RJUMP)
711   {
712     hsn[RHASH]=0;
713     hsn[RHTBL]=0;
714   }
715 }
716
717 // We only want to allocate registers if we're going to use them again soon
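// Returns 1 if register r is read again within the next several instructions
// (stopping at unconditional jumps and syscall-type instructions), 0 otherwise.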
718 int needed_again(int r, int i)
719 {
720   int j;
721   int b=-1;
722   int rn=10;
723   
724   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
725   {
726     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
727       return 0; // Don't need any registers if exiting the block
728   }
729   for(j=0;j<9;j++)
730   {
731     if(i+j>=slen) {
732       j=slen-i-1;
733       break;
734     }
735     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
736     {
737       // Don't go past an unconditional jump
738       j++;
739       break;
740     }
741     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
742     {
743       break;
744     }
745   }
746   for(;j>=1;j--)
747   {
748     if(rs1[i+j]==r) rn=j;
749     if(rs2[i+j]==r) rn=j;
750     if((unneeded_reg[i+j]>>r)&1) rn=10;
751     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
752     {
753       b=j;
754     }
755   }
756   /*
757   if(b>=0)
758   {
759     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
760     {
761       // Follow first branch
762       int o=rn;
763       int t=(ba[i+b]-start)>>2;
764       j=7-b;if(t+j>=slen) j=slen-t-1;
765       for(;j>=0;j--)
766       {
767         if(!((unneeded_reg[t+j]>>r)&1)) {
768           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
769           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
770         }
771         else rn=o;
772       }
773     }
774   }*/
775   if(rn<10) return 1;
776   return 0;
777 }
778
779 // Try to match register allocations at the end of a loop with those
780 // at the beginning
781 int loop_reg(int i, int r, int hr)
782 {
783   int j,k;
784   for(j=0;j<9;j++)
785   {
786     if(i+j>=slen) {
787       j=slen-i-1;
788       break;
789     }
790     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
791     {
792       // Don't go past an unconditional jump
793       j++;
794       break;
795     }
796   }
797   k=0;
798   if(i>0){
799     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
800       k--;
801   }
802   for(;k<j;k++)
803   {
804     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
805     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
806     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
807     {
808       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
809       {
810         int t=(ba[i+k]-start)>>2;
811         int reg=get_reg(regs[t].regmap_entry,r);
812         if(reg>=0) return reg;
813         //reg=get_reg(regs[t+1].regmap_entry,r);
814         //if(reg>=0) return reg;
815       }
816     }
817   }
818   return hr;
819 }
820
821
822 // Allocate every register, preserving source/target regs
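// (In effect this frees every host register except those holding this
//  instruction's sources/targets, making the whole register file available.)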
823 void alloc_all(struct regstat *cur,int i)
824 {
825   int hr;
826   
827   for(hr=0;hr<HOST_REGS;hr++) {
828     if(hr!=EXCLUDE_REG) {
829       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
830          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
831       {
832         cur->regmap[hr]=-1;
833         cur->dirty&=~(1<<hr);
834       }
835       // Don't need zeros
836       if((cur->regmap[hr]&63)==0)
837       {
838         cur->regmap[hr]=-1;
839         cur->dirty&=~(1<<hr);
840       }
841     }
842   }
843 }
844
845
846 void div64(int64_t dividend,int64_t divisor)
847 {
848   lo=dividend/divisor;
849   hi=dividend%divisor;
850   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
851   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
852 }
853 void divu64(uint64_t dividend,uint64_t divisor)
854 {
855   lo=dividend/divisor;
856   hi=dividend%divisor;
857   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
858   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
859 }
860
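// 64x64 -> 128 bit multiply (DMULT/DMULTU), built from four 32x32 partial
// products so no native 128-bit type is needed; the result is written to hi:lo.
// mult64 handles signed operands by negating them and fixing up the result's sign.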
861 void mult64(uint64_t m1,uint64_t m2)
862 {
863    unsigned long long int op1, op2, op3, op4;
864    unsigned long long int result1, result2, result3, result4;
865    unsigned long long int temp1, temp2, temp3, temp4;
866    int sign = 0;
867    
868    if ((int64_t)m1 < 0)
869      {
870     op2 = -m1;
871     sign = 1 - sign;
872      }
873    else op2 = m1;
874    if ((int64_t)m2 < 0)
875      {
876     op4 = -m2;
877     sign = 1 - sign;
878      }
879    else op4 = m2;
880    
881    op1 = op2 & 0xFFFFFFFF;
882    op2 = (op2 >> 32) & 0xFFFFFFFF;
883    op3 = op4 & 0xFFFFFFFF;
884    op4 = (op4 >> 32) & 0xFFFFFFFF;
885    
886    temp1 = op1 * op3;
887    temp2 = (temp1 >> 32) + op1 * op4;
888    temp3 = op2 * op3;
889    temp4 = (temp3 >> 32) + op2 * op4;
890    
891    result1 = temp1 & 0xFFFFFFFF;
892    result2 = temp2 + (temp3 & 0xFFFFFFFF);
893    result3 = (result2 >> 32) + temp4;
894    result4 = (result3 >> 32);
895    
896    lo = result1 | (result2 << 32);
897    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
898    if (sign)
899      {
900     hi = ~hi;
901     if (!lo) hi++;
902     else lo = ~lo + 1;
903      }
904 }
905
906 void multu64(uint64_t m1,uint64_t m2)
907 {
908    unsigned long long int op1, op2, op3, op4;
909    unsigned long long int result1, result2, result3, result4;
910    unsigned long long int temp1, temp2, temp3, temp4;
911    
912    op1 = m1 & 0xFFFFFFFF;
913    op2 = (m1 >> 32) & 0xFFFFFFFF;
914    op3 = m2 & 0xFFFFFFFF;
915    op4 = (m2 >> 32) & 0xFFFFFFFF;
916    
917    temp1 = op1 * op3;
918    temp2 = (temp1 >> 32) + op1 * op4;
919    temp3 = op2 * op3;
920    temp4 = (temp3 >> 32) + op2 * op4;
921    
922    result1 = temp1 & 0xFFFFFFFF;
923    result2 = temp2 + (temp3 & 0xFFFFFFFF);
924    result3 = (result2 >> 32) + temp4;
925    result4 = (result3 >> 32);
926    
927    lo = result1 | (result2 << 32);
928    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
929    
930   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
931   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
932 }
933
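// Merge helpers for the unaligned LDL/LDR loads: keep the part of the old
// register value the instruction leaves untouched (the low bits for ldl_merge,
// the high bits for ldr_merge) and OR in the freshly loaded data.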
934 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
935 {
936   if(bits) {
937     original<<=64-bits;
938     original>>=64-bits;
939     loaded<<=bits;
940     original|=loaded;
941   }
942   else original=loaded;
943   return original;
944 }
945 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
946 {
947   if(bits^56) {
948     original>>=64-(bits^56);
949     original<<=64-(bits^56);
950     loaded>>=bits^56;
951     original|=loaded;
952   }
953   else original=loaded;
954   return original;
955 }
956
957 #ifdef __i386__
958 #include "assem_x86.c"
959 #endif
960 #ifdef __x86_64__
961 #include "assem_x64.c"
962 #endif
963 #ifdef __arm__
964 #include "assem_arm.c"
965 #endif
966
967 // Add virtual address mapping to linked list
968 void ll_add(struct ll_entry **head,int vaddr,void *addr)
969 {
970   struct ll_entry *new_entry;
971   new_entry=malloc(sizeof(struct ll_entry));
972   assert(new_entry!=NULL);
973   new_entry->vaddr=vaddr;
974   new_entry->reg32=0;
975   new_entry->addr=addr;
976   new_entry->next=*head;
977   *head=new_entry;
978 }
979
980 // Add virtual address mapping for 32-bit compiled block
981 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
982 {
983   ll_add(head,vaddr,addr);
984 #ifndef FORCE32
985   (*head)->reg32=reg32;
986 #endif
987 }
988
989 // Check if an address is already compiled
990 // but don't return addresses which are about to expire from the cache
991 void *check_addr(u_int vaddr)
992 {
993   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
994   if(ht_bin[0]==vaddr) {
995     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
996       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
997   }
998   if(ht_bin[2]==vaddr) {
999     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1000       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
1001   }
1002   u_int page=get_page(vaddr);
1003   struct ll_entry *head;
1004   head=jump_in[page];
1005   while(head!=NULL) {
1006     if(head->vaddr==vaddr&&head->reg32==0) {
1007       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1008         // Update existing entry with current address
1009         if(ht_bin[0]==vaddr) {
1010           ht_bin[1]=(int)head->addr;
1011           return head->addr;
1012         }
1013         if(ht_bin[2]==vaddr) {
1014           ht_bin[3]=(int)head->addr;
1015           return head->addr;
1016         }
1017         // Insert into hash table with low priority.
1018         // Don't evict existing entries, as they are probably
1019         // addresses that are being accessed frequently.
1020         if(ht_bin[0]==-1) {
1021           ht_bin[1]=(int)head->addr;
1022           ht_bin[0]=vaddr;
1023         }else if(ht_bin[2]==-1) {
1024           ht_bin[3]=(int)head->addr;
1025           ht_bin[2]=vaddr;
1026         }
1027         return head->addr;
1028       }
1029     }
1030     head=head->next;
1031   }
1032   return 0;
1033 }
1034
1035 void remove_hash(int vaddr)
1036 {
1037   //printf("remove hash: %x\n",vaddr);
1038   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1039   if(ht_bin[2]==vaddr) {
1040     ht_bin[2]=ht_bin[3]=-1;
1041   }
1042   if(ht_bin[0]==vaddr) {
1043     ht_bin[0]=ht_bin[2];
1044     ht_bin[1]=ht_bin[3];
1045     ht_bin[2]=ht_bin[3]=-1;
1046   }
1047 }
1048
1049 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1050 {
1051   struct ll_entry *next;
1052   while(*head) {
1053     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1054        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1055     {
1056       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1057       remove_hash((*head)->vaddr);
1058       next=(*head)->next;
1059       free(*head);
1060       *head=next;
1061     }
1062     else
1063     {
1064       head=&((*head)->next);
1065     }
1066   }
1067 }
1068
1069 // Remove all entries from linked list
1070 void ll_clear(struct ll_entry **head)
1071 {
1072   struct ll_entry *cur;
1073   struct ll_entry *next;
1074   if((cur=*head)!=NULL) {
1075     *head=0;
1076     while(cur) {
1077       next=cur->next;
1078       free(cur);
1079       cur=next;
1080     }
1081   }
1082 }
1083
1084 // Dereference the pointers and remove if it matches
1085 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1086 {
1087   while(head) {
1088     int ptr=get_pointer(head->addr);
1089     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1090     if(((ptr>>shift)==(addr>>shift)) ||
1091        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1092     {
1093       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1094       u_int host_addr=(u_int)kill_pointer(head->addr);
1095       #ifdef __arm__
1096         needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1097       #endif
1098     }
1099     head=head->next;
1100   }
1101 }
1102
1103 // This is called when we write to a compiled block (see do_invstub)
1104 void invalidate_page(u_int page)
1105 {
1106   struct ll_entry *head;
1107   struct ll_entry *next;
1108   head=jump_in[page];
1109   jump_in[page]=0;
1110   while(head!=NULL) {
1111     inv_debug("INVALIDATE: %x\n",head->vaddr);
1112     remove_hash(head->vaddr);
1113     next=head->next;
1114     free(head);
1115     head=next;
1116   }
1117   head=jump_out[page];
1118   jump_out[page]=0;
1119   while(head!=NULL) {
1120     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1121     u_int host_addr=(u_int)kill_pointer(head->addr);
1122     #ifdef __arm__
1123       needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1124     #endif
1125     next=head->next;
1126     free(head);
1127     head=next;
1128   }
1129 }
1130 void invalidate_block(u_int block)
1131 {
1132   u_int page=get_page(block<<12);
1133   u_int vpage=get_vpage(block<<12);
1134   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1135   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1136   u_int first,last;
1137   first=last=page;
1138   struct ll_entry *head;
1139   head=jump_dirty[vpage];
1140   //printf("page=%d vpage=%d\n",page,vpage);
1141   while(head!=NULL) {
1142     u_int start,end;
1143     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1144       get_bounds((int)head->addr,&start,&end);
1145       //printf("start: %x end: %x\n",start,end);
1146       if(page<2048&&start>=0x80000000&&end<0x80000000+RAM_SIZE) {
1147         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1148           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1149           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1150         }
1151       }
1152 #ifndef DISABLE_TLB
1153       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1154         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1155           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1156           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1157         }
1158       }
1159 #endif
1160     }
1161     head=head->next;
1162   }
1163   //printf("first=%d last=%d\n",first,last);
1164   invalidate_page(page);
1165   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1166   assert(last<page+5);
1167   // Invalidate the adjacent pages if a block crosses a 4K boundary
1168   while(first<page) {
1169     invalidate_page(first);
1170     first++;
1171   }
1172   for(first=page+1;first<last;first++) {
1173     invalidate_page(first);
1174   }
1175   #ifdef __arm__
1176     do_clear_cache();
1177   #endif
1178   
1179   // Don't trap writes
1180   invalid_code[block]=1;
1181 #ifdef PCSX
1182   invalid_code[((u_int)0x80000000>>12)|page]=1;
1183 #endif
1184 #ifndef DISABLE_TLB
1185   // If there is a valid TLB entry for this page, remove write protect
1186   if(tlb_LUT_w[block]) {
1187     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1188     // CHECK: Is this right?
1189     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1190     u_int real_block=tlb_LUT_w[block]>>12;
1191     invalid_code[real_block]=1;
1192     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1193   }
1194   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1195 #endif
1196
1197   #ifdef USE_MINI_HT
1198   memset(mini_ht,-1,sizeof(mini_ht));
1199   #endif
1200 }
1201 void invalidate_addr(u_int addr)
1202 {
1203   invalidate_block(addr>>12);
1204 }
1205 // This is called when loading a save state.
1206 // Anything could have changed, so invalidate everything.
1207 void invalidate_all_pages()
1208 {
1209   u_int page,n;
1210   for(page=0;page<4096;page++)
1211     invalidate_page(page);
1212   for(page=0;page<1048576;page++)
1213     if(!invalid_code[page]) {
1214       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1215       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1216     }
1217   #ifdef __arm__
1218   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1219   #endif
1220   #ifdef USE_MINI_HT
1221   memset(mini_ht,-1,sizeof(mini_ht));
1222   #endif
1223   #ifndef DISABLE_TLB
1224   // TLB
1225   for(page=0;page<0x100000;page++) {
1226     if(tlb_LUT_r[page]) {
1227       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1228       if(!tlb_LUT_w[page]||!invalid_code[page])
1229         memory_map[page]|=0x40000000; // Write protect
1230     }
1231     else memory_map[page]=-1;
1232     if(page==0x80000) page=0xC0000;
1233   }
1234   tlb_hacks();
1235   #endif
1236 }
1237
1238 // Add an entry to jump_out after making a link
1239 void add_link(u_int vaddr,void *src)
1240 {
1241   u_int page=get_page(vaddr);
1242   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1243   ll_add(jump_out+page,vaddr,src);
1244   //int ptr=get_pointer(src);
1245   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1246 }
1247
1248 // If a code block was found to be unmodified (bit was set in
1249 // restore_candidate) and it remains unmodified (bit is clear
1250 // in invalid_code) then move the entries for that 4K page from
1251 // the dirty list to the clean list.
1252 void clean_blocks(u_int page)
1253 {
1254   struct ll_entry *head;
1255   inv_debug("INV: clean_blocks page=%d\n",page);
1256   head=jump_dirty[page];
1257   while(head!=NULL) {
1258     if(!invalid_code[head->vaddr>>12]) {
1259       // Don't restore blocks which are about to expire from the cache
1260       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1261         u_int start,end;
1262         if(verify_dirty((int)head->addr)) {
1263           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1264           u_int i;
1265           u_int inv=0;
1266           get_bounds((int)head->addr,&start,&end);
1267           if(start-(u_int)rdram<RAM_SIZE) {
1268             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1269               inv|=invalid_code[i];
1270             }
1271           }
1272           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1273             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1274             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1275             if(addr<start||addr>=end) inv=1;
1276           }
1277           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1278             inv=1;
1279           }
1280           if(!inv) {
1281             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1282             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1283               u_int ppage=page;
1284 #ifndef DISABLE_TLB
1285               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1286 #endif
1287               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1288               //printf("page=%x, addr=%x\n",page,head->vaddr);
1289               //assert(head->vaddr>>12==(page|0x80000));
1290               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1291               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1292               if(!head->reg32) {
1293                 if(ht_bin[0]==head->vaddr) {
1294                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1295                 }
1296                 if(ht_bin[2]==head->vaddr) {
1297                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1298                 }
1299               }
1300             }
1301           }
1302         }
1303       }
1304     }
1305     head=head->next;
1306   }
1307 }
1308
1309
1310 void mov_alloc(struct regstat *current,int i)
1311 {
1312   // Note: Don't need to actually alloc the source registers
1313   if((~current->is32>>rs1[i])&1) {
1314     //alloc_reg64(current,i,rs1[i]);
1315     alloc_reg64(current,i,rt1[i]);
1316     current->is32&=~(1LL<<rt1[i]);
1317   } else {
1318     //alloc_reg(current,i,rs1[i]);
1319     alloc_reg(current,i,rt1[i]);
1320     current->is32|=(1LL<<rt1[i]);
1321   }
1322   clear_const(current,rs1[i]);
1323   clear_const(current,rt1[i]);
1324   dirty_reg(current,rt1[i]);
1325 }
1326
1327 void shiftimm_alloc(struct regstat *current,int i)
1328 {
1329   clear_const(current,rs1[i]);
1330   clear_const(current,rt1[i]);
1331   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1332   {
1333     if(rt1[i]) {
1334       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1335       else lt1[i]=rs1[i];
1336       alloc_reg(current,i,rt1[i]);
1337       current->is32|=1LL<<rt1[i];
1338       dirty_reg(current,rt1[i]);
1339     }
1340   }
1341   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1342   {
1343     if(rt1[i]) {
1344       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1345       alloc_reg64(current,i,rt1[i]);
1346       current->is32&=~(1LL<<rt1[i]);
1347       dirty_reg(current,rt1[i]);
1348     }
1349   }
1350   if(opcode2[i]==0x3c) // DSLL32
1351   {
1352     if(rt1[i]) {
1353       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1354       alloc_reg64(current,i,rt1[i]);
1355       current->is32&=~(1LL<<rt1[i]);
1356       dirty_reg(current,rt1[i]);
1357     }
1358   }
1359   if(opcode2[i]==0x3e) // DSRL32
1360   {
1361     if(rt1[i]) {
1362       alloc_reg64(current,i,rs1[i]);
1363       if(imm[i]==32) {
1364         alloc_reg64(current,i,rt1[i]);
1365         current->is32&=~(1LL<<rt1[i]);
1366       } else {
1367         alloc_reg(current,i,rt1[i]);
1368         current->is32|=1LL<<rt1[i];
1369       }
1370       dirty_reg(current,rt1[i]);
1371     }
1372   }
1373   if(opcode2[i]==0x3f) // DSRA32
1374   {
1375     if(rt1[i]) {
1376       alloc_reg64(current,i,rs1[i]);
1377       alloc_reg(current,i,rt1[i]);
1378       current->is32|=1LL<<rt1[i];
1379       dirty_reg(current,rt1[i]);
1380     }
1381   }
1382 }
1383
1384 void shift_alloc(struct regstat *current,int i)
1385 {
1386   if(rt1[i]) {
1387     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1388     {
1389       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1390       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1391       alloc_reg(current,i,rt1[i]);
1392       if(rt1[i]==rs2[i]) {
1393         alloc_reg_temp(current,i,-1);
1394         minimum_free_regs[i]=1;
1395       }
1396       current->is32|=1LL<<rt1[i];
1397     } else { // DSLLV/DSRLV/DSRAV
1398       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1399       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1400       alloc_reg64(current,i,rt1[i]);
1401       current->is32&=~(1LL<<rt1[i]);
1402       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1403       {
1404         alloc_reg_temp(current,i,-1);
1405         minimum_free_regs[i]=1;
1406       }
1407     }
1408     clear_const(current,rs1[i]);
1409     clear_const(current,rs2[i]);
1410     clear_const(current,rt1[i]);
1411     dirty_reg(current,rt1[i]);
1412   }
1413 }
1414
1415 void alu_alloc(struct regstat *current,int i)
1416 {
1417   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1418     if(rt1[i]) {
1419       if(rs1[i]&&rs2[i]) {
1420         alloc_reg(current,i,rs1[i]);
1421         alloc_reg(current,i,rs2[i]);
1422       }
1423       else {
1424         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1425         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1426       }
1427       alloc_reg(current,i,rt1[i]);
1428     }
1429     current->is32|=1LL<<rt1[i];
1430   }
1431   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1432     if(rt1[i]) {
1433       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1434       {
1435         alloc_reg64(current,i,rs1[i]);
1436         alloc_reg64(current,i,rs2[i]);
1437         alloc_reg(current,i,rt1[i]);
1438       } else {
1439         alloc_reg(current,i,rs1[i]);
1440         alloc_reg(current,i,rs2[i]);
1441         alloc_reg(current,i,rt1[i]);
1442       }
1443     }
1444     current->is32|=1LL<<rt1[i];
1445   }
1446   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1447     if(rt1[i]) {
1448       if(rs1[i]&&rs2[i]) {
1449         alloc_reg(current,i,rs1[i]);
1450         alloc_reg(current,i,rs2[i]);
1451       }
1452       else
1453       {
1454         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1455         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1456       }
1457       alloc_reg(current,i,rt1[i]);
1458       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1459       {
1460         if(!((current->uu>>rt1[i])&1)) {
1461           alloc_reg64(current,i,rt1[i]);
1462         }
1463         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1464           if(rs1[i]&&rs2[i]) {
1465             alloc_reg64(current,i,rs1[i]);
1466             alloc_reg64(current,i,rs2[i]);
1467           }
1468           else
1469           {
1470             // Is is really worth it to keep 64-bit values in registers?
1471             #ifdef NATIVE_64BIT
1472             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1473             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1474             #endif
1475           }
1476         }
1477         current->is32&=~(1LL<<rt1[i]);
1478       } else {
1479         current->is32|=1LL<<rt1[i];
1480       }
1481     }
1482   }
1483   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1484     if(rt1[i]) {
1485       if(rs1[i]&&rs2[i]) {
1486         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1487           alloc_reg64(current,i,rs1[i]);
1488           alloc_reg64(current,i,rs2[i]);
1489           alloc_reg64(current,i,rt1[i]);
1490         } else {
1491           alloc_reg(current,i,rs1[i]);
1492           alloc_reg(current,i,rs2[i]);
1493           alloc_reg(current,i,rt1[i]);
1494         }
1495       }
1496       else {
1497         alloc_reg(current,i,rt1[i]);
1498         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1499           // DADD used as move, or zeroing
1500           // If we have a 64-bit source, then make the target 64 bits too
1501           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1502             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1503             alloc_reg64(current,i,rt1[i]);
1504           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1505             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1506             alloc_reg64(current,i,rt1[i]);
1507           }
1508           if(opcode2[i]>=0x2e&&rs2[i]) {
1509             // DSUB used as negation - 64-bit result
1510             // If we have a 32-bit register, extend it to 64 bits
1511             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1512             alloc_reg64(current,i,rt1[i]);
1513           }
1514         }
1515       }
1516       if(rs1[i]&&rs2[i]) {
1517         current->is32&=~(1LL<<rt1[i]);
1518       } else if(rs1[i]) {
1519         current->is32&=~(1LL<<rt1[i]);
1520         if((current->is32>>rs1[i])&1)
1521           current->is32|=1LL<<rt1[i];
1522       } else if(rs2[i]) {
1523         current->is32&=~(1LL<<rt1[i]);
1524         if((current->is32>>rs2[i])&1)
1525           current->is32|=1LL<<rt1[i];
1526       } else {
1527         current->is32|=1LL<<rt1[i];
1528       }
1529     }
1530   }
1531   clear_const(current,rs1[i]);
1532   clear_const(current,rs2[i]);
1533   clear_const(current,rt1[i]);
1534   dirty_reg(current,rt1[i]);
1535 }
1536
1537 void imm16_alloc(struct regstat *current,int i)
1538 {
1539   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1540   else lt1[i]=rs1[i];
1541   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1542   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1543     current->is32&=~(1LL<<rt1[i]);
1544     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1545       // TODO: Could preserve the 32-bit flag if the immediate is zero
1546       alloc_reg64(current,i,rt1[i]);
1547       alloc_reg64(current,i,rs1[i]);
1548     }
1549     clear_const(current,rs1[i]);
1550     clear_const(current,rt1[i]);
1551   }
1552   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1553     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1554     current->is32|=1LL<<rt1[i];
1555     clear_const(current,rs1[i]);
1556     clear_const(current,rt1[i]);
1557   }
1558   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1559     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1560       if(rs1[i]!=rt1[i]) {
1561         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1562         alloc_reg64(current,i,rt1[i]);
1563         current->is32&=~(1LL<<rt1[i]);
1564       }
1565     }
1566     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1567     if(is_const(current,rs1[i])) {
1568       int v=get_const(current,rs1[i]);
1569       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1570       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1571       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1572     }
1573     else clear_const(current,rt1[i]);
1574   }
1575   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1576     if(is_const(current,rs1[i])) {
1577       int v=get_const(current,rs1[i]);
1578       set_const(current,rt1[i],v+imm[i]);
1579     }
1580     else clear_const(current,rt1[i]);
1581     current->is32|=1LL<<rt1[i];
1582   }
1583   else {
1584     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1585     current->is32|=1LL<<rt1[i];
1586   }
1587   dirty_reg(current,rt1[i]);
1588 }
1589
1590 void load_alloc(struct regstat *current,int i)
1591 {
1592   clear_const(current,rt1[i]);
1593   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1594   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1595   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1596   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1597     alloc_reg(current,i,rt1[i]);
1598     assert(get_reg(current->regmap,rt1[i])>=0);
1599     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1600     {
1601       current->is32&=~(1LL<<rt1[i]);
1602       alloc_reg64(current,i,rt1[i]);
1603     }
1604     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1605     {
1606       current->is32&=~(1LL<<rt1[i]);
1607       alloc_reg64(current,i,rt1[i]);
1608       alloc_all(current,i);
1609       alloc_reg64(current,i,FTEMP);
1610       minimum_free_regs[i]=HOST_REGS;
1611     }
1612     else current->is32|=1LL<<rt1[i];
1613     dirty_reg(current,rt1[i]);
1614     // If using TLB, need a register for pointer to the mapping table
1615     if(using_tlb) alloc_reg(current,i,TLREG);
1616     // LWL/LWR need a temporary register for the old value
1617     if(opcode[i]==0x22||opcode[i]==0x26)
1618     {
1619       alloc_reg(current,i,FTEMP);
1620       alloc_reg_temp(current,i,-1);
1621       minimum_free_regs[i]=1;
1622     }
1623   }
1624   else
1625   {
1626     // Load to r0 or unneeded register (dummy load)
1627     // but we still need a register to calculate the address
1628     if(opcode[i]==0x22||opcode[i]==0x26)
1629     {
1630       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1631     }
1632     // If using TLB, need a register for pointer to the mapping table
1633     if(using_tlb) alloc_reg(current,i,TLREG);
1634     alloc_reg_temp(current,i,-1);
1635     minimum_free_regs[i]=1;
1636     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1637     {
1638       alloc_all(current,i);
1639       alloc_reg64(current,i,FTEMP);
1640       minimum_free_regs[i]=HOST_REGS;
1641     }
1642   }
1643 }
1644
1645 void store_alloc(struct regstat *current,int i)
1646 {
1647   clear_const(current,rs2[i]);
1648   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1649   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1650   alloc_reg(current,i,rs2[i]);
1651   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1652     alloc_reg64(current,i,rs2[i]);
1653     if(rs2[i]) alloc_reg(current,i,FTEMP);
1654   }
1655   // If using TLB, need a register for pointer to the mapping table
1656   if(using_tlb) alloc_reg(current,i,TLREG);
1657   #if defined(HOST_IMM8)
1658   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1659   else alloc_reg(current,i,INVCP);
1660   #endif
1661   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1662     alloc_reg(current,i,FTEMP);
1663   }
1664   // We need a temporary register for address generation
1665   alloc_reg_temp(current,i,-1);
1666   minimum_free_regs[i]=1;
1667 }
1668
1669 void c1ls_alloc(struct regstat *current,int i)
1670 {
1671   //clear_const(current,rs1[i]); // FIXME
1672   clear_const(current,rt1[i]);
1673   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1674   alloc_reg(current,i,CSREG); // Status
1675   alloc_reg(current,i,FTEMP);
1676   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1677     alloc_reg64(current,i,FTEMP);
1678   }
1679   // If using TLB, need a register for pointer to the mapping table
1680   if(using_tlb) alloc_reg(current,i,TLREG);
1681   #if defined(HOST_IMM8)
1682   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1683   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1684     alloc_reg(current,i,INVCP);
1685   #endif
1686   // We need a temporary register for address generation
1687   alloc_reg_temp(current,i,-1);
1688 }
1689
1690 void c2ls_alloc(struct regstat *current,int i)
1691 {
1692   clear_const(current,rt1[i]);
1693   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1694   alloc_reg(current,i,FTEMP);
1695   // If using TLB, need a register for pointer to the mapping table
1696   if(using_tlb) alloc_reg(current,i,TLREG);
1697   #if defined(HOST_IMM8)
1698   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1699   else if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1700     alloc_reg(current,i,INVCP);
1701   #endif
1702   // We need a temporary register for address generation
1703   alloc_reg_temp(current,i,-1);
1704   minimum_free_regs[i]=1;
1705 }
1706
1707 #ifndef multdiv_alloc
1708 void multdiv_alloc(struct regstat *current,int i)
1709 {
1710   //  case 0x18: MULT
1711   //  case 0x19: MULTU
1712   //  case 0x1A: DIV
1713   //  case 0x1B: DIVU
1714   //  case 0x1C: DMULT
1715   //  case 0x1D: DMULTU
1716   //  case 0x1E: DDIV
1717   //  case 0x1F: DDIVU
1718   clear_const(current,rs1[i]);
1719   clear_const(current,rs2[i]);
1720   if(rs1[i]&&rs2[i])
1721   {
1722     if((opcode2[i]&4)==0) // 32-bit
1723     {
1724       current->u&=~(1LL<<HIREG);
1725       current->u&=~(1LL<<LOREG);
1726       alloc_reg(current,i,HIREG);
1727       alloc_reg(current,i,LOREG);
1728       alloc_reg(current,i,rs1[i]);
1729       alloc_reg(current,i,rs2[i]);
1730       current->is32|=1LL<<HIREG;
1731       current->is32|=1LL<<LOREG;
1732       dirty_reg(current,HIREG);
1733       dirty_reg(current,LOREG);
1734     }
1735     else // 64-bit
1736     {
1737       current->u&=~(1LL<<HIREG);
1738       current->u&=~(1LL<<LOREG);
1739       current->uu&=~(1LL<<HIREG);
1740       current->uu&=~(1LL<<LOREG);
1741       alloc_reg64(current,i,HIREG);
1742       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1743       alloc_reg64(current,i,rs1[i]);
1744       alloc_reg64(current,i,rs2[i]);
1745       alloc_all(current,i);
1746       current->is32&=~(1LL<<HIREG);
1747       current->is32&=~(1LL<<LOREG);
1748       dirty_reg(current,HIREG);
1749       dirty_reg(current,LOREG);
1750       minimum_free_regs[i]=HOST_REGS;
1751     }
1752   }
1753   else
1754   {
1755     // Multiply by zero is zero.
1756     // MIPS does not have a divide by zero exception.
1757     // The result is undefined, so we return zero.
1758     alloc_reg(current,i,HIREG);
1759     alloc_reg(current,i,LOREG);
1760     current->is32|=1LL<<HIREG;
1761     current->is32|=1LL<<LOREG;
1762     dirty_reg(current,HIREG);
1763     dirty_reg(current,LOREG);
1764   }
1765 }
1766 #endif
1767
1768 void cop0_alloc(struct regstat *current,int i)
1769 {
1770   if(opcode2[i]==0) // MFC0
1771   {
1772     if(rt1[i]) {
1773       clear_const(current,rt1[i]);
1774       alloc_all(current,i);
1775       alloc_reg(current,i,rt1[i]);
1776       current->is32|=1LL<<rt1[i];
1777       dirty_reg(current,rt1[i]);
1778     }
1779   }
1780   else if(opcode2[i]==4) // MTC0
1781   {
1782     if(rs1[i]){
1783       clear_const(current,rs1[i]);
1784       alloc_reg(current,i,rs1[i]);
1785       alloc_all(current,i);
1786     }
1787     else {
1788       alloc_all(current,i); // FIXME: Keep r0
1789       current->u&=~1LL;
1790       alloc_reg(current,i,0);
1791     }
1792   }
1793   else
1794   {
1795     // TLBR/TLBWI/TLBWR/TLBP/ERET
1796     assert(opcode2[i]==0x10);
1797     alloc_all(current,i);
1798   }
1799   minimum_free_regs[i]=HOST_REGS;
1800 }
1801
1802 void cop1_alloc(struct regstat *current,int i)
1803 {
1804   alloc_reg(current,i,CSREG); // Load status
1805   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1806   {
1807     if(rt1[i]){
1808       clear_const(current,rt1[i]);
1809       if(opcode2[i]==1) {
1810         alloc_reg64(current,i,rt1[i]); // DMFC1
1811         current->is32&=~(1LL<<rt1[i]);
1812       }else{
1813         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1814         current->is32|=1LL<<rt1[i];
1815       }
1816       dirty_reg(current,rt1[i]);
1817     }
1818     alloc_reg_temp(current,i,-1);
1819   }
1820   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1821   {
1822     if(rs1[i]){
1823       clear_const(current,rs1[i]);
1824       if(opcode2[i]==5)
1825         alloc_reg64(current,i,rs1[i]); // DMTC1
1826       else
1827         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1828       alloc_reg_temp(current,i,-1);
1829     }
1830     else {
1831       current->u&=~1LL;
1832       alloc_reg(current,i,0);
1833       alloc_reg_temp(current,i,-1);
1834     }
1835   }
1836   minimum_free_regs[i]=1;
1837 }
1838 void fconv_alloc(struct regstat *current,int i)
1839 {
1840   alloc_reg(current,i,CSREG); // Load status
1841   alloc_reg_temp(current,i,-1);
1842   minimum_free_regs[i]=1;
1843 }
1844 void float_alloc(struct regstat *current,int i)
1845 {
1846   alloc_reg(current,i,CSREG); // Load status
1847   alloc_reg_temp(current,i,-1);
1848   minimum_free_regs[i]=1;
1849 }
1850 void c2op_alloc(struct regstat *current,int i)
1851 {
1852   alloc_reg_temp(current,i,-1);
1853 }
1854 void fcomp_alloc(struct regstat *current,int i)
1855 {
1856   alloc_reg(current,i,CSREG); // Load status
1857   alloc_reg(current,i,FSREG); // Load flags
1858   dirty_reg(current,FSREG); // Flag will be modified
1859   alloc_reg_temp(current,i,-1);
1860   minimum_free_regs[i]=1;
1861 }
1862
1863 void syscall_alloc(struct regstat *current,int i)
1864 {
1865   alloc_cc(current,i);
1866   dirty_reg(current,CCREG);
1867   alloc_all(current,i);
1868   minimum_free_regs[i]=HOST_REGS;
1869   current->isconst=0;
1870 }
1871
1872 void delayslot_alloc(struct regstat *current,int i)
1873 {
1874   switch(itype[i]) {
1875     case UJUMP:
1876     case CJUMP:
1877     case SJUMP:
1878     case RJUMP:
1879     case FJUMP:
1880     case SYSCALL:
1881     case HLECALL:
1882     case SPAN:
1883       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1884       printf("Disabled speculative precompilation\n");
1885       stop_after_jal=1;
1886       break;
1887     case IMM16:
1888       imm16_alloc(current,i);
1889       break;
1890     case LOAD:
1891     case LOADLR:
1892       load_alloc(current,i);
1893       break;
1894     case STORE:
1895     case STORELR:
1896       store_alloc(current,i);
1897       break;
1898     case ALU:
1899       alu_alloc(current,i);
1900       break;
1901     case SHIFT:
1902       shift_alloc(current,i);
1903       break;
1904     case MULTDIV:
1905       multdiv_alloc(current,i);
1906       break;
1907     case SHIFTIMM:
1908       shiftimm_alloc(current,i);
1909       break;
1910     case MOV:
1911       mov_alloc(current,i);
1912       break;
1913     case COP0:
1914       cop0_alloc(current,i);
1915       break;
1916     case COP1:
1917     case COP2:
1918       cop1_alloc(current,i);
1919       break;
1920     case C1LS:
1921       c1ls_alloc(current,i);
1922       break;
1923     case C2LS:
1924       c2ls_alloc(current,i);
1925       break;
1926     case FCONV:
1927       fconv_alloc(current,i);
1928       break;
1929     case FLOAT:
1930       float_alloc(current,i);
1931       break;
1932     case FCOMP:
1933       fcomp_alloc(current,i);
1934       break;
1935     case C2OP:
1936       c2op_alloc(current,i);
1937       break;
1938   }
1939 }
1940
1941 // Special case where a branch and delay slot span two pages in virtual memory
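// Here everything is flushed: all host registers are spilled (alloc_all,
// minimum_free_regs=HOST_REGS) and only the cycle counter plus the branch's
// own source/target registers are claimed below.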
1942 static void pagespan_alloc(struct regstat *current,int i)
1943 {
1944   current->isconst=0;
1945   current->wasconst=0;
1946   regs[i].wasconst=0;
1947   minimum_free_regs[i]=HOST_REGS;
1948   alloc_all(current,i);
1949   alloc_cc(current,i);
1950   dirty_reg(current,CCREG);
1951   if(opcode[i]==3) // JAL
1952   {
1953     alloc_reg(current,i,31);
1954     dirty_reg(current,31);
1955   }
1956   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1957   {
1958     alloc_reg(current,i,rs1[i]);
1959     if (rt1[i]!=0) {
1960       alloc_reg(current,i,rt1[i]);
1961       dirty_reg(current,rt1[i]);
1962     }
1963   }
1964   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1965   {
1966     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1967     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1968     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1969     {
1970       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1971       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1972     }
1973   }
1974   else
1975   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1976   {
1977     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1978     if(!((current->is32>>rs1[i])&1))
1979     {
1980       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1981     }
1982   }
1983   else
1984   if(opcode[i]==0x11) // BC1
1985   {
1986     alloc_reg(current,i,FSREG);
1987     alloc_reg(current,i,CSREG);
1988   }
1989   //else ...
1990 }
1991
1992 void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1993 {
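  // Stub table layout, one row per pending out-of-line fixup:
  //   [0]=stub type, [1]=address of the branch to patch, [2]=return address
  //   into the generated code, [3..7]=type-specific arguments (see callers,
  //   e.g. add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist)).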
1994   stubs[stubcount][0]=type;
1995   stubs[stubcount][1]=addr;
1996   stubs[stubcount][2]=retaddr;
1997   stubs[stubcount][3]=a;
1998   stubs[stubcount][4]=b;
1999   stubs[stubcount][5]=c;
2000   stubs[stubcount][6]=d;
2001   stubs[stubcount][7]=e;
2002   stubcount++;
2003 }
2004
2005 // Write out a single register
2006 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
2007 {
2008   int hr;
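  // regmap[] values 0..63 name the low 32 bits of a MIPS register; the same
  // value with bit 6 set (|64) marks a host register holding the upper half
  // of a 64-bit value.  Dirty halves are written back here; unless FORCE32,
  // a 32-bit value also gets its sign extension stored as the upper half.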
2009   for(hr=0;hr<HOST_REGS;hr++) {
2010     if(hr!=EXCLUDE_REG) {
2011       if((regmap[hr]&63)==r) {
2012         if((dirty>>hr)&1) {
2013           if(regmap[hr]<64) {
2014             emit_storereg(r,hr);
2015 #ifndef FORCE32
2016             if((is32>>regmap[hr])&1) {
2017               emit_sarimm(hr,31,hr);
2018               emit_storereg(r|64,hr);
2019             }
2020 #endif
2021           }else{
2022             emit_storereg(r|64,hr);
2023           }
2024         }
2025       }
2026     }
2027   }
2028 }
2029
2030 int mchecksum()
2031 {
2032   //if(!tracedebug) return 0;
2033   int i;
2034   int sum=0;
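  // Debug checksum over RDRAM: each step rotates sum left by one bit (with
  // the rotated-out bit complemented) and xors in the next word.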
2035   for(i=0;i<2097152;i++) {
2036     unsigned int temp=sum;
2037     sum<<=1;
2038     sum|=(~temp)>>31;
2039     sum^=((u_int *)rdram)[i];
2040   }
2041   return sum;
2042 }
2043 int rchecksum()
2044 {
2045   int i;
2046   int sum=0;
2047   for(i=0;i<64;i++)
2048     sum^=((u_int *)reg)[i];
2049   return sum;
2050 }
2051 void rlist()
2052 {
2053   int i;
2054   printf("TRACE: ");
2055   for(i=0;i<32;i++)
2056     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
2057   printf("\n");
2058 #ifndef DISABLE_COP1
2059   printf("TRACE: ");
2060   for(i=0;i<32;i++)
2061     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
2062   printf("\n");
2063 #endif
2064 }
2065
2066 void enabletrace()
2067 {
2068   tracedebug=1;
2069 }
2070
2071 void memdebug(int i)
2072 {
2073   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
2074   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
2075   //rlist();
2076   //if(tracedebug) {
2077   //if(Count>=-2084597794) {
2078   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
2079   //if(0) {
2080     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
2081     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2082     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2083     rlist();
2084     #ifdef __i386__
2085     printf("TRACE: %x\n",(&i)[-1]);
2086     #endif
2087     #ifdef __arm__
2088     int j;
2089     printf("TRACE: %x \n",(&j)[10]);
2090     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2091     #endif
2092     //fflush(stdout);
2093   }
2094   //printf("TRACE: %x\n",(&i)[-1]);
2095 }
2096
2097 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2098 {
2099   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2100 }
2101
2102 void alu_assemble(int i,struct regstat *i_regs)
2103 {
2104   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2105     if(rt1[i]) {
2106       signed char s1,s2,t;
2107       t=get_reg(i_regs->regmap,rt1[i]);
2108       if(t>=0) {
2109         s1=get_reg(i_regs->regmap,rs1[i]);
2110         s2=get_reg(i_regs->regmap,rs2[i]);
2111         if(rs1[i]&&rs2[i]) {
2112           assert(s1>=0);
2113           assert(s2>=0);
2114           if(opcode2[i]&2) emit_sub(s1,s2,t);
2115           else emit_add(s1,s2,t);
2116         }
2117         else if(rs1[i]) {
2118           if(s1>=0) emit_mov(s1,t);
2119           else emit_loadreg(rs1[i],t);
2120         }
2121         else if(rs2[i]) {
2122           if(s2>=0) {
2123             if(opcode2[i]&2) emit_neg(s2,t);
2124             else emit_mov(s2,t);
2125           }
2126           else {
2127             emit_loadreg(rs2[i],t);
2128             if(opcode2[i]&2) emit_neg(t,t);
2129           }
2130         }
2131         else emit_zeroreg(t);
2132       }
2133     }
2134   }
2135   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
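    // 64-bit ops are split across two host registers: the low words use the
    // flag-setting adds/subs, and for subtraction the high words consume the
    // borrow (sbc, or sbb with the INVERTED_CARRY adjustment on hosts such
    // as x86 whose subtraction sets a borrow rather than an ARM-style carry).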
2136     if(rt1[i]) {
2137       signed char s1l,s2l,s1h,s2h,tl,th;
2138       tl=get_reg(i_regs->regmap,rt1[i]);
2139       th=get_reg(i_regs->regmap,rt1[i]|64);
2140       if(tl>=0) {
2141         s1l=get_reg(i_regs->regmap,rs1[i]);
2142         s2l=get_reg(i_regs->regmap,rs2[i]);
2143         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2144         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2145         if(rs1[i]&&rs2[i]) {
2146           assert(s1l>=0);
2147           assert(s2l>=0);
2148           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2149           else emit_adds(s1l,s2l,tl);
2150           if(th>=0) {
2151             #ifdef INVERTED_CARRY
2152             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2153             #else
2154             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2155             #endif
2156             else emit_add(s1h,s2h,th);
2157           }
2158         }
2159         else if(rs1[i]) {
2160           if(s1l>=0) emit_mov(s1l,tl);
2161           else emit_loadreg(rs1[i],tl);
2162           if(th>=0) {
2163             if(s1h>=0) emit_mov(s1h,th);
2164             else emit_loadreg(rs1[i]|64,th);
2165           }
2166         }
2167         else if(rs2[i]) {
2168           if(s2l>=0) {
2169             if(opcode2[i]&2) emit_negs(s2l,tl);
2170             else emit_mov(s2l,tl);
2171           }
2172           else {
2173             emit_loadreg(rs2[i],tl);
2174             if(opcode2[i]&2) emit_negs(tl,tl);
2175           }
2176           if(th>=0) {
2177             #ifdef INVERTED_CARRY
2178             if(s2h>=0) emit_mov(s2h,th);
2179             else emit_loadreg(rs2[i]|64,th);
2180             if(opcode2[i]&2) {
2181               emit_adcimm(-1,th); // x86 has inverted carry flag
2182               emit_not(th,th);
2183             }
2184             #else
2185             if(opcode2[i]&2) {
2186               if(s2h>=0) emit_rscimm(s2h,0,th);
2187               else {
2188                 emit_loadreg(rs2[i]|64,th);
2189                 emit_rscimm(th,0,th);
2190               }
2191             }else{
2192               if(s2h>=0) emit_mov(s2h,th);
2193               else emit_loadreg(rs2[i]|64,th);
2194             }
2195             #endif
2196           }
2197         }
2198         else {
2199           emit_zeroreg(tl);
2200           if(th>=0) emit_zeroreg(th);
2201         }
2202       }
2203     }
2204   }
2205   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
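    // If either source may still be 64-bit (its was32 bit is clear) a 64-bit
    // compare is emitted, otherwise the cheaper 32-bit forms are used.
    // Comparisons against r0 are special-cased: SLT rx,r0 is just the sign
    // bit of rx, and SLTU rx,r0 is always 0.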
2206     if(rt1[i]) {
2207       signed char s1l,s1h,s2l,s2h,t;
2208       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2209       {
2210         t=get_reg(i_regs->regmap,rt1[i]);
2211         //assert(t>=0);
2212         if(t>=0) {
2213           s1l=get_reg(i_regs->regmap,rs1[i]);
2214           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2215           s2l=get_reg(i_regs->regmap,rs2[i]);
2216           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2217           if(rs2[i]==0) // rx<r0
2218           {
2219             assert(s1h>=0);
2220             if(opcode2[i]==0x2a) // SLT
2221               emit_shrimm(s1h,31,t);
2222             else // SLTU (unsigned can not be less than zero)
2223               emit_zeroreg(t);
2224           }
2225           else if(rs1[i]==0) // r0<rx
2226           {
2227             assert(s2h>=0);
2228             if(opcode2[i]==0x2a) // SLT
2229               emit_set_gz64_32(s2h,s2l,t);
2230             else // SLTU (set if not zero)
2231               emit_set_nz64_32(s2h,s2l,t);
2232           }
2233           else {
2234             assert(s1l>=0);assert(s1h>=0);
2235             assert(s2l>=0);assert(s2h>=0);
2236             if(opcode2[i]==0x2a) // SLT
2237               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2238             else // SLTU
2239               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2240           }
2241         }
2242       } else {
2243         t=get_reg(i_regs->regmap,rt1[i]);
2244         //assert(t>=0);
2245         if(t>=0) {
2246           s1l=get_reg(i_regs->regmap,rs1[i]);
2247           s2l=get_reg(i_regs->regmap,rs2[i]);
2248           if(rs2[i]==0) // rx<r0
2249           {
2250             assert(s1l>=0);
2251             if(opcode2[i]==0x2a) // SLT
2252               emit_shrimm(s1l,31,t);
2253             else // SLTU (unsigned can not be less than zero)
2254               emit_zeroreg(t);
2255           }
2256           else if(rs1[i]==0) // r0<rx
2257           {
2258             assert(s2l>=0);
2259             if(opcode2[i]==0x2a) // SLT
2260               emit_set_gz32(s2l,t);
2261             else // SLTU (set if not zero)
2262               emit_set_nz32(s2l,t);
2263           }
2264           else{
2265             assert(s1l>=0);assert(s2l>=0);
2266             if(opcode2[i]==0x2a) // SLT
2267               emit_set_if_less32(s1l,s2l,t);
2268             else // SLTU
2269               emit_set_if_carry32(s1l,s2l,t);
2270           }
2271         }
2272       }
2273     }
2274   }
2275   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2276     if(rt1[i]) {
2277       signed char s1l,s1h,s2l,s2h,th,tl;
2278       tl=get_reg(i_regs->regmap,rt1[i]);
2279       th=get_reg(i_regs->regmap,rt1[i]|64);
2280       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2281       {
2282         assert(tl>=0);
2283         if(tl>=0) {
2284           s1l=get_reg(i_regs->regmap,rs1[i]);
2285           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2286           s2l=get_reg(i_regs->regmap,rs2[i]);
2287           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2288           if(rs1[i]&&rs2[i]) {
2289             assert(s1l>=0);assert(s1h>=0);
2290             assert(s2l>=0);assert(s2h>=0);
2291             if(opcode2[i]==0x24) { // AND
2292               emit_and(s1l,s2l,tl);
2293               emit_and(s1h,s2h,th);
2294             } else
2295             if(opcode2[i]==0x25) { // OR
2296               emit_or(s1l,s2l,tl);
2297               emit_or(s1h,s2h,th);
2298             } else
2299             if(opcode2[i]==0x26) { // XOR
2300               emit_xor(s1l,s2l,tl);
2301               emit_xor(s1h,s2h,th);
2302             } else
2303             if(opcode2[i]==0x27) { // NOR
2304               emit_or(s1l,s2l,tl);
2305               emit_or(s1h,s2h,th);
2306               emit_not(tl,tl);
2307               emit_not(th,th);
2308             }
2309           }
2310           else
2311           {
2312             if(opcode2[i]==0x24) { // AND
2313               emit_zeroreg(tl);
2314               emit_zeroreg(th);
2315             } else
2316             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2317               if(rs1[i]){
2318                 if(s1l>=0) emit_mov(s1l,tl);
2319                 else emit_loadreg(rs1[i],tl);
2320                 if(s1h>=0) emit_mov(s1h,th);
2321                 else emit_loadreg(rs1[i]|64,th);
2322               }
2323               else
2324               if(rs2[i]){
2325                 if(s2l>=0) emit_mov(s2l,tl);
2326                 else emit_loadreg(rs2[i],tl);
2327                 if(s2h>=0) emit_mov(s2h,th);
2328                 else emit_loadreg(rs2[i]|64,th);
2329               }
2330               else{
2331                 emit_zeroreg(tl);
2332                 emit_zeroreg(th);
2333               }
2334             } else
2335             if(opcode2[i]==0x27) { // NOR
2336               if(rs1[i]){
2337                 if(s1l>=0) emit_not(s1l,tl);
2338                 else{
2339                   emit_loadreg(rs1[i],tl);
2340                   emit_not(tl,tl);
2341                 }
2342                 if(s1h>=0) emit_not(s1h,th);
2343                 else{
2344                   emit_loadreg(rs1[i]|64,th);
2345                   emit_not(th,th);
2346                 }
2347               }
2348               else
2349               if(rs2[i]){
2350                 if(s2l>=0) emit_not(s2l,tl);
2351                 else{
2352                   emit_loadreg(rs2[i],tl);
2353                   emit_not(tl,tl);
2354                 }
2355                 if(s2h>=0) emit_not(s2h,th);
2356                 else{
2357                   emit_loadreg(rs2[i]|64,th);
2358                   emit_not(th,th);
2359                 }
2360               }
2361               else {
2362                 emit_movimm(-1,tl);
2363                 emit_movimm(-1,th);
2364               }
2365             }
2366           }
2367         }
2368       }
2369       else
2370       {
2371         // 32 bit
2372         if(tl>=0) {
2373           s1l=get_reg(i_regs->regmap,rs1[i]);
2374           s2l=get_reg(i_regs->regmap,rs2[i]);
2375           if(rs1[i]&&rs2[i]) {
2376             assert(s1l>=0);
2377             assert(s2l>=0);
2378             if(opcode2[i]==0x24) { // AND
2379               emit_and(s1l,s2l,tl);
2380             } else
2381             if(opcode2[i]==0x25) { // OR
2382               emit_or(s1l,s2l,tl);
2383             } else
2384             if(opcode2[i]==0x26) { // XOR
2385               emit_xor(s1l,s2l,tl);
2386             } else
2387             if(opcode2[i]==0x27) { // NOR
2388               emit_or(s1l,s2l,tl);
2389               emit_not(tl,tl);
2390             }
2391           }
2392           else
2393           {
2394             if(opcode2[i]==0x24) { // AND
2395               emit_zeroreg(tl);
2396             } else
2397             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2398               if(rs1[i]){
2399                 if(s1l>=0) emit_mov(s1l,tl);
2400                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2401               }
2402               else
2403               if(rs2[i]){
2404                 if(s2l>=0) emit_mov(s2l,tl);
2405                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2406               }
2407               else emit_zeroreg(tl);
2408             } else
2409             if(opcode2[i]==0x27) { // NOR
2410               if(rs1[i]){
2411                 if(s1l>=0) emit_not(s1l,tl);
2412                 else {
2413                   emit_loadreg(rs1[i],tl);
2414                   emit_not(tl,tl);
2415                 }
2416               }
2417               else
2418               if(rs2[i]){
2419                 if(s2l>=0) emit_not(s2l,tl);
2420                 else {
2421                   emit_loadreg(rs2[i],tl);
2422                   emit_not(tl,tl);
2423                 }
2424               }
2425               else emit_movimm(-1,tl);
2426             }
2427           }
2428         }
2429       }
2430     }
2431   }
2432 }
2433
2434 void imm16_assemble(int i,struct regstat *i_regs)
2435 {
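  // Where the destination host register is already flagged const (isconst),
  // the value is presumably materialized by the constant loader elsewhere,
  // so the emit_movimm/emit_addimm below are skipped for it.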
2436   if (opcode[i]==0x0f) { // LUI
2437     if(rt1[i]) {
2438       signed char t;
2439       t=get_reg(i_regs->regmap,rt1[i]);
2440       //assert(t>=0);
2441       if(t>=0) {
2442         if(!((i_regs->isconst>>t)&1))
2443           emit_movimm(imm[i]<<16,t);
2444       }
2445     }
2446   }
2447   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2448     if(rt1[i]) {
2449       signed char s,t;
2450       t=get_reg(i_regs->regmap,rt1[i]);
2451       s=get_reg(i_regs->regmap,rs1[i]);
2452       if(rs1[i]) {
2453         //assert(t>=0);
2454         //assert(s>=0);
2455         if(t>=0) {
2456           if(!((i_regs->isconst>>t)&1)) {
2457             if(s<0) {
2458               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2459               emit_addimm(t,imm[i],t);
2460             }else{
2461               if(!((i_regs->wasconst>>s)&1))
2462                 emit_addimm(s,imm[i],t);
2463               else
2464                 emit_movimm(constmap[i][s]+imm[i],t);
2465             }
2466           }
2467         }
2468       } else {
2469         if(t>=0) {
2470           if(!((i_regs->isconst>>t)&1))
2471             emit_movimm(imm[i],t);
2472         }
2473       }
2474     }
2475   }
2476   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2477     if(rt1[i]) {
2478       signed char sh,sl,th,tl;
2479       th=get_reg(i_regs->regmap,rt1[i]|64);
2480       tl=get_reg(i_regs->regmap,rt1[i]);
2481       sh=get_reg(i_regs->regmap,rs1[i]|64);
2482       sl=get_reg(i_regs->regmap,rs1[i]);
2483       if(tl>=0) {
2484         if(rs1[i]) {
2485           assert(sh>=0);
2486           assert(sl>=0);
2487           if(th>=0) {
2488             emit_addimm64_32(sh,sl,imm[i],th,tl);
2489           }
2490           else {
2491             emit_addimm(sl,imm[i],tl);
2492           }
2493         } else {
2494           emit_movimm(imm[i],tl);
2495           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2496         }
2497       }
2498     }
2499   }
2500   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2501     if(rt1[i]) {
2502       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2503       signed char sh,sl,t;
2504       t=get_reg(i_regs->regmap,rt1[i]);
2505       sh=get_reg(i_regs->regmap,rs1[i]|64);
2506       sl=get_reg(i_regs->regmap,rs1[i]);
2507       //assert(t>=0);
2508       if(t>=0) {
2509         if(rs1[i]>0) {
2510           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2511           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2512             if(opcode[i]==0x0a) { // SLTI
2513               if(sl<0) {
2514                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2515                 emit_slti32(t,imm[i],t);
2516               }else{
2517                 emit_slti32(sl,imm[i],t);
2518               }
2519             }
2520             else { // SLTIU
2521               if(sl<0) {
2522                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2523                 emit_sltiu32(t,imm[i],t);
2524               }else{
2525                 emit_sltiu32(sl,imm[i],t);
2526               }
2527             }
2528           }else{ // 64-bit
2529             assert(sl>=0);
2530             if(opcode[i]==0x0a) // SLTI
2531               emit_slti64_32(sh,sl,imm[i],t);
2532             else // SLTIU
2533               emit_sltiu64_32(sh,sl,imm[i],t);
2534           }
2535         }else{
2536           // SLTI(U) with r0 as the source makes little sense,
2537           // but it does turn up in real code
2538           if(opcode[i]==0x0a) // SLTI
2539             if(0<imm[i]) emit_movimm(1,t);
2540             else emit_zeroreg(t);
2541           else // SLTIU
2542           {
2543             if(imm[i]) emit_movimm(1,t);
2544             else emit_zeroreg(t);
2545           }
2546         }
2547       }
2548     }
2549   }
2550   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2551     if(rt1[i]) {
2552       signed char sh,sl,th,tl;
2553       th=get_reg(i_regs->regmap,rt1[i]|64);
2554       tl=get_reg(i_regs->regmap,rt1[i]);
2555       sh=get_reg(i_regs->regmap,rs1[i]|64);
2556       sl=get_reg(i_regs->regmap,rs1[i]);
2557       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2558         if(opcode[i]==0x0c) //ANDI
2559         {
2560           if(rs1[i]) {
2561             if(sl<0) {
2562               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2563               emit_andimm(tl,imm[i],tl);
2564             }else{
2565               if(!((i_regs->wasconst>>sl)&1))
2566                 emit_andimm(sl,imm[i],tl);
2567               else
2568                 emit_movimm(constmap[i][sl]&imm[i],tl);
2569             }
2570           }
2571           else
2572             emit_zeroreg(tl);
2573           if(th>=0) emit_zeroreg(th);
2574         }
2575         else
2576         {
2577           if(rs1[i]) {
2578             if(sl<0) {
2579               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2580             }
2581             if(th>=0) {
2582               if(sh<0) {
2583                 emit_loadreg(rs1[i]|64,th);
2584               }else{
2585                 emit_mov(sh,th);
2586               }
2587             }
2588             if(opcode[i]==0x0d) //ORI
2589             if(sl<0) {
2590               emit_orimm(tl,imm[i],tl);
2591             }else{
2592               if(!((i_regs->wasconst>>sl)&1))
2593                 emit_orimm(sl,imm[i],tl);
2594               else
2595                 emit_movimm(constmap[i][sl]|imm[i],tl);
2596             }
2597             if(opcode[i]==0x0e) //XORI
2598             if(sl<0) {
2599               emit_xorimm(tl,imm[i],tl);
2600             }else{
2601               if(!((i_regs->wasconst>>sl)&1))
2602                 emit_xorimm(sl,imm[i],tl);
2603               else
2604                 emit_movimm(constmap[i][sl]^imm[i],tl);
2605             }
2606           }
2607           else {
2608             emit_movimm(imm[i],tl);
2609             if(th>=0) emit_zeroreg(th);
2610           }
2611         }
2612       }
2613     }
2614   }
2615 }
2616
2617 void shiftimm_assemble(int i,struct regstat *i_regs)
2618 {
2619   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2620   {
2621     if(rt1[i]) {
2622       signed char s,t;
2623       t=get_reg(i_regs->regmap,rt1[i]);
2624       s=get_reg(i_regs->regmap,rs1[i]);
2625       //assert(t>=0);
2626       if(t>=0){
2627         if(rs1[i]==0)
2628         {
2629           emit_zeroreg(t);
2630         }
2631         else
2632         {
2633           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2634           if(imm[i]) {
2635             if(opcode2[i]==0) // SLL
2636             {
2637               emit_shlimm(s<0?t:s,imm[i],t);
2638             }
2639             if(opcode2[i]==2) // SRL
2640             {
2641               emit_shrimm(s<0?t:s,imm[i],t);
2642             }
2643             if(opcode2[i]==3) // SRA
2644             {
2645               emit_sarimm(s<0?t:s,imm[i],t);
2646             }
2647           }else{
2648             // Shift by zero
2649             if(s>=0 && s!=t) emit_mov(s,t);
2650           }
2651         }
2652       }
2653       //emit_storereg(rt1[i],t); //DEBUG
2654     }
2655   }
2656   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2657   {
2658     if(rt1[i]) {
2659       signed char sh,sl,th,tl;
2660       th=get_reg(i_regs->regmap,rt1[i]|64);
2661       tl=get_reg(i_regs->regmap,rt1[i]);
2662       sh=get_reg(i_regs->regmap,rs1[i]|64);
2663       sl=get_reg(i_regs->regmap,rs1[i]);
2664       if(tl>=0) {
2665         if(rs1[i]==0)
2666         {
2667           emit_zeroreg(tl);
2668           if(th>=0) emit_zeroreg(th);
2669         }
2670         else
2671         {
2672           assert(sl>=0);
2673           assert(sh>=0);
2674           if(imm[i]) {
2675             if(opcode2[i]==0x38) // DSLL
2676             {
2677               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2678               emit_shlimm(sl,imm[i],tl);
2679             }
2680             if(opcode2[i]==0x3a) // DSRL
2681             {
2682               emit_shrdimm(sl,sh,imm[i],tl);
2683               if(th>=0) emit_shrimm(sh,imm[i],th);
2684             }
2685             if(opcode2[i]==0x3b) // DSRA
2686             {
2687               emit_shrdimm(sl,sh,imm[i],tl);
2688               if(th>=0) emit_sarimm(sh,imm[i],th);
2689             }
2690           }else{
2691             // Shift by zero
2692             if(sl!=tl) emit_mov(sl,tl);
2693             if(th>=0&&sh!=th) emit_mov(sh,th);
2694           }
2695         }
2696       }
2697     }
2698   }
2699   if(opcode2[i]==0x3c) // DSLL32
2700   {
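    // DSLL32 shifts left by 32 or more: the low source word becomes the high
    // result word, the low result word is zeroed, and any excess over 32 is
    // shifted within the high word.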
2701     if(rt1[i]) {
2702       signed char sl,tl,th;
2703       tl=get_reg(i_regs->regmap,rt1[i]);
2704       th=get_reg(i_regs->regmap,rt1[i]|64);
2705       sl=get_reg(i_regs->regmap,rs1[i]);
2706       if(th>=0||tl>=0){
2707         assert(tl>=0);
2708         assert(th>=0);
2709         assert(sl>=0);
2710         emit_mov(sl,th);
2711         emit_zeroreg(tl);
2712         if(imm[i]>32)
2713         {
2714           emit_shlimm(th,imm[i]&31,th);
2715         }
2716       }
2717     }
2718   }
2719   if(opcode2[i]==0x3e) // DSRL32
2720   {
2721     if(rt1[i]) {
2722       signed char sh,tl,th;
2723       tl=get_reg(i_regs->regmap,rt1[i]);
2724       th=get_reg(i_regs->regmap,rt1[i]|64);
2725       sh=get_reg(i_regs->regmap,rs1[i]|64);
2726       if(tl>=0){
2727         assert(sh>=0);
2728         emit_mov(sh,tl);
2729         if(th>=0) emit_zeroreg(th);
2730         if(imm[i]>32)
2731         {
2732           emit_shrimm(tl,imm[i]&31,tl);
2733         }
2734       }
2735     }
2736   }
2737   if(opcode2[i]==0x3f) // DSRA32
2738   {
2739     if(rt1[i]) {
2740       signed char sh,tl;
2741       tl=get_reg(i_regs->regmap,rt1[i]);
2742       sh=get_reg(i_regs->regmap,rs1[i]|64);
2743       if(tl>=0){
2744         assert(sh>=0);
2745         emit_mov(sh,tl);
2746         if(imm[i]>32)
2747         {
2748           emit_sarimm(tl,imm[i]&31,tl);
2749         }
2750       }
2751     }
2752   }
2753 }
2754
2755 #ifndef shift_assemble
2756 void shift_assemble(int i,struct regstat *i_regs)
2757 {
2758   printf("Need shift_assemble for this architecture.\n");
2759   exit(1);
2760 }
2761 #endif
2762
2763 void load_assemble(int i,struct regstat *i_regs)
2764 {
2765   int s,th,tl,addr,map=-1;
2766   int offset;
2767   int jaddr=0;
2768   int memtarget=0,c=0;
2769   u_int hr,reglist=0;
2770   th=get_reg(i_regs->regmap,rt1[i]|64);
2771   tl=get_reg(i_regs->regmap,rt1[i]);
2772   s=get_reg(i_regs->regmap,rs1[i]);
2773   offset=imm[i];
2774   for(hr=0;hr<HOST_REGS;hr++) {
2775     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2776   }
2777   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2778   if(s>=0) {
2779     c=(i_regs->wasconst>>s)&1;
2780     if (c) {
2781       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2782       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2783     }
2784   }
2785   //printf("load_assemble: c=%d\n",c);
2786   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2787   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2788 #ifdef PCSX
2789   if(tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80)
2790     ||rt1[i]==0) {
2791       // may be an I/O register (e.g. a FIFO) with read side effects, so the read must still be performed
2792       // ||dummy read (load to r0, result discarded)
2793       assem_debug("(forced read)\n");
2794       tl=get_reg(i_regs->regmap,-1);
2795       assert(tl>=0);
2796   }
2797 #endif
2798   if(offset||s<0||c) addr=tl;
2799   else addr=s;
2800   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2801  if(tl>=0) {
2802   //printf("load_assemble: c=%d\n",c);
2803   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2804   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2805   reglist&=~(1<<tl);
2806   if(th>=0) reglist&=~(1<<th);
2807   if(!using_tlb) {
2808     if(!c) {
2809       #ifdef RAM_OFFSET
2810       map=get_reg(i_regs->regmap,ROREG);
2811       if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2812       #endif
2813 //#define R29_HACK 1
2814       #ifdef R29_HACK
2815       // Strmnnrmn's speed hack
2816       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2817       #endif
2818       {
2819         #ifdef PCSX
2820         if(sp_in_mirror&&rs1[i]==29) {
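          // sp points into a mirrored view of RAM: clear the mirror bits so
          // the masked address can be range-checked against RAM_SIZE like a
          // normal in-RAM access.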
2821           emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
2822           emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
2823         }
2824         else
2825         #endif
2826         emit_cmpimm(addr,RAM_SIZE);
2827         jaddr=(int)out;
2828         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2829         // Hint to branch predictor that the branch is unlikely to be taken
2830         if(rs1[i]>=28)
2831           emit_jno_unlikely(0);
2832         else
2833         #endif
2834         emit_jno(0);
2835       }
2836     }
2837   }else{ // using tlb
2838     int x=0;
2839     if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2840     if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2841     map=get_reg(i_regs->regmap,TLREG);
2842     assert(map>=0);
2843     reglist&=~(1<<map);
2844     map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2845     do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2846   }
2847   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2848   if (opcode[i]==0x20) { // LB
2849     if(!c||memtarget) {
2850       if(!dummy) {
2851         #ifdef HOST_IMM_ADDR32
2852         if(c)
2853           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2854         else
2855         #endif
2856         {
2857           //emit_xorimm(addr,3,tl);
2858           //gen_tlb_addr_r(tl,map);
2859           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2860           int x=0,a=tl;
2861 #ifdef BIG_ENDIAN_MIPS
2862           if(!c) emit_xorimm(addr,3,tl);
2863           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2864 #else
2865           if(!c) a=addr;
2866 #endif
2867 #ifdef PCSX
2868           if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2869 #endif
2870           emit_movsbl_indexed_tlb(x,a,map,tl);
2871         }
2872       }
2873       if(jaddr)
2874         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2875     }
2876     else
2877       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2878   }
2879   if (opcode[i]==0x21) { // LH
2880     if(!c||memtarget) {
2881       if(!dummy) {
2882         #ifdef HOST_IMM_ADDR32
2883         if(c)
2884           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2885         else
2886         #endif
2887         {
2888           int x=0,a=tl;
2889 #ifdef BIG_ENDIAN_MIPS
2890           if(!c) emit_xorimm(addr,2,tl);
2891           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2892 #else
2893           if(!c) a=addr;
2894 #endif
2895 #ifdef PCSX
2896           if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2897 #endif
2898           //#ifdef
2899           //emit_movswl_indexed_tlb(x,tl,map,tl);
2900           //else
2901           if(map>=0) {
2902             gen_tlb_addr_r(a,map);
2903             emit_movswl_indexed(x,a,tl);
2904           }else{
2905             #ifdef RAM_OFFSET
2906             emit_movswl_indexed(x,a,tl);
2907             #else
2908             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2909             #endif
2910           }
2911         }
2912       }
2913       if(jaddr)
2914         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2915     }
2916     else
2917       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2918   }
2919   if (opcode[i]==0x23) { // LW
2920     if(!c||memtarget) {
2921       if(!dummy) {
2922         int a=addr;
2923 #ifdef PCSX
2924         if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2925 #endif
2926         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2927         #ifdef HOST_IMM_ADDR32
2928         if(c)
2929           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2930         else
2931         #endif
2932         emit_readword_indexed_tlb(0,a,map,tl);
2933       }
2934       if(jaddr)
2935         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2936     }
2937     else
2938       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2939   }
2940   if (opcode[i]==0x24) { // LBU
2941     if(!c||memtarget) {
2942       if(!dummy) {
2943         #ifdef HOST_IMM_ADDR32
2944         if(c)
2945           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2946         else
2947         #endif
2948         {
2949           //emit_xorimm(addr,3,tl);
2950           //gen_tlb_addr_r(tl,map);
2951           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2952           int x=0,a=tl;
2953 #ifdef BIG_ENDIAN_MIPS
2954           if(!c) emit_xorimm(addr,3,tl);
2955           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2956 #else
2957           if(!c) a=addr;
2958 #endif
2959 #ifdef PCSX
2960           if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2961 #endif
2962           emit_movzbl_indexed_tlb(x,a,map,tl);
2963         }
2964       }
2965       if(jaddr)
2966         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2967     }
2968     else
2969       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2970   }
2971   if (opcode[i]==0x25) { // LHU
2972     if(!c||memtarget) {
2973       if(!dummy) {
2974         #ifdef HOST_IMM_ADDR32
2975         if(c)
2976           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2977         else
2978         #endif
2979         {
2980           int x=0,a=tl;
2981 #ifdef BIG_ENDIAN_MIPS
2982           if(!c) emit_xorimm(addr,2,tl);
2983           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2984 #else
2985           if(!c) a=addr;
2986 #endif
2987 #ifdef PCSX
2988           if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2989 #endif
2990           //#ifdef
2991           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2992           //#else
2993           if(map>=0) {
2994             gen_tlb_addr_r(a,map);
2995             emit_movzwl_indexed(x,a,tl);
2996           }else{
2997             #ifdef RAM_OFFSET
2998             emit_movzwl_indexed(x,a,tl);
2999             #else
3000             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
3001             #endif
3002           }
3003         }
3004       }
3005       if(jaddr)
3006         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3007     }
3008     else
3009       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3010   }
3011   if (opcode[i]==0x27) { // LWU
3012     assert(th>=0);
3013     if(!c||memtarget) {
3014       if(!dummy) {
3015         int a=addr;
3016 #ifdef PCSX
3017         if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3018 #endif
3019         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
3020         #ifdef HOST_IMM_ADDR32
3021         if(c)
3022           emit_readword_tlb(constmap[i][s]+offset,map,tl);
3023         else
3024         #endif
3025         emit_readword_indexed_tlb(0,a,map,tl);
3026       }
3027       if(jaddr)
3028         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3029     }
3030     else {
3031       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3032     }
3033     emit_zeroreg(th);
3034   }
3035   if (opcode[i]==0x37) { // LD
3036     if(!c||memtarget) {
3037       if(!dummy) {
3038         int a=addr;
3039 #ifdef PCSX
3040         if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3041 #endif
3042         //gen_tlb_addr_r(tl,map);
3043         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
3044         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
3045         #ifdef HOST_IMM_ADDR32
3046         if(c)
3047           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3048         else
3049         #endif
3050         emit_readdword_indexed_tlb(0,a,map,th,tl);
3051       }
3052       if(jaddr)
3053         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3054     }
3055     else
3056       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3057   }
3058  }
3059   //emit_storereg(rt1[i],tl); // DEBUG
3060   //if(opcode[i]==0x23)
3061   //if(opcode[i]==0x24)
3062   //if(opcode[i]==0x23||opcode[i]==0x24)
3063   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
3064   {
3065     //emit_pusha();
3066     save_regs(0x100f);
3067         emit_readword((int)&last_count,ECX);
3068         #ifdef __i386__
3069         if(get_reg(i_regs->regmap,CCREG)<0)
3070           emit_loadreg(CCREG,HOST_CCREG);
3071         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3072         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3073         emit_writeword(HOST_CCREG,(int)&Count);
3074         #endif
3075         #ifdef __arm__
3076         if(get_reg(i_regs->regmap,CCREG)<0)
3077           emit_loadreg(CCREG,0);
3078         else
3079           emit_mov(HOST_CCREG,0);
3080         emit_add(0,ECX,0);
3081         emit_addimm(0,2*ccadj[i],0);
3082         emit_writeword(0,(int)&Count);
3083         #endif
3084     emit_call((int)memdebug);
3085     //emit_popa();
3086     restore_regs(0x100f);
3087   }/**/
3088 }
3089
3090 #ifndef loadlr_assemble
3091 void loadlr_assemble(int i,struct regstat *i_regs)
3092 {
3093   printf("Need loadlr_assemble for this architecture.\n");
3094   exit(1);
3095 }
3096 #endif
3097
3098 void store_assemble(int i,struct regstat *i_regs)
3099 {
3100   int s,th,tl,map=-1;
3101   int addr,temp;
3102   int offset;
3103   int jaddr=0,jaddr2,type;
3104   int memtarget=0,c=0;
3105   int agr=AGEN1+(i&1);
3106   u_int hr,reglist=0;
3107   th=get_reg(i_regs->regmap,rs2[i]|64);
3108   tl=get_reg(i_regs->regmap,rs2[i]);
3109   s=get_reg(i_regs->regmap,rs1[i]);
3110   temp=get_reg(i_regs->regmap,agr);
3111   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3112   offset=imm[i];
3113   if(s>=0) {
3114     c=(i_regs->wasconst>>s)&1;
3115     if(c) {
3116       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3117       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3118     }
3119   }
3120   assert(tl>=0);
3121   assert(temp>=0);
3122   for(hr=0;hr<HOST_REGS;hr++) {
3123     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3124   }
3125   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3126   if(offset||s<0||c) addr=temp;
3127   else addr=s;
3128   if(!using_tlb) {
3129     if(!c) {
3130       #ifdef PCSX
3131       if(sp_in_mirror&&rs1[i]==29) {
3132         emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
3133         emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
3134       }
3135       else
3136       #endif
3137       #ifdef R29_HACK
3138       // Strmnnrmn's speed hack
3139       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3140       #endif
3141       emit_cmpimm(addr,RAM_SIZE);
3142       #ifdef DESTRUCTIVE_SHIFT
3143       if(s==addr) emit_mov(s,temp);
3144       #endif
3145       #ifdef R29_HACK
3146       memtarget=1;
3147       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3148       #endif
3149       {
3150         jaddr=(int)out;
3151         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
3152         // Hint to branch predictor that the branch is unlikely to be taken
3153         if(rs1[i]>=28)
3154           emit_jno_unlikely(0);
3155         else
3156         #endif
3157         emit_jno(0);
3158       }
3159     }
3160   }else{ // using tlb
3161     int x=0;
3162     if (opcode[i]==0x28) x=3; // SB
3163     if (opcode[i]==0x29) x=2; // SH
3164     map=get_reg(i_regs->regmap,TLREG);
3165     assert(map>=0);
3166     reglist&=~(1<<map);
3167     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
3168     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3169   }
3170
3171   if (opcode[i]==0x28) { // SB
3172     if(!c||memtarget) {
3173       int x=0,a=temp;
3174 #ifdef BIG_ENDIAN_MIPS
3175       if(!c) emit_xorimm(addr,3,temp);
3176       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3177 #else
3178       if(!c) a=addr;
3179 #endif
3180 #ifdef PCSX
3181       if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3182 #endif
3183       //gen_tlb_addr_w(temp,map);
3184       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3185       emit_writebyte_indexed_tlb(tl,x,a,map,a);
3186     }
3187     type=STOREB_STUB;
3188   }
3189   if (opcode[i]==0x29) { // SH
3190     if(!c||memtarget) {
3191       int x=0,a=temp;
3192 #ifdef BIG_ENDIAN_MIPS
3193       if(!c) emit_xorimm(addr,2,temp);
3194       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3195 #else
3196       if(!c) a=addr;
3197 #endif
3198 #ifdef PCSX
3199       if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3200 #endif
3201       //#ifdef
3202       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3203       //#else
3204       if(map>=0) {
3205         gen_tlb_addr_w(a,map);
3206         emit_writehword_indexed(tl,x,a);
3207       }else
3208         emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
3209     }
3210     type=STOREH_STUB;
3211   }
3212   if (opcode[i]==0x2B) { // SW
3213     if(!c||memtarget) {
3214       int a=addr;
3215 #ifdef PCSX
3216       if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3217 #endif
3218       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3219       emit_writeword_indexed_tlb(tl,0,a,map,temp);
3220     }
3221     type=STOREW_STUB;
3222   }
3223   if (opcode[i]==0x3F) { // SD
3224     if(!c||memtarget) {
3225       int a=addr;
3226 #ifdef PCSX
3227       if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3228 #endif
3229       if(rs2[i]) {
3230         assert(th>=0);
3231         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3232         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3233         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
3234       }else{
3235         // Store zero
3236         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3237         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3238         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
3239       }
3240     }
3241     type=STORED_STUB;
3242   }
3243   if(!using_tlb) {
3244     if(!c||memtarget) {
3245       #ifdef DESTRUCTIVE_SHIFT
3246       // The x86 shift operation is 'destructive'; it overwrites the
3247       // source register, so we need to make a copy first and use that.
3248       addr=temp;
3249       #endif
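      // Self-modifying code check: invalid_code[] is indexed by the store
      // address >> 12; if that page still holds compiled code, the branch or
      // call below goes to the invalidation path (INVCODE stub or
      // invalidate_addr_reg) to discard the stale translation.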
3250       #if defined(HOST_IMM8)
3251       int ir=get_reg(i_regs->regmap,INVCP);
3252       assert(ir>=0);
3253       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3254       #else
3255       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3256       #endif
3257       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3258       emit_callne(invalidate_addr_reg[addr]);
3259       #else
3260       jaddr2=(int)out;
3261       emit_jne(0);
3262       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3263       #endif
3264     }
3265   }
3266   if(jaddr) {
3267     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3268   } else if(c&&!memtarget) {
3269     inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
3270   }
3271   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3272   //if(opcode[i]==0x2B || opcode[i]==0x28)
3273   //if(opcode[i]==0x2B || opcode[i]==0x29)
3274   //if(opcode[i]==0x2B)
3275   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3276   {
3277     #ifdef __i386__
3278     emit_pusha();
3279     #endif
3280     #ifdef __arm__
3281     save_regs(0x100f);
3282     #endif
3283         emit_readword((int)&last_count,ECX);
3284         #ifdef __i386__
3285         if(get_reg(i_regs->regmap,CCREG)<0)
3286           emit_loadreg(CCREG,HOST_CCREG);
3287         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3288         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3289         emit_writeword(HOST_CCREG,(int)&Count);
3290         #endif
3291         #ifdef __arm__
3292         if(get_reg(i_regs->regmap,CCREG)<0)
3293           emit_loadreg(CCREG,0);
3294         else
3295           emit_mov(HOST_CCREG,0);
3296         emit_add(0,ECX,0);
3297         emit_addimm(0,2*ccadj[i],0);
3298         emit_writeword(0,(int)&Count);
3299         #endif
3300     emit_call((int)memdebug);
3301     #ifdef __i386__
3302     emit_popa();
3303     #endif
3304     #ifdef __arm__
3305     restore_regs(0x100f);
3306     #endif
3307   }/**/
3308 }
3309
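// Unaligned stores (SWL/SWR/SDL/SDR): after the usual RAM/TLB check, the low
// two address bits select one of four cases; each case writes only the byte
// lanes the instruction touches, using rotates to line the register bytes up.
// For SDL/SDR a second aligned word may be written afterwards, selected by
// the temp&4 test at the end.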
3310 void storelr_assemble(int i,struct regstat *i_regs)
3311 {
3312   int s,th,tl;
3313   int temp;
3314   int temp2;
3315   int offset;
3316   int jaddr=0,jaddr2;
3317   int case1,case2,case3;
3318   int done0,done1,done2;
3319   int memtarget=0,c=0;
3320   int agr=AGEN1+(i&1);
3321   u_int hr,reglist=0;
3322   th=get_reg(i_regs->regmap,rs2[i]|64);
3323   tl=get_reg(i_regs->regmap,rs2[i]);
3324   s=get_reg(i_regs->regmap,rs1[i]);
3325   temp=get_reg(i_regs->regmap,agr);
3326   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3327   offset=imm[i];
3328   if(s>=0) {
3329     c=(i_regs->isconst>>s)&1;
3330     if(c) {
3331       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3332       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3333     }
3334   }
3335   assert(tl>=0);
3336   for(hr=0;hr<HOST_REGS;hr++) {
3337     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3338   }
3339   assert(temp>=0);
3340   if(!using_tlb) {
3341     if(!c) {
3342       emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3343       if(!offset&&s!=temp) emit_mov(s,temp);
3344       jaddr=(int)out;
3345       emit_jno(0);
3346     }
3347     else
3348     {
3349       if(!memtarget||!rs1[i]) {
3350         jaddr=(int)out;
3351         emit_jmp(0);
3352       }
3353     }
3354     #ifdef RAM_OFFSET
3355     int map=get_reg(i_regs->regmap,ROREG);
3356     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3357     gen_tlb_addr_w(temp,map);
3358     #else
3359     if((u_int)rdram!=0x80000000) 
3360       emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3361     #endif
3362   }else{ // using tlb
3363     int map=get_reg(i_regs->regmap,TLREG);
3364     assert(map>=0);
3365     reglist&=~(1<<map);
3366     map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3367     if(!c&&!offset&&s>=0) emit_mov(s,temp);
3368     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3369     if(!jaddr&&!memtarget) {
3370       jaddr=(int)out;
3371       emit_jmp(0);
3372     }
3373     gen_tlb_addr_w(temp,map);
3374   }
3375
3376   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3377     temp2=get_reg(i_regs->regmap,FTEMP);
3378     if(!rs2[i]) temp2=th=tl;
3379   }
3380
3381 #ifndef BIG_ENDIAN_MIPS
3382     emit_xorimm(temp,3,temp);
3383 #endif
3384   emit_testimm(temp,2);
3385   case2=(int)out;
3386   emit_jne(0);
3387   emit_testimm(temp,1);
3388   case1=(int)out;
3389   emit_jne(0);
3390   // 0
3391   if (opcode[i]==0x2A) { // SWL
3392     emit_writeword_indexed(tl,0,temp);
3393   }
3394   if (opcode[i]==0x2E) { // SWR
3395     emit_writebyte_indexed(tl,3,temp);
3396   }
3397   if (opcode[i]==0x2C) { // SDL
3398     emit_writeword_indexed(th,0,temp);
3399     if(rs2[i]) emit_mov(tl,temp2);
3400   }
3401   if (opcode[i]==0x2D) { // SDR
3402     emit_writebyte_indexed(tl,3,temp);
3403     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3404   }
3405   done0=(int)out;
3406   emit_jmp(0);
3407   // 1
3408   set_jump_target(case1,(int)out);
3409   if (opcode[i]==0x2A) { // SWL
3410     // Write 3 msb into three least significant bytes
3411     if(rs2[i]) emit_rorimm(tl,8,tl);
3412     emit_writehword_indexed(tl,-1,temp);
3413     if(rs2[i]) emit_rorimm(tl,16,tl);
3414     emit_writebyte_indexed(tl,1,temp);
3415     if(rs2[i]) emit_rorimm(tl,8,tl);
3416   }
3417   if (opcode[i]==0x2E) { // SWR
3418     // Write two lsb into two most significant bytes
3419     emit_writehword_indexed(tl,1,temp);
3420   }
3421   if (opcode[i]==0x2C) { // SDL
3422     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3423     // Write three msb into three least significant bytes
3424     if(rs2[i]) emit_rorimm(th,8,th);
3425     emit_writehword_indexed(th,-1,temp);
3426     if(rs2[i]) emit_rorimm(th,16,th);
3427     emit_writebyte_indexed(th,1,temp);
3428     if(rs2[i]) emit_rorimm(th,8,th);
3429   }
3430   if (opcode[i]==0x2D) { // SDR
3431     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3432     // Write two lsb into two most significant bytes
3433     emit_writehword_indexed(tl,1,temp);
3434   }
3435   done1=(int)out;
3436   emit_jmp(0);
3437   // 2
3438   set_jump_target(case2,(int)out);
3439   emit_testimm(temp,1);
3440   case3=(int)out;
3441   emit_jne(0);
3442   if (opcode[i]==0x2A) { // SWL
3443     // Write two msb into two least significant bytes
3444     if(rs2[i]) emit_rorimm(tl,16,tl);
3445     emit_writehword_indexed(tl,-2,temp);
3446     if(rs2[i]) emit_rorimm(tl,16,tl);
3447   }
3448   if (opcode[i]==0x2E) { // SWR
3449     // Write three lsb into three most significant bytes
3450     emit_writebyte_indexed(tl,-1,temp);
3451     if(rs2[i]) emit_rorimm(tl,8,tl);
3452     emit_writehword_indexed(tl,0,temp);
3453     if(rs2[i]) emit_rorimm(tl,24,tl);
3454   }
3455   if (opcode[i]==0x2C) { // SDL
3456     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3457     // Write two msb into two least significant bytes
3458     if(rs2[i]) emit_rorimm(th,16,th);
3459     emit_writehword_indexed(th,-2,temp);
3460     if(rs2[i]) emit_rorimm(th,16,th);
3461   }
3462   if (opcode[i]==0x2D) { // SDR
3463     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3464     // Write three lsb into three most significant bytes
3465     emit_writebyte_indexed(tl,-1,temp);
3466     if(rs2[i]) emit_rorimm(tl,8,tl);
3467     emit_writehword_indexed(tl,0,temp);
3468     if(rs2[i]) emit_rorimm(tl,24,tl);
3469   }
3470   done2=(int)out;
3471   emit_jmp(0);
3472   // 3
3473   set_jump_target(case3,(int)out);
3474   if (opcode[i]==0x2A) { // SWL
3475     // Write msb into least significant byte
3476     if(rs2[i]) emit_rorimm(tl,24,tl);
3477     emit_writebyte_indexed(tl,-3,temp);
3478     if(rs2[i]) emit_rorimm(tl,8,tl);
3479   }
3480   if (opcode[i]==0x2E) { // SWR
3481     // Write entire word
3482     emit_writeword_indexed(tl,-3,temp);
3483   }
3484   if (opcode[i]==0x2C) { // SDL
3485     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3486     // Write msb into least significant byte
3487     if(rs2[i]) emit_rorimm(th,24,th);
3488     emit_writebyte_indexed(th,-3,temp);
3489     if(rs2[i]) emit_rorimm(th,8,th);
3490   }
3491   if (opcode[i]==0x2D) { // SDR
3492     if(rs2[i]) emit_mov(th,temp2);
3493     // Write entire word
3494     emit_writeword_indexed(tl,-3,temp);
3495   }
3496   set_jump_target(done0,(int)out);
3497   set_jump_target(done1,(int)out);
3498   set_jump_target(done2,(int)out);
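  // SDL/SDR cover a doubleword: the other 32 bits were assembled into temp2
  // above, and bit 2 of the address says whether that word lies inside the
  // access, hence the testimm(temp,4) guards around the extra store below.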
3499   if (opcode[i]==0x2C) { // SDL
3500     emit_testimm(temp,4);
3501     done0=(int)out;
3502     emit_jne(0);
3503     emit_andimm(temp,~3,temp);
3504     emit_writeword_indexed(temp2,4,temp);
3505     set_jump_target(done0,(int)out);
3506   }
3507   if (opcode[i]==0x2D) { // SDR
3508     emit_testimm(temp,4);
3509     done0=(int)out;
3510     emit_jeq(0);
3511     emit_andimm(temp,~3,temp);
3512     emit_writeword_indexed(temp2,-4,temp);
3513     set_jump_target(done0,(int)out);
3514   }
3515   if(!c||!memtarget)
3516     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3517   if(!using_tlb) {
3518     #ifdef RAM_OFFSET
3519     int map=get_reg(i_regs->regmap,ROREG);
3520     if(map<0) map=HOST_TEMPREG;
3521     gen_orig_addr_w(temp,map);
3522     #else
3523     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3524     #endif
3525     #if defined(HOST_IMM8)
3526     int ir=get_reg(i_regs->regmap,INVCP);
3527     assert(ir>=0);
3528     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3529     #else
3530     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3531     #endif
3532     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3533     emit_callne(invalidate_addr_reg[temp]);
3534     #else
3535     jaddr2=(int)out;
3536     emit_jne(0);
3537     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3538     #endif
3539   }
3540   /*
3541     emit_pusha();
3542     //save_regs(0x100f);
3543         emit_readword((int)&last_count,ECX);
3544         if(get_reg(i_regs->regmap,CCREG)<0)
3545           emit_loadreg(CCREG,HOST_CCREG);
3546         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3547         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3548         emit_writeword(HOST_CCREG,(int)&Count);
3549     emit_call((int)memdebug);
3550     emit_popa();
3551     //restore_regs(0x100f);
3552   /**/
3553 }
3554
3555 void c1ls_assemble(int i,struct regstat *i_regs)
3556 {
3557 #ifndef DISABLE_COP1
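  // FPU loads/stores (LWC1/LDC1/SWC1/SDC1).  Data moves through FTEMP
  // (tl/th) and the pointer to the FPR comes from reg_cop1_simple/
  // reg_cop1_double; a test on the COP1-enable bit (0x20000000) of the
  // status word is emitted first so an FP_STUB can raise the
  // coprocessor-unusable exception when needed.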
3558   int s,th,tl;
3559   int temp,ar;
3560   int map=-1;
3561   int offset;
3562   int c=0;
3563   int jaddr,jaddr2=0,jaddr3,type;
3564   int agr=AGEN1+(i&1);
3565   u_int hr,reglist=0;
3566   th=get_reg(i_regs->regmap,FTEMP|64);
3567   tl=get_reg(i_regs->regmap,FTEMP);
3568   s=get_reg(i_regs->regmap,rs1[i]);
3569   temp=get_reg(i_regs->regmap,agr);
3570   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3571   offset=imm[i];
3572   assert(tl>=0);
3573   assert(rs1[i]>0);
3574   assert(temp>=0);
3575   for(hr=0;hr<HOST_REGS;hr++) {
3576     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3577   }
3578   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3579   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3580   {
3581     // Loads use a temporary register which we need to save
3582     reglist|=1<<temp;
3583   }
3584   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3585     ar=temp;
3586   else // LWC1/LDC1
3587     ar=tl;
3588   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3589   //else c=(i_regs->wasconst>>s)&1;
3590   if(s>=0) c=(i_regs->wasconst>>s)&1;
3591   // Check cop1 unusable
3592   if(!cop1_usable) {
3593     signed char rs=get_reg(i_regs->regmap,CSREG);
3594     assert(rs>=0);
3595     emit_testimm(rs,0x20000000);
3596     jaddr=(int)out;
3597     emit_jeq(0);
3598     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3599     cop1_usable=1;
3600   }
3601   if (opcode[i]==0x39) { // SWC1 (get float address)
3602     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3603   }
3604   if (opcode[i]==0x3D) { // SDC1 (get double address)
3605     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3606   }
3607   // Generate address + offset
3608   if(!using_tlb) {
3609     if(!c)
3610       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3611   }
3612   else
3613   {
3614     map=get_reg(i_regs->regmap,TLREG);
3615     assert(map>=0);
3616     reglist&=~(1<<map);
3617     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3618       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3619     }
3620     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3621       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3622     }
3623   }
3624   if (opcode[i]==0x39) { // SWC1 (read float)
3625     emit_readword_indexed(0,tl,tl);
3626   }
3627   if (opcode[i]==0x3D) { // SDC1 (read double)
3628     emit_readword_indexed(4,tl,th);
3629     emit_readword_indexed(0,tl,tl);
3630   }
3631   if (opcode[i]==0x31) { // LWC1 (get target address)
3632     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3633   }
3634   if (opcode[i]==0x35) { // LDC1 (get target address)
3635     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3636   }
3637   if(!using_tlb) {
3638     if(!c) {
3639       jaddr2=(int)out;
3640       emit_jno(0);
3641     }
3642     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80000000+RAM_SIZE) {
3643       jaddr2=(int)out;
3644       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3645     }
3646     #ifdef DESTRUCTIVE_SHIFT
3647     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3648       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3649     }
3650     #endif
3651   }else{
3652     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3653       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3654     }
3655     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3656       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3657     }
3658   }
3659   if (opcode[i]==0x31) { // LWC1
3660     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3661     //gen_tlb_addr_r(ar,map);
3662     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3663     #ifdef HOST_IMM_ADDR32
3664     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3665     else
3666     #endif
3667     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3668     type=LOADW_STUB;
3669   }
3670   if (opcode[i]==0x35) { // LDC1
3671     assert(th>=0);
3672     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3673     //gen_tlb_addr_r(ar,map);
3674     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3675     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3676     #ifdef HOST_IMM_ADDR32
3677     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3678     else
3679     #endif
3680     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3681     type=LOADD_STUB;
3682   }
3683   if (opcode[i]==0x39) { // SWC1
3684     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3685     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3686     type=STOREW_STUB;
3687   }
3688   if (opcode[i]==0x3D) { // SDC1
3689     assert(th>=0);
3690     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3691     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3692     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3693     type=STORED_STUB;
3694   }
3695   if(!using_tlb) {
3696     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3697       #ifndef DESTRUCTIVE_SHIFT
3698       temp=offset||c||s<0?ar:s;
3699       #endif
3700       #if defined(HOST_IMM8)
3701       int ir=get_reg(i_regs->regmap,INVCP);
3702       assert(ir>=0);
3703       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3704       #else
3705       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3706       #endif
3707       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3708       emit_callne(invalidate_addr_reg[temp]);
3709       #else
3710       jaddr3=(int)out;
3711       emit_jne(0);
3712       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3713       #endif
3714     }
3715   }
3716   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3717   if (opcode[i]==0x31) { // LWC1 (write float)
3718     emit_writeword_indexed(tl,0,temp);
3719   }
3720   if (opcode[i]==0x35) { // LDC1 (write double)
3721     emit_writeword_indexed(th,4,temp);
3722     emit_writeword_indexed(tl,0,temp);
3723   }
3724   //if(opcode[i]==0x39)
3725   /*if(opcode[i]==0x39||opcode[i]==0x31)
3726   {
3727     emit_pusha();
3728         emit_readword((int)&last_count,ECX);
3729         if(get_reg(i_regs->regmap,CCREG)<0)
3730           emit_loadreg(CCREG,HOST_CCREG);
3731         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3732         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3733         emit_writeword(HOST_CCREG,(int)&Count);
3734     emit_call((int)memdebug);
3735     emit_popa();
3736   }/**/
3737 #else
3738   cop1_unusable(i, i_regs);
3739 #endif
3740 }
3741
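// GTE (COP2) loads and stores: LWC2 reads a word from memory into data
// register 'copr' via FTEMP and cop2_put_dreg; SWC2 fetches the data
// register with cop2_get_dreg and stores it.  There is no TLB path here,
// hence the assert(!using_tlb) below.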
3742 void c2ls_assemble(int i,struct regstat *i_regs)
3743 {
3744   int s,tl;
3745   int ar;
3746   int offset;
3747   int memtarget=0,c=0;
3748   int jaddr2=0,jaddr3,type;
3749   int agr=AGEN1+(i&1);
3750   u_int hr,reglist=0;
3751   u_int copr=(source[i]>>16)&0x1f;
3752   s=get_reg(i_regs->regmap,rs1[i]);
3753   tl=get_reg(i_regs->regmap,FTEMP);
3754   offset=imm[i];
3755   assert(rs1[i]>0);
3756   assert(tl>=0);
3757   assert(!using_tlb);
3758
3759   for(hr=0;hr<HOST_REGS;hr++) {
3760     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3761   }
3762   if(i_regs->regmap[HOST_CCREG]==CCREG)
3763     reglist&=~(1<<HOST_CCREG);
3764
3765   // get the address
3766   if (opcode[i]==0x3a) { // SWC2
3767     ar=get_reg(i_regs->regmap,agr);
3768     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3769     reglist|=1<<ar;
3770   } else { // LWC2
3771     ar=tl;
3772   }
3773   if(s>=0) c=(i_regs->wasconst>>s)&1;
3774   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3775   if (!offset&&!c&&s>=0) ar=s;
3776   assert(ar>=0);
3777
3778   if (opcode[i]==0x3a) { // SWC2
3779     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3780     type=STOREW_STUB;
3781   }
3782   else
3783     type=LOADW_STUB;
3784
3785   if(c&&!memtarget) {
3786     jaddr2=(int)out;
3787     emit_jmp(0); // inline_readstub/inline_writestub?
3788   }
3789   else {
3790     if(!c) {
3791       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3792       jaddr2=(int)out;
3793       emit_jno(0);
3794     }
3795     if (opcode[i]==0x32) { // LWC2
3796       #ifdef HOST_IMM_ADDR32
3797       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3798       else
3799       #endif
3800       emit_readword_indexed(0,ar,tl);
3801     }
3802     if (opcode[i]==0x3a) { // SWC2
3803       #ifdef DESTRUCTIVE_SHIFT
3804       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3805       #endif
3806       emit_writeword_indexed(tl,0,ar);
3807     }
3808   }
3809   if(jaddr2)
3810     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3811   if (opcode[i]==0x3a) { // SWC2
3812 #if defined(HOST_IMM8)
3813     int ir=get_reg(i_regs->regmap,INVCP);
3814     assert(ir>=0);
3815     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3816 #else
3817     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3818 #endif
3819     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3820     emit_callne(invalidate_addr_reg[ar]);
3821     #else
3822     jaddr3=(int)out;
3823     emit_jne(0);
3824     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3825     #endif
3826   }
3827   if (opcode[i]==0x32) { // LWC2
3828     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3829   }
3830 }
3831
3832 #ifndef multdiv_assemble
3833 void multdiv_assemble(int i,struct regstat *i_regs)
3834 {
3835   printf("Need multdiv_assemble for this architecture.\n");
3836   exit(1);
3837 }
3838 #endif
3839
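// MFHI/MFLO/MTHI/MTLO are handled as plain register moves: copy the
// (possibly 64-bit) source into the destination's host registers, loading
// from the register file when the source isn't currently mapped.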
3840 void mov_assemble(int i,struct regstat *i_regs)
3841 {
3842   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3843   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3844   if(rt1[i]) {
3845     signed char sh,sl,th,tl;
3846     th=get_reg(i_regs->regmap,rt1[i]|64);
3847     tl=get_reg(i_regs->regmap,rt1[i]);
3848     //assert(tl>=0);
3849     if(tl>=0) {
3850       sh=get_reg(i_regs->regmap,rs1[i]|64);
3851       sl=get_reg(i_regs->regmap,rs1[i]);
3852       if(sl>=0) emit_mov(sl,tl);
3853       else emit_loadreg(rs1[i],tl);
3854       if(th>=0) {
3855         if(sh>=0) emit_mov(sh,th);
3856         else emit_loadreg(rs1[i]|64,th);
3857       }
3858     }
3859   }
3860 }
3861
3862 #ifndef fconv_assemble
3863 void fconv_assemble(int i,struct regstat *i_regs)
3864 {
3865   printf("Need fconv_assemble for this architecture.\n");
3866   exit(1);
3867 }
3868 #endif
3869
3870 #if 0
3871 void float_assemble(int i,struct regstat *i_regs)
3872 {
3873   printf("Need float_assemble for this architecture.\n");
3874   exit(1);
3875 }
3876 #endif
3877
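// SYSCALL/HLE/interpreter-fallback entry points: each loads the relevant PC
// into a fixed host register, adds the pending cycle count to HOST_CCREG and
// tail-jumps into the corresponding C handler (jump_syscall_hle,
// jump_hlecall, jump_intcall); nothing is emitted after the jump.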
3878 void syscall_assemble(int i,struct regstat *i_regs)
3879 {
3880   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3881   assert(ccreg==HOST_CCREG);
3882   assert(!is_delayslot);
3883   emit_movimm(start+i*4,EAX); // Get PC
3884   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3885   emit_jmp((int)jump_syscall_hle); // XXX
3886 }
3887
3888 void hlecall_assemble(int i,struct regstat *i_regs)
3889 {
3890   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3891   assert(ccreg==HOST_CCREG);
3892   assert(!is_delayslot);
3893   emit_movimm(start+i*4+4,0); // Get PC
3894   emit_movimm((int)psxHLEt[source[i]&7],1);
3895   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // XXX
3896   emit_jmp((int)jump_hlecall);
3897 }
3898
3899 void intcall_assemble(int i,struct regstat *i_regs)
3900 {
3901   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3902   assert(ccreg==HOST_CCREG);
3903   assert(!is_delayslot);
3904   emit_movimm(start+i*4,0); // Get PC
3905   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG);
3906   emit_jmp((int)jump_intcall);
3907 }
3908
3909 void ds_assemble(int i,struct regstat *i_regs)
3910 {
3911   is_delayslot=1;
3912   switch(itype[i]) {
3913     case ALU:
3914       alu_assemble(i,i_regs);break;
3915     case IMM16:
3916       imm16_assemble(i,i_regs);break;
3917     case SHIFT:
3918       shift_assemble(i,i_regs);break;
3919     case SHIFTIMM:
3920       shiftimm_assemble(i,i_regs);break;
3921     case LOAD:
3922       load_assemble(i,i_regs);break;
3923     case LOADLR:
3924       loadlr_assemble(i,i_regs);break;
3925     case STORE:
3926       store_assemble(i,i_regs);break;
3927     case STORELR:
3928       storelr_assemble(i,i_regs);break;
3929     case COP0:
3930       cop0_assemble(i,i_regs);break;
3931     case COP1:
3932       cop1_assemble(i,i_regs);break;
3933     case C1LS:
3934       c1ls_assemble(i,i_regs);break;
3935     case COP2:
3936       cop2_assemble(i,i_regs);break;
3937     case C2LS:
3938       c2ls_assemble(i,i_regs);break;
3939     case C2OP:
3940       c2op_assemble(i,i_regs);break;
3941     case FCONV:
3942       fconv_assemble(i,i_regs);break;
3943     case FLOAT:
3944       float_assemble(i,i_regs);break;
3945     case FCOMP:
3946       fcomp_assemble(i,i_regs);break;
3947     case MULTDIV:
3948       multdiv_assemble(i,i_regs);break;
3949     case MOV:
3950       mov_assemble(i,i_regs);break;
3951     case SYSCALL:
3952     case HLECALL:
3953     case INTCALL:
3954     case SPAN:
3955     case UJUMP:
3956     case RJUMP:
3957     case CJUMP:
3958     case SJUMP:
3959     case FJUMP:
3960       printf("Jump in the delay slot.  This is probably a bug.\n");
3961   }
3962   is_delayslot=0;
3963 }
3964
3965 // Is the branch target a valid internal jump?
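// (addr&1 marks a register-indirect target, which is never internal; a
// direct target is internal when it falls inside this block,
// i.e. start .. start+slen*4-8.)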
3966 int internal_branch(uint64_t i_is32,int addr)
3967 {
3968   if(addr&1) return 0; // Indirect (register) jump
3969   if(addr>=start && addr<start+slen*4-4)
3970   {
3971     int t=(addr-start)>>2;
3972     // Delay slots are not valid branch targets
3973     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3974     // 64 -> 32 bit transition requires a recompile
3975     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3976     {
3977       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3978       else printf("optimizable: yes\n");
3979     }*/
3980     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3981 #ifndef FORCE32
3982     if(requires_32bit[t]&~i_is32) return 0;
3983     else
3984 #endif
3985       return 1;
3986   }
3987   return 0;
3988 }
3989
3990 #ifndef wb_invalidate
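// Writeback pass used when the register mapping changes: any host register
// whose guest register differs between 'pre' and 'entry' gets its dirty
// value stored back (unless the guest register is unneeded), and values
// that merely moved to a different host register are copied with emit_mov.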
3991 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3992   uint64_t u,uint64_t uu)
3993 {
3994   int hr;
3995   for(hr=0;hr<HOST_REGS;hr++) {
3996     if(hr!=EXCLUDE_REG) {
3997       if(pre[hr]!=entry[hr]) {
3998         if(pre[hr]>=0) {
3999           if((dirty>>hr)&1) {
4000             if(get_reg(entry,pre[hr])<0) {
4001               if(pre[hr]<64) {
4002                 if(!((u>>pre[hr])&1)) {
4003                   emit_storereg(pre[hr],hr);
4004                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
4005                     emit_sarimm(hr,31,hr);
4006                     emit_storereg(pre[hr]|64,hr);
4007                   }
4008                 }
4009               }else{
4010                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
4011                   emit_storereg(pre[hr],hr);
4012                 }
4013               }
4014             }
4015           }
4016         }
4017       }
4018     }
4019   }
4020   // Move from one register to another (no writeback)
4021   for(hr=0;hr<HOST_REGS;hr++) {
4022     if(hr!=EXCLUDE_REG) {
4023       if(pre[hr]!=entry[hr]) {
4024         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
4025           int nr;
4026           if((nr=get_reg(entry,pre[hr]))>=0) {
4027             emit_mov(hr,nr);
4028           }
4029         }
4030       }
4031     }
4032   }
4033 }
4034 #endif
4035
4036 // Load the specified registers
4037 // This only loads the registers given as arguments because
4038 // we don't want to load things that will be overwritten
4039 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
4040 {
4041   int hr;
4042   // Load 32-bit regs
4043   for(hr=0;hr<HOST_REGS;hr++) {
4044     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4045       if(entry[hr]!=regmap[hr]) {
4046         if(regmap[hr]==rs1||regmap[hr]==rs2)
4047         {
4048           if(regmap[hr]==0) {
4049             emit_zeroreg(hr);
4050           }
4051           else
4052           {
4053             emit_loadreg(regmap[hr],hr);
4054           }
4055         }
4056       }
4057     }
4058   }
4059   // Load 64-bit regs
4060   for(hr=0;hr<HOST_REGS;hr++) {
4061     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4062       if(entry[hr]!=regmap[hr]) {
4063         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
4064         {
4065           assert(regmap[hr]!=64);
4066           if((is32>>(regmap[hr]&63))&1) {
4067             int lr=get_reg(regmap,regmap[hr]-64);
4068             if(lr>=0)
4069               emit_sarimm(lr,31,hr);
4070             else
4071               emit_loadreg(regmap[hr],hr);
4072           }
4073           else
4074           {
4075             emit_loadreg(regmap[hr],hr);
4076           }
4077         }
4078       }
4079     }
4080   }
4081 }
4082
4083 // Load registers prior to the start of a loop
4084 // so that they are not loaded within the loop
4085 static void loop_preload(signed char pre[],signed char entry[])
4086 {
4087   int hr;
4088   for(hr=0;hr<HOST_REGS;hr++) {
4089     if(hr!=EXCLUDE_REG) {
4090       if(pre[hr]!=entry[hr]) {
4091         if(entry[hr]>=0) {
4092           if(get_reg(pre,entry[hr])<0) {
4093             assem_debug("loop preload:\n");
4094             //printf("loop preload: %d\n",hr);
4095             if(entry[hr]==0) {
4096               emit_zeroreg(hr);
4097             }
4098             else if(entry[hr]<TEMPREG)
4099             {
4100               emit_loadreg(entry[hr],hr);
4101             }
4102             else if(entry[hr]-64<TEMPREG)
4103             {
4104               emit_loadreg(entry[hr],hr);
4105             }
4106           }
4107         }
4108       }
4109     }
4110   }
4111 }
4112
4113 // Generate address for load/store instruction
4114 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
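// AGEN1/MGEN1 alternate with (i&1), presumably so the address for the
// following instruction can be precomputed (see the preload code at the end
// of this function) without clobbering the current one.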
4115 void address_generation(int i,struct regstat *i_regs,signed char entry[])
4116 {
4117   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
4118     int ra=-1;
4119     int agr=AGEN1+(i&1);
4120     int mgr=MGEN1+(i&1);
4121     if(itype[i]==LOAD) {
4122       ra=get_reg(i_regs->regmap,rt1[i]);
4123       if(ra<0) ra=get_reg(i_regs->regmap,-1); 
4124       assert(ra>=0);
4125     }
4126     if(itype[i]==LOADLR) {
4127       ra=get_reg(i_regs->regmap,FTEMP);
4128     }
4129     if(itype[i]==STORE||itype[i]==STORELR) {
4130       ra=get_reg(i_regs->regmap,agr);
4131       if(ra<0) ra=get_reg(i_regs->regmap,-1);
4132     }
4133     if(itype[i]==C1LS||itype[i]==C2LS) {
4134       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
4135         ra=get_reg(i_regs->regmap,FTEMP);
4136       else { // SWC1/SDC1/SWC2/SDC2
4137         ra=get_reg(i_regs->regmap,agr);
4138         if(ra<0) ra=get_reg(i_regs->regmap,-1);
4139       }
4140     }
4141     int rs=get_reg(i_regs->regmap,rs1[i]);
4142     int rm=get_reg(i_regs->regmap,TLREG);
4143     if(ra>=0) {
4144       int offset=imm[i];
4145       int c=(i_regs->wasconst>>rs)&1;
4146       if(rs1[i]==0) {
4147         // Using r0 as a base address
4148         /*if(rm>=0) {
4149           if(!entry||entry[rm]!=mgr) {
4150             generate_map_const(offset,rm);
4151           } // else did it in the previous cycle
4152         }*/
4153         if(!entry||entry[ra]!=agr) {
4154           if (opcode[i]==0x22||opcode[i]==0x26) {
4155             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4156           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4157             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4158           }else{
4159             emit_movimm(offset,ra);
4160           }
4161         } // else did it in the previous cycle
4162       }
4163       else if(rs<0) {
4164         if(!entry||entry[ra]!=rs1[i])
4165           emit_loadreg(rs1[i],ra);
4166         //if(!entry||entry[ra]!=rs1[i])
4167         //  printf("poor load scheduling!\n");
4168       }
4169       else if(c) {
4170         if(rm>=0) {
4171           if(!entry||entry[rm]!=mgr) {
4172             if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a) {
4173               // Stores to memory go thru the mapper to detect self-modifying
4174               // code, loads don't.
4175               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
4176                  (unsigned int)(constmap[i][rs]+offset)<0x80000000+RAM_SIZE )
4177                 generate_map_const(constmap[i][rs]+offset,rm);
4178             }else{
4179               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
4180                 generate_map_const(constmap[i][rs]+offset,rm);
4181             }
4182           }
4183         }
4184         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
4185           if(!entry||entry[ra]!=agr) {
4186             if (opcode[i]==0x22||opcode[i]==0x26) {
4187               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4188             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4189               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4190             }else{
4191               #ifdef HOST_IMM_ADDR32
4192               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4193                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
4194               #endif
4195               emit_movimm(constmap[i][rs]+offset,ra);
4196             }
4197           } // else did it in the previous cycle
4198         } // else load_consts already did it
4199       }
4200       if(offset&&!c&&rs1[i]) {
4201         if(rs>=0) {
4202           emit_addimm(rs,offset,ra);
4203         }else{
4204           emit_addimm(ra,offset,ra);
4205         }
4206       }
4207     }
4208   }
4209   // Preload constants for next instruction
4210   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
4211     int agr,ra;
4212     #ifndef HOST_IMM_ADDR32
4213     // Mapper entry
4214     agr=MGEN1+((i+1)&1);
4215     ra=get_reg(i_regs->regmap,agr);
4216     if(ra>=0) {
4217       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4218       int offset=imm[i+1];
4219       int c=(regs[i+1].wasconst>>rs)&1;
4220       if(c) {
4221         if(itype[i+1]==STORE||itype[i+1]==STORELR
4222            ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1, SWC2/SDC2
4223           // Stores to memory go thru the mapper to detect self-modifying
4224           // code, loads don't.
4225           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
4226              (unsigned int)(constmap[i+1][rs]+offset)<0x80000000+RAM_SIZE )
4227             generate_map_const(constmap[i+1][rs]+offset,ra);
4228         }else{
4229           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
4230             generate_map_const(constmap[i+1][rs]+offset,ra);
4231         }
4232       }
4233       /*else if(rs1[i]==0) {
4234         generate_map_const(offset,ra);
4235       }*/
4236     }
4237     #endif
4238     // Actual address
4239     agr=AGEN1+((i+1)&1);
4240     ra=get_reg(i_regs->regmap,agr);
4241     if(ra>=0) {
4242       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4243       int offset=imm[i+1];
4244       int c=(regs[i+1].wasconst>>rs)&1;
4245       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
4246         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4247           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4248         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4249           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4250         }else{
4251           #ifdef HOST_IMM_ADDR32
4252           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4253              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
4254           #endif
4255           emit_movimm(constmap[i+1][rs]+offset,ra);
4256         }
4257       }
4258       else if(rs1[i+1]==0) {
4259         // Using r0 as a base address
4260         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4261           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4262         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4263           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4264         }else{
4265           emit_movimm(offset,ra);
4266         }
4267       }
4268     }
4269   }
4270 }
4271
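// Constant-propagation helper: follow the constant held in host register hr
// forward while it stays mapped and constant (and no branch target
// intervenes), so the register can be loaded once with its final value.
// Addresses consumed by an immediately following load (possibly sitting in
// a branch delay slot) are folded in here as well.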
4272 int get_final_value(int hr, int i, int *value)
4273 {
4274   int reg=regs[i].regmap[hr];
4275   while(i<slen-1) {
4276     if(regs[i+1].regmap[hr]!=reg) break;
4277     if(!((regs[i+1].isconst>>hr)&1)) break;
4278     if(bt[i+1]) break;
4279     i++;
4280   }
4281   if(i<slen-1) {
4282     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
4283       *value=constmap[i][hr];
4284       return 1;
4285     }
4286     if(!bt[i+1]) {
4287       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
4288         // Load in delay slot, out-of-order execution
4289         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
4290         {
4291           #ifdef HOST_IMM_ADDR32
4292           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
4293           #endif
4294           // Precompute load address
4295           *value=constmap[i][hr]+imm[i+2];
4296           return 1;
4297         }
4298       }
4299       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
4300       {
4301         #ifdef HOST_IMM_ADDR32
4302         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
4303         #endif
4304         // Precompute load address
4305         *value=constmap[i][hr]+imm[i+1];
4306         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
4307         return 1;
4308       }
4309     }
4310   }
4311   *value=constmap[i][hr];
4312   //printf("c=%x\n",(int)constmap[i][hr]);
4313   if(i==slen-1) return 1;
4314   if(reg<64) {
4315     return !((unneeded_reg[i+1]>>reg)&1);
4316   }else{
4317     return !((unneeded_reg_upper[i+1]>>reg)&1);
4318   }
4319 }
4320
4321 // Load registers with known constants
4322 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
4323 {
4324   int hr;
4325   // Load 32-bit regs
4326   for(hr=0;hr<HOST_REGS;hr++) {
4327     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4328       //if(entry[hr]!=regmap[hr]) {
4329       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4330         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4331           int value;
4332           if(get_final_value(hr,i,&value)) {
4333             if(value==0) {
4334               emit_zeroreg(hr);
4335             }
4336             else {
4337               emit_movimm(value,hr);
4338             }
4339           }
4340         }
4341       }
4342     }
4343   }
4344   // Load 64-bit regs
4345   for(hr=0;hr<HOST_REGS;hr++) {
4346     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4347       //if(entry[hr]!=regmap[hr]) {
4348       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4349         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4350           if((is32>>(regmap[hr]&63))&1) {
4351             int lr=get_reg(regmap,regmap[hr]-64);
4352             assert(lr>=0);
4353             emit_sarimm(lr,31,hr);
4354           }
4355           else
4356           {
4357             int value;
4358             if(get_final_value(hr,i,&value)) {
4359               if(value==0) {
4360                 emit_zeroreg(hr);
4361               }
4362               else {
4363                 emit_movimm(value,hr);
4364               }
4365             }
4366           }
4367         }
4368       }
4369     }
4370   }
4371 }
4372 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4373 {
4374   int hr;
4375   // Load 32-bit regs
4376   for(hr=0;hr<HOST_REGS;hr++) {
4377     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4378       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4379         int value=constmap[i][hr];
4380         if(value==0) {
4381           emit_zeroreg(hr);
4382         }
4383         else {
4384           emit_movimm(value,hr);
4385         }
4386       }
4387     }
4388   }
4389   // Load 64-bit regs
4390   for(hr=0;hr<HOST_REGS;hr++) {
4391     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4392       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4393         if((is32>>(regmap[hr]&63))&1) {
4394           int lr=get_reg(regmap,regmap[hr]-64);
4395           assert(lr>=0);
4396           emit_sarimm(lr,31,hr);
4397         }
4398         else
4399         {
4400           int value=constmap[i][hr];
4401           if(value==0) {
4402             emit_zeroreg(hr);
4403           }
4404           else {
4405             emit_movimm(value,hr);
4406           }
4407         }
4408       }
4409     }
4410   }
4411 }
4412
4413 // Write out all dirty registers (except cycle count)
4414 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4415 {
4416   int hr;
4417   for(hr=0;hr<HOST_REGS;hr++) {
4418     if(hr!=EXCLUDE_REG) {
4419       if(i_regmap[hr]>0) {
4420         if(i_regmap[hr]!=CCREG) {
4421           if((i_dirty>>hr)&1) {
4422             if(i_regmap[hr]<64) {
4423               emit_storereg(i_regmap[hr],hr);
4424 #ifndef FORCE32
4425               if( ((i_is32>>i_regmap[hr])&1) ) {
4426                 #ifdef DESTRUCTIVE_WRITEBACK
4427                 emit_sarimm(hr,31,hr);
4428                 emit_storereg(i_regmap[hr]|64,hr);
4429                 #else
4430                 emit_sarimm(hr,31,HOST_TEMPREG);
4431                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4432                 #endif
4433               }
4434 #endif
4435             }else{
4436               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4437                 emit_storereg(i_regmap[hr],hr);
4438               }
4439             }
4440           }
4441         }
4442       }
4443     }
4444   }
4445 }
4446 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4447 // This writes the registers not written by store_regs_bt
4448 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4449 {
4450   int hr;
4451   int t=(addr-start)>>2;
4452   for(hr=0;hr<HOST_REGS;hr++) {
4453     if(hr!=EXCLUDE_REG) {
4454       if(i_regmap[hr]>0) {
4455         if(i_regmap[hr]!=CCREG) {
4456           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4457             if((i_dirty>>hr)&1) {
4458               if(i_regmap[hr]<64) {
4459                 emit_storereg(i_regmap[hr],hr);
4460 #ifndef FORCE32
4461                 if( ((i_is32>>i_regmap[hr])&1) ) {
4462                   #ifdef DESTRUCTIVE_WRITEBACK
4463                   emit_sarimm(hr,31,hr);
4464                   emit_storereg(i_regmap[hr]|64,hr);
4465                   #else
4466                   emit_sarimm(hr,31,HOST_TEMPREG);
4467                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4468                   #endif
4469                 }
4470 #endif
4471               }else{
4472                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4473                   emit_storereg(i_regmap[hr],hr);
4474                 }
4475               }
4476             }
4477           }
4478         }
4479       }
4480     }
4481   }
4482 }
4483
4484 // Load all registers (except cycle count)
4485 void load_all_regs(signed char i_regmap[])
4486 {
4487   int hr;
4488   for(hr=0;hr<HOST_REGS;hr++) {
4489     if(hr!=EXCLUDE_REG) {
4490       if(i_regmap[hr]==0) {
4491         emit_zeroreg(hr);
4492       }
4493       else
4494       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4495       {
4496         emit_loadreg(i_regmap[hr],hr);
4497       }
4498     }
4499   }
4500 }
4501
4502 // Load all current registers that are also needed by the next instruction
4503 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4504 {
4505   int hr;
4506   for(hr=0;hr<HOST_REGS;hr++) {
4507     if(hr!=EXCLUDE_REG) {
4508       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4509         if(i_regmap[hr]==0) {
4510           emit_zeroreg(hr);
4511         }
4512         else
4513         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4514         {
4515           emit_loadreg(i_regmap[hr],hr);
4516         }
4517       }
4518     }
4519   }
4520 }
4521
4522 // Load all regs, storing cycle count if necessary
4523 void load_regs_entry(int t)
4524 {
4525   int hr;
4526   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG);
4527   else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG);
4528   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4529     emit_storereg(CCREG,HOST_CCREG);
4530   }
4531   // Load 32-bit regs
4532   for(hr=0;hr<HOST_REGS;hr++) {
4533     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4534       if(regs[t].regmap_entry[hr]==0) {
4535         emit_zeroreg(hr);
4536       }
4537       else if(regs[t].regmap_entry[hr]!=CCREG)
4538       {
4539         emit_loadreg(regs[t].regmap_entry[hr],hr);
4540       }
4541     }
4542   }
4543   // Load 64-bit regs
4544   for(hr=0;hr<HOST_REGS;hr++) {
4545     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4546       assert(regs[t].regmap_entry[hr]!=64);
4547       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4548         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4549         if(lr<0) {
4550           emit_loadreg(regs[t].regmap_entry[hr],hr);
4551         }
4552         else
4553         {
4554           emit_sarimm(lr,31,hr);
4555         }
4556       }
4557       else
4558       {
4559         emit_loadreg(regs[t].regmap_entry[hr],hr);
4560       }
4561     }
4562   }
4563 }
4564
4565 // Store dirty registers prior to branch
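// For internal targets only values the target block does not already expect
// in the same host register (same mapping, dirty, compatible 32/64-bit
// state) are written back; branches out of the block fall back to a full
// wb_dirtys().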
4566 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4567 {
4568   if(internal_branch(i_is32,addr))
4569   {
4570     int t=(addr-start)>>2;
4571     int hr;
4572     for(hr=0;hr<HOST_REGS;hr++) {
4573       if(hr!=EXCLUDE_REG) {
4574         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4575           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4576             if((i_dirty>>hr)&1) {
4577               if(i_regmap[hr]<64) {
4578                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4579                   emit_storereg(i_regmap[hr],hr);
4580                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4581                     #ifdef DESTRUCTIVE_WRITEBACK
4582                     emit_sarimm(hr,31,hr);
4583                     emit_storereg(i_regmap[hr]|64,hr);
4584                     #else
4585                     emit_sarimm(hr,31,HOST_TEMPREG);
4586                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4587                     #endif
4588                   }
4589                 }
4590               }else{
4591                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4592                   emit_storereg(i_regmap[hr],hr);
4593                 }
4594               }
4595             }
4596           }
4597         }
4598       }
4599     }
4600   }
4601   else
4602   {
4603     // Branch out of this block, write out all dirty regs
4604     wb_dirtys(i_regmap,i_is32,i_dirty);
4605   }
4606 }
4607
4608 // Load all needed registers for branch target
4609 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4610 {
4611   //if(addr>=start && addr<(start+slen*4))
4612   if(internal_branch(i_is32,addr))
4613   {
4614     int t=(addr-start)>>2;
4615     int hr;
4616     // Store the cycle count before loading something else
4617     if(i_regmap[HOST_CCREG]!=CCREG) {
4618       assert(i_regmap[HOST_CCREG]==-1);
4619     }
4620     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4621       emit_storereg(CCREG,HOST_CCREG);
4622     }
4623     // Load 32-bit regs
4624     for(hr=0;hr<HOST_REGS;hr++) {
4625       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4626         #ifdef DESTRUCTIVE_WRITEBACK
4627         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4628         #else
4629         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4630         #endif
4631           if(regs[t].regmap_entry[hr]==0) {
4632             emit_zeroreg(hr);
4633           }
4634           else if(regs[t].regmap_entry[hr]!=CCREG)
4635           {
4636             emit_loadreg(regs[t].regmap_entry[hr],hr);
4637           }
4638         }
4639       }
4640     }
4641     // Load 64-bit regs
4642     for(hr=0;hr<HOST_REGS;hr++) {
4643       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4644         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4645           assert(regs[t].regmap_entry[hr]!=64);
4646           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4647             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4648             if(lr<0) {
4649               emit_loadreg(regs[t].regmap_entry[hr],hr);
4650             }
4651             else
4652             {
4653               emit_sarimm(lr,31,hr);
4654             }
4655           }
4656           else
4657           {
4658             emit_loadreg(regs[t].regmap_entry[hr],hr);
4659           }
4660         }
4661         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4662           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4663           assert(lr>=0);
4664           emit_sarimm(lr,31,hr);
4665         }
4666       }
4667     }
4668   }
4669 }
4670
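// Decide whether the current register state already matches the state
// expected at the branch target: internal targets must agree on the mapping,
// dirtiness and 32/64-bit width of every needed register, while external
// targets only require that nothing except the cycle count is dirty.  A
// match lets the branch link directly without writeback/reload.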
4671 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4672 {
4673   if(addr>=start && addr<start+slen*4-4)
4674   {
4675     int t=(addr-start)>>2;
4676     int hr;
4677     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4678     for(hr=0;hr<HOST_REGS;hr++)
4679     {
4680       if(hr!=EXCLUDE_REG)
4681       {
4682         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4683         {
4684           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4685           {
4686             return 0;
4687           }
4688           else 
4689           if((i_dirty>>hr)&1)
4690           {
4691             if(i_regmap[hr]<TEMPREG)
4692             {
4693               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4694                 return 0;
4695             }
4696             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4697             {
4698               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4699                 return 0;
4700             }
4701           }
4702         }
4703         else // Same register but is it 32-bit or dirty?
4704         if(i_regmap[hr]>=0)
4705         {
4706           if(!((regs[t].dirty>>hr)&1))
4707           {
4708             if((i_dirty>>hr)&1)
4709             {
4710               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4711               {
4712                 //printf("%x: dirty no match\n",addr);
4713                 return 0;
4714               }
4715             }
4716           }
4717           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4718           {
4719             //printf("%x: is32 no match\n",addr);
4720             return 0;
4721           }
4722         }
4723       }
4724     }
4725     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4726 #ifndef FORCE32
4727     if(requires_32bit[t]&~i_is32) return 0;
4728 #endif
4729     // Delay slots are not valid branch targets
4730     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4731     // Delay slots require additional processing, so do not match
4732     if(is_ds[t]) return 0;
4733   }
4734   else
4735   {
4736     int hr;
4737     for(hr=0;hr<HOST_REGS;hr++)
4738     {
4739       if(hr!=EXCLUDE_REG)
4740       {
4741         if(i_regmap[hr]>=0)
4742         {
4743           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4744           {
4745             if((i_dirty>>hr)&1)
4746             {
4747               return 0;
4748             }
4749           }
4750         }
4751       }
4752     }
4753   }
4754   return 1;
4755 }
4756
4757 // Used when a branch jumps into the delay slot of another branch
4758 void ds_assemble_entry(int i)
4759 {
4760   int t=(ba[i]-start)>>2;
4761   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4762   assem_debug("Assemble delay slot at %x\n",ba[i]);
4763   assem_debug("<->\n");
4764   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4765     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4766   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4767   address_generation(t,&regs[t],regs[t].regmap_entry);
4768   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4769     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4770   cop1_usable=0;
4771   is_delayslot=0;
4772   switch(itype[t]) {
4773     case ALU:
4774       alu_assemble(t,&regs[t]);break;
4775     case IMM16:
4776       imm16_assemble(t,&regs[t]);break;
4777     case SHIFT:
4778       shift_assemble(t,&regs[t]);break;
4779     case SHIFTIMM:
4780       shiftimm_assemble(t,&regs[t]);break;
4781     case LOAD:
4782       load_assemble(t,&regs[t]);break;
4783     case LOADLR:
4784       loadlr_assemble(t,&regs[t]);break;
4785     case STORE:
4786       store_assemble(t,&regs[t]);break;
4787     case STORELR:
4788       storelr_assemble(t,&regs[t]);break;
4789     case COP0:
4790       cop0_assemble(t,&regs[t]);break;
4791     case COP1:
4792       cop1_assemble(t,&regs[t]);break;
4793     case C1LS:
4794       c1ls_assemble(t,&regs[t]);break;
4795     case COP2:
4796       cop2_assemble(t,&regs[t]);break;
4797     case C2LS:
4798       c2ls_assemble(t,&regs[t]);break;
4799     case C2OP:
4800       c2op_assemble(t,&regs[t]);break;
4801     case FCONV:
4802       fconv_assemble(t,&regs[t]);break;
4803     case FLOAT:
4804       float_assemble(t,&regs[t]);break;
4805     case FCOMP:
4806       fcomp_assemble(t,&regs[t]);break;
4807     case MULTDIV:
4808       multdiv_assemble(t,&regs[t]);break;
4809     case MOV:
4810       mov_assemble(t,&regs[t]);break;
4811     case SYSCALL:
4812     case HLECALL:
4813     case INTCALL:
4814     case SPAN:
4815     case UJUMP:
4816     case RJUMP:
4817     case CJUMP:
4818     case SJUMP:
4819     case FJUMP:
4820       printf("Jump in the delay slot.  This is probably a bug.\n");
4821   }
4822   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4823   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4824   if(internal_branch(regs[t].is32,ba[i]+4))
4825     assem_debug("branch: internal\n");
4826   else
4827     assem_debug("branch: external\n");
4828   assert(internal_branch(regs[t].is32,ba[i]+4));
4829   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4830   emit_jmp(0);
4831 }
4832
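// Emit the cycle-count check for a branch: the counter in HOST_CCREG,
// advanced by the cycles accumulated in this block, is tested and a CC_STUB
// is taken once it reaches zero (an event/interrupt is due).  A branch to
// itself with a NOP in the delay slot is treated as an idle loop: the
// counter is masked down (HOST_CCREG&=3) and control goes straight to the
// stub, presumably fast-forwarding to the next event instead of spinning.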
4833 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4834 {
4835   int count;
4836   int jaddr;
4837   int idle=0;
4838   if(itype[i]==RJUMP)
4839   {
4840     *adj=0;
4841   }
4842   //if(ba[i]>=start && ba[i]<(start+slen*4))
4843   if(internal_branch(branch_regs[i].is32,ba[i]))
4844   {
4845     int t=(ba[i]-start)>>2;
4846     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4847     else *adj=ccadj[t];
4848   }
4849   else
4850   {
4851     *adj=0;
4852   }
4853   count=ccadj[i];
4854   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4855     // Idle loop
4856     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4857     idle=(int)out;
4858     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4859     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4860     jaddr=(int)out;
4861     emit_jmp(0);
4862   }
4863   else if(*adj==0||invert) {
4864     emit_addimm_and_set_flags(CLOCK_DIVIDER*(count+2),HOST_CCREG);
4865     jaddr=(int)out;
4866     emit_jns(0);
4867   }
4868   else
4869   {
4870     emit_cmpimm(HOST_CCREG,-CLOCK_DIVIDER*(count+2));
4871     jaddr=(int)out;
4872     emit_jns(0);
4873   }
4874   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4875 }
4876
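// Out-of-line continuation for the cycle-count checks emitted by do_cc():
// writes back whatever register state the branch direction requires, stores
// the return PC (re-evaluating the branch condition when the PC depends on
// whether the branch is taken), calls cc_interrupt, then reloads registers
// and jumps back into the compiled code.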
4877 void do_ccstub(int n)
4878 {
4879   literal_pool(256);
4880   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4881   set_jump_target(stubs[n][1],(int)out);
4882   int i=stubs[n][4];
4883   if(stubs[n][6]==NULLDS) {
4884     // Delay slot instruction is nullified ("likely" branch)
4885     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4886   }
4887   else if(stubs[n][6]!=TAKEN) {
4888     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4889   }
4890   else {
4891     if(internal_branch(branch_regs[i].is32,ba[i]))
4892       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4893   }
4894   if(stubs[n][5]!=-1)
4895   {
4896     // Save PC as return address
4897     emit_movimm(stubs[n][5],EAX);
4898     emit_writeword(EAX,(int)&pcaddr);
4899   }
4900   else
4901   {
4902     // Return address depends on which way the branch goes
4903     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4904     {
4905       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4906       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4907       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4908       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4909       if(rs1[i]==0)
4910       {
4911         s1l=s2l;s1h=s2h;
4912         s2l=s2h=-1;
4913       }
4914       else if(rs2[i]==0)
4915       {
4916         s2l=s2h=-1;
4917       }
4918       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4919         s1h=s2h=-1;
4920       }
4921       assert(s1l>=0);
4922       #ifdef DESTRUCTIVE_WRITEBACK
4923       if(rs1[i]) {
4924         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4925           emit_loadreg(rs1[i],s1l);
4926       } 
4927       else {
4928         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4929           emit_loadreg(rs2[i],s1l);
4930       }
4931       if(s2l>=0)
4932         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4933           emit_loadreg(rs2[i],s2l);
4934       #endif
4935       int hr=0;
4936       int addr=-1,alt=-1,ntaddr=-1;
4937       while(hr<HOST_REGS)
4938       {
4939         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4940            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4941            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4942         {
4943           addr=hr++;break;
4944         }
4945         hr++;
4946       }
4947       while(hr<HOST_REGS)
4948       {
4949         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4950            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4951            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4952         {
4953           alt=hr++;break;
4954         }
4955         hr++;
4956       }
4957       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4958       {
4959         while(hr<HOST_REGS)
4960         {
4961           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4962              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4963              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4964           {
4965             ntaddr=hr;break;
4966           }
4967           hr++;
4968         }
4969         assert(hr<HOST_REGS);
4970       }
4971       if((opcode[i]&0x2f)==4) // BEQ
4972       {
4973         #ifdef HAVE_CMOV_IMM
4974         if(s1h<0) {
4975           if(s2l>=0) emit_cmp(s1l,s2l);
4976           else emit_test(s1l,s1l);
4977           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4978         }
4979         else
4980         #endif
4981         {
4982           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4983           if(s1h>=0) {
4984             if(s2h>=0) emit_cmp(s1h,s2h);
4985             else emit_test(s1h,s1h);
4986             emit_cmovne_reg(alt,addr);
4987           }
4988           if(s2l>=0) emit_cmp(s1l,s2l);
4989           else emit_test(s1l,s1l);
4990           emit_cmovne_reg(alt,addr);
4991         }
4992       }
4993       if((opcode[i]&0x2f)==5) // BNE
4994       {
4995         #ifdef HAVE_CMOV_IMM
4996         if(s1h<0) {
4997           if(s2l>=0) emit_cmp(s1l,s2l);
4998           else emit_test(s1l,s1l);
4999           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5000         }
5001         else
5002         #endif
5003         {
5004           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5005           if(s1h>=0) {
5006             if(s2h>=0) emit_cmp(s1h,s2h);
5007             else emit_test(s1h,s1h);
5008             emit_cmovne_reg(alt,addr);
5009           }
5010           if(s2l>=0) emit_cmp(s1l,s2l);
5011           else emit_test(s1l,s1l);
5012           emit_cmovne_reg(alt,addr);
5013         }
5014       }
5015       if((opcode[i]&0x2f)==6) // BLEZ
5016       {
5017         //emit_movimm(ba[i],alt);
5018         //emit_movimm(start+i*4+8,addr);
5019         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5020         emit_cmpimm(s1l,1);
5021         if(s1h>=0) emit_mov(addr,ntaddr);
5022         emit_cmovl_reg(alt,addr);
5023         if(s1h>=0) {
5024           emit_test(s1h,s1h);
5025           emit_cmovne_reg(ntaddr,addr);
5026           emit_cmovs_reg(alt,addr);
5027         }
5028       }
5029       if((opcode[i]&0x2f)==7) // BGTZ
5030       {
5031         //emit_movimm(ba[i],addr);
5032         //emit_movimm(start+i*4+8,ntaddr);
5033         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5034         emit_cmpimm(s1l,1);
5035         if(s1h>=0) emit_mov(addr,alt);
5036         emit_cmovl_reg(ntaddr,addr);
5037         if(s1h>=0) {
5038           emit_test(s1h,s1h);
5039           emit_cmovne_reg(alt,addr);
5040           emit_cmovs_reg(ntaddr,addr);
5041         }
5042       }
5043       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
5044       {
5045         //emit_movimm(ba[i],alt);
5046         //emit_movimm(start+i*4+8,addr);
5047         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5048         if(s1h>=0) emit_test(s1h,s1h);
5049         else emit_test(s1l,s1l);
5050         emit_cmovs_reg(alt,addr);
5051       }
5052       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
5053       {
5054         //emit_movimm(ba[i],addr);
5055         //emit_movimm(start+i*4+8,alt);
5056         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5057         if(s1h>=0) emit_test(s1h,s1h);
5058         else emit_test(s1l,s1l);
5059         emit_cmovs_reg(alt,addr);
5060       }
5061       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
5062         if(source[i]&0x10000) // BC1T
5063         {
5064           //emit_movimm(ba[i],alt);
5065           //emit_movimm(start+i*4+8,addr);
5066           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5067           emit_testimm(s1l,0x800000);
5068           emit_cmovne_reg(alt,addr);
5069         }
5070         else // BC1F
5071         {
5072           //emit_movimm(ba[i],addr);
5073           //emit_movimm(start+i*4+8,alt);
5074           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5075           emit_testimm(s1l,0x800000);
5076           emit_cmovne_reg(alt,addr);
5077         }
5078       }
5079       emit_writeword(addr,(int)&pcaddr);
5080     }
5081     else
5082     if(itype[i]==RJUMP)
5083     {
5084       int r=get_reg(branch_regs[i].regmap,rs1[i]);
5085       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5086         r=get_reg(branch_regs[i].regmap,RTEMP);
5087       }
5088       emit_writeword(r,(int)&pcaddr);
5089     }
5090     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
5091   }
5092   // Update cycle count
5093   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
5094   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
5095   emit_call((int)cc_interrupt);
5096   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
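  // Restore the register state expected where execution resumes: the branch
  // target (TAKEN), the fall-through path (NOTTAKEN), or the path with the
  // delay slot nullified (NULLDS).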
5097   if(stubs[n][6]==TAKEN) {
5098     if(internal_branch(branch_regs[i].is32,ba[i]))
5099       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
5100     else if(itype[i]==RJUMP) {
5101       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
5102         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
5103       else
5104         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
5105     }
5106   }else if(stubs[n][6]==NOTTAKEN) {
5107     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
5108     else load_all_regs(branch_regs[i].regmap);
5109   }else if(stubs[n][6]==NULLDS) {
5110     // Delay slot instruction is nullified ("likely" branch)
5111     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
5112     else load_all_regs(regs[i].regmap);
5113   }else{
5114     load_all_regs(branch_regs[i].regmap);
5115   }
5116   emit_jmp(stubs[n][2]); // return address
5117   
5118   /* This works but uses a lot of memory...
5119   emit_readword((int)&last_count,ECX);
5120   emit_add(HOST_CCREG,ECX,EAX);
5121   emit_writeword(EAX,(int)&Count);
5122   emit_call((int)gen_interupt);
5123   emit_readword((int)&Count,HOST_CCREG);
5124   emit_readword((int)&next_interupt,EAX);
5125   emit_readword((int)&pending_exception,EBX);
5126   emit_writeword(EAX,(int)&last_count);
5127   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
5128   emit_test(EBX,EBX);
5129   int jne_instr=(int)out;
5130   emit_jne(0);
5131   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
5132   load_all_regs(branch_regs[i].regmap);
5133   emit_jmp(stubs[n][2]); // return address
5134   set_jump_target(jne_instr,(int)out);
5135   emit_readword((int)&pcaddr,EAX);
5136   // Call get_addr_ht instead of doing the hash table here.
5137   // This code is executed infrequently and takes up a lot of space
5138   // so smaller is better.
5139   emit_storereg(CCREG,HOST_CCREG);
5140   emit_pushreg(EAX);
5141   emit_call((int)get_addr_ht);
5142   emit_loadreg(CCREG,HOST_CCREG);
5143   emit_addimm(ESP,4,ESP);
5144   emit_jmpreg(EAX);*/
5145 }
5146
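// Queue a jump for later patching: remember the address of the emitted jump
// instruction, its target virtual address, and a linkage flag.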
5147 void add_to_linker(int addr,int target,int ext)
5148 {
5149   link_addr[linkcount][0]=addr;
5150   link_addr[linkcount][1]=target;
5151   link_addr[linkcount][2]=ext;  
5152   linkcount++;
5153 }
5154
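// Assemble an unconditional jump (J/JAL): run the delay slot, load the return
// address into the link register for JAL, update the cycle count, then jump
// to the target (external targets are patched in later by the linker).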
5155 void ujump_assemble(int i,struct regstat *i_regs)
5156 {
5157   signed char *i_regmap=i_regs->regmap;
5158   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5159   address_generation(i+1,i_regs,regs[i].regmap_entry);
5160   #ifdef REG_PREFETCH
5161   int temp=get_reg(branch_regs[i].regmap,PTEMP);
5162   if(rt1[i]==31&&temp>=0) 
5163   {
5164     int return_address=start+i*4+8;
5165     if(get_reg(branch_regs[i].regmap,31)>0) 
5166     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5167   }
5168   #endif
5169   if(rt1[i]==31) {
5170     int rt;
5171     unsigned int return_address;
5172     rt=get_reg(branch_regs[i].regmap,31);
5173     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5174     //assert(rt>=0);
5175     return_address=start+i*4+8;
5176     if(rt>=0) {
5177       #ifdef USE_MINI_HT
5178       if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
5179         int temp=-1; // note: must be ds-safe
5180         #ifdef HOST_TEMPREG
5181         temp=HOST_TEMPREG;
5182         #endif
5183         if(temp>=0) do_miniht_insert(return_address,rt,temp);
5184         else emit_movimm(return_address,rt);
5185       }
5186       else
5187       #endif
5188       {
5189         #ifdef REG_PREFETCH
5190         if(temp>=0) 
5191         {
5192           if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5193         }
5194         #endif
5195         emit_movimm(return_address,rt); // PC into link register
5196         #ifdef IMM_PREFETCH
5197         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5198         #endif
5199       }
5200     }
5201   }
5202   ds_assemble(i+1,i_regs);
5203   uint64_t bc_unneeded=branch_regs[i].u;
5204   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5205   bc_unneeded|=1|(1LL<<rt1[i]);
5206   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5207   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5208                 bc_unneeded,bc_unneeded_upper);
5209   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5210   int cc,adj;
5211   cc=get_reg(branch_regs[i].regmap,CCREG);
5212   assert(cc==HOST_CCREG);
5213   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5214   #ifdef REG_PREFETCH
5215   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5216   #endif
5217   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5218   if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5219   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5220   if(internal_branch(branch_regs[i].is32,ba[i]))
5221     assem_debug("branch: internal\n");
5222   else
5223     assem_debug("branch: external\n");
5224   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
5225     ds_assemble_entry(i);
5226   }
5227   else {
5228     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
5229     emit_jmp(0);
5230   }
5231 }
5232
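// Assemble a register-indirect jump (JR/JALR): copy the target register if the
// delay slot overwrites it, run the delay slot, set the link register for
// JALR, check the cycle count, then dispatch through jump_vaddr_reg (or the
// mini hash table for jr $ra when USE_MINI_HT is enabled).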
5233 void rjump_assemble(int i,struct regstat *i_regs)
5234 {
5235   signed char *i_regmap=i_regs->regmap;
5236   int temp;
5237   int rs,cc,adj;
5238   rs=get_reg(branch_regs[i].regmap,rs1[i]);
5239   assert(rs>=0);
5240   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5241     // Delay slot abuse, make a copy of the branch address register
5242     temp=get_reg(branch_regs[i].regmap,RTEMP);
5243     assert(temp>=0);
5244     assert(regs[i].regmap[temp]==RTEMP);
5245     emit_mov(rs,temp);
5246     rs=temp;
5247   }
5248   address_generation(i+1,i_regs,regs[i].regmap_entry);
5249   #ifdef REG_PREFETCH
5250   if(rt1[i]==31) 
5251   {
5252     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
5253       int return_address=start+i*4+8;
5254       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5255     }
5256   }
5257   #endif
5258   #ifdef USE_MINI_HT
5259   if(rs1[i]==31) {
5260     int rh=get_reg(regs[i].regmap,RHASH);
5261     if(rh>=0) do_preload_rhash(rh);
5262   }
5263   #endif
5264   ds_assemble(i+1,i_regs);
5265   uint64_t bc_unneeded=branch_regs[i].u;
5266   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5267   bc_unneeded|=1|(1LL<<rt1[i]);
5268   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5269   bc_unneeded&=~(1LL<<rs1[i]);
5270   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5271                 bc_unneeded,bc_unneeded_upper);
5272   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
5273   if(rt1[i]!=0) {
5274     int rt,return_address;
5275     assert(rt1[i+1]!=rt1[i]);
5276     assert(rt2[i+1]!=rt1[i]);
5277     rt=get_reg(branch_regs[i].regmap,rt1[i]);
5278     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5279     assert(rt>=0);
5280     return_address=start+i*4+8;
5281     #ifdef REG_PREFETCH
5282     if(temp>=0) 
5283     {
5284       if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5285     }
5286     #endif
5287     emit_movimm(return_address,rt); // PC into link register
5288     #ifdef IMM_PREFETCH
5289     emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5290     #endif
5291   }
5292   cc=get_reg(branch_regs[i].regmap,CCREG);
5293   assert(cc==HOST_CCREG);
5294   #ifdef USE_MINI_HT
5295   int rh=get_reg(branch_regs[i].regmap,RHASH);
5296   int ht=get_reg(branch_regs[i].regmap,RHTBL);
5297   if(rs1[i]==31) {
5298     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
5299     do_preload_rhtbl(ht);
5300     do_rhash(rs,rh);
5301   }
5302   #endif
5303   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5304   #ifdef DESTRUCTIVE_WRITEBACK
5305   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
5306     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
5307       emit_loadreg(rs1[i],rs);
5308     }
5309   }
5310   #endif
5311   #ifdef REG_PREFETCH
5312   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5313   #endif
5314   #ifdef USE_MINI_HT
5315   if(rs1[i]==31) {
5316     do_miniht_load(ht,rh);
5317   }
5318   #endif
5319   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
5320   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
5321   //assert(adj==0);
5322   emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5323   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
5324   emit_jns(0);
5325   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5326   #ifdef USE_MINI_HT
5327   if(rs1[i]==31) {
5328     do_miniht_jump(rs,rh,ht);
5329   }
5330   else
5331   #endif
5332   {
5333     //if(rs!=EAX) emit_mov(rs,EAX);
5334     //emit_jmp((int)jump_vaddr_eax);
5335     emit_jmp(jump_vaddr_reg[rs]);
5336   }
5337   /* Check hash table
5338   temp=!rs;
5339   emit_mov(rs,temp);
5340   emit_shrimm(rs,16,rs);
5341   emit_xor(temp,rs,rs);
5342   emit_movzwl_reg(rs,rs);
5343   emit_shlimm(rs,4,rs);
5344   emit_cmpmem_indexed((int)hash_table,rs,temp);
5345   emit_jne((int)out+14);
5346   emit_readword_indexed((int)hash_table+4,rs,rs);
5347   emit_jmpreg(rs);
5348   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
5349   emit_addimm_no_flags(8,rs);
5350   emit_jeq((int)out-17);
5351   // No hit on hash table, call compiler
5352   emit_pushreg(temp);
5353 //DEBUG >
5354 #ifdef DEBUG_CYCLE_COUNT
5355   emit_readword((int)&last_count,ECX);
5356   emit_add(HOST_CCREG,ECX,HOST_CCREG);
5357   emit_readword((int)&next_interupt,ECX);
5358   emit_writeword(HOST_CCREG,(int)&Count);
5359   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
5360   emit_writeword(ECX,(int)&last_count);
5361 #endif
5362 //DEBUG <
5363   emit_storereg(CCREG,HOST_CCREG);
5364   emit_call((int)get_addr);
5365   emit_loadreg(CCREG,HOST_CCREG);
5366   emit_addimm(ESP,4,ESP);
5367   emit_jmpreg(EAX);*/
5368   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5369   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5370   #endif
5371 }
5372
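// Assemble a conditional branch (BEQ/BNE/BLEZ/BGTZ and the "likely" variants).
// With ooo[i] set the delay slot is executed first and the branch is resolved
// afterwards; otherwise the comparison comes first and the delay slot is
// assembled separately on the taken and not-taken paths.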
5373 void cjump_assemble(int i,struct regstat *i_regs)
5374 {
5375   signed char *i_regmap=i_regs->regmap;
5376   int cc;
5377   int match;
5378   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5379   assem_debug("match=%d\n",match);
5380   int s1h,s1l,s2h,s2l;
5381   int prev_cop1_usable=cop1_usable;
5382   int unconditional=0,nop=0;
5383   int only32=0;
5384   int invert=0;
5385   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5386   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5387   if(!match) invert=1;
5388   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5389   if(i>(ba[i]-start)>>2) invert=1;
5390   #endif
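  // invert means the condition is emitted with the opposite sense so the
  // taken path can be placed inline and jumped over when the branch is not
  // taken (used when the target's register mapping doesn't match, or for
  // backward branches with the Cortex-A8 hack).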
5391   
5392   if(ooo[i]) {
5393     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5394     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5395     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5396     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5397   }
5398   else {
5399     s1l=get_reg(i_regmap,rs1[i]);
5400     s1h=get_reg(i_regmap,rs1[i]|64);
5401     s2l=get_reg(i_regmap,rs2[i]);
5402     s2h=get_reg(i_regmap,rs2[i]|64);
5403   }
5404   if(rs1[i]==0&&rs2[i]==0)
5405   {
5406     if(opcode[i]&1) nop=1;
5407     else unconditional=1;
5408     //assert(opcode[i]!=5);
5409     //assert(opcode[i]!=7);
5410     //assert(opcode[i]!=0x15);
5411     //assert(opcode[i]!=0x17);
5412   }
5413   else if(rs1[i]==0)
5414   {
5415     s1l=s2l;s1h=s2h;
5416     s2l=s2h=-1;
5417     only32=(regs[i].was32>>rs2[i])&1;
5418   }
5419   else if(rs2[i]==0)
5420   {
5421     s2l=s2h=-1;
5422     only32=(regs[i].was32>>rs1[i])&1;
5423   }
5424   else {
5425     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5426   }
5427
5428   if(ooo[i]) {
5429     // Out of order execution (delay slot first)
5430     //printf("OOOE\n");
5431     address_generation(i+1,i_regs,regs[i].regmap_entry);
5432     ds_assemble(i+1,i_regs);
5433     int adj;
5434     uint64_t bc_unneeded=branch_regs[i].u;
5435     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5436     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5437     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5438     bc_unneeded|=1;
5439     bc_unneeded_upper|=1;
5440     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5441                   bc_unneeded,bc_unneeded_upper);
5442     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5443     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5444     cc=get_reg(branch_regs[i].regmap,CCREG);
5445     assert(cc==HOST_CCREG);
5446     if(unconditional) 
5447       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5448     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5449     //assem_debug("cycle count (adj)\n");
5450     if(unconditional) {
5451       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5452       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5453         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5454         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5455         if(internal)
5456           assem_debug("branch: internal\n");
5457         else
5458           assem_debug("branch: external\n");
5459         if(internal&&is_ds[(ba[i]-start)>>2]) {
5460           ds_assemble_entry(i);
5461         }
5462         else {
5463           add_to_linker((int)out,ba[i],internal);
5464           emit_jmp(0);
5465         }
5466         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5467         if(((u_int)out)&7) emit_addnop(0);
5468         #endif
5469       }
5470     }
5471     else if(nop) {
5472       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5473       int jaddr=(int)out;
5474       emit_jns(0);
5475       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5476     }
5477     else {
5478       int taken=0,nottaken=0,nottaken1=0;
5479       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5480       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5481       if(!only32)
5482       {
5483         assert(s1h>=0);
5484         if(opcode[i]==4) // BEQ
5485         {
5486           if(s2h>=0) emit_cmp(s1h,s2h);
5487           else emit_test(s1h,s1h);
5488           nottaken1=(int)out;
5489           emit_jne(1);
5490         }
5491         if(opcode[i]==5) // BNE
5492         {
5493           if(s2h>=0) emit_cmp(s1h,s2h);
5494           else emit_test(s1h,s1h);
5495           if(invert) taken=(int)out;
5496           else add_to_linker((int)out,ba[i],internal);
5497           emit_jne(0);
5498         }
5499         if(opcode[i]==6) // BLEZ
5500         {
5501           emit_test(s1h,s1h);
5502           if(invert) taken=(int)out;
5503           else add_to_linker((int)out,ba[i],internal);
5504           emit_js(0);
5505           nottaken1=(int)out;
5506           emit_jne(1);
5507         }
5508         if(opcode[i]==7) // BGTZ
5509         {
5510           emit_test(s1h,s1h);
5511           nottaken1=(int)out;
5512           emit_js(1);
5513           if(invert) taken=(int)out;
5514           else add_to_linker((int)out,ba[i],internal);
5515           emit_jne(0);
5516         }
5517       } // if(!only32)
5518           
5519       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5520       assert(s1l>=0);
5521       if(opcode[i]==4) // BEQ
5522       {
5523         if(s2l>=0) emit_cmp(s1l,s2l);
5524         else emit_test(s1l,s1l);
5525         if(invert){
5526           nottaken=(int)out;
5527           emit_jne(1);
5528         }else{
5529           add_to_linker((int)out,ba[i],internal);
5530           emit_jeq(0);
5531         }
5532       }
5533       if(opcode[i]==5) // BNE
5534       {
5535         if(s2l>=0) emit_cmp(s1l,s2l);
5536         else emit_test(s1l,s1l);
5537         if(invert){
5538           nottaken=(int)out;
5539           emit_jeq(1);
5540         }else{
5541           add_to_linker((int)out,ba[i],internal);
5542           emit_jne(0);
5543         }
5544       }
5545       if(opcode[i]==6) // BLEZ
5546       {
5547         emit_cmpimm(s1l,1);
5548         if(invert){
5549           nottaken=(int)out;
5550           emit_jge(1);
5551         }else{
5552           add_to_linker((int)out,ba[i],internal);
5553           emit_jl(0);
5554         }
5555       }
5556       if(opcode[i]==7) // BGTZ
5557       {
5558         emit_cmpimm(s1l,1);
5559         if(invert){
5560           nottaken=(int)out;
5561           emit_jl(1);
5562         }else{
5563           add_to_linker((int)out,ba[i],internal);
5564           emit_jge(0);
5565         }
5566       }
5567       if(invert) {
5568         if(taken) set_jump_target(taken,(int)out);
5569         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5570         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5571           if(adj) {
5572             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5573             add_to_linker((int)out,ba[i],internal);
5574           }else{
5575             emit_addnop(13);
5576             add_to_linker((int)out,ba[i],internal*2);
5577           }
5578           emit_jmp(0);
5579         }else
5580         #endif
5581         {
5582           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5583           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5584           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5585           if(internal)
5586             assem_debug("branch: internal\n");
5587           else
5588             assem_debug("branch: external\n");
5589           if(internal&&is_ds[(ba[i]-start)>>2]) {
5590             ds_assemble_entry(i);
5591           }
5592           else {
5593             add_to_linker((int)out,ba[i],internal);
5594             emit_jmp(0);
5595           }
5596         }
5597         set_jump_target(nottaken,(int)out);
5598       }
5599
5600       if(nottaken1) set_jump_target(nottaken1,(int)out);
5601       if(adj) {
5602         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5603       }
5604     } // (!unconditional)
5605   } // if(ooo)
5606   else
5607   {
5608     // In-order execution (branch first)
5609     //if(likely[i]) printf("IOL\n");
5610     //else
5611     //printf("IOE\n");
5612     int taken=0,nottaken=0,nottaken1=0;
5613     if(!unconditional&&!nop) {
5614       if(!only32)
5615       {
5616         assert(s1h>=0);
5617         if((opcode[i]&0x2f)==4) // BEQ
5618         {
5619           if(s2h>=0) emit_cmp(s1h,s2h);
5620           else emit_test(s1h,s1h);
5621           nottaken1=(int)out;
5622           emit_jne(2);
5623         }
5624         if((opcode[i]&0x2f)==5) // BNE
5625         {
5626           if(s2h>=0) emit_cmp(s1h,s2h);
5627           else emit_test(s1h,s1h);
5628           taken=(int)out;
5629           emit_jne(1);
5630         }
5631         if((opcode[i]&0x2f)==6) // BLEZ
5632         {
5633           emit_test(s1h,s1h);
5634           taken=(int)out;
5635           emit_js(1);
5636           nottaken1=(int)out;
5637           emit_jne(2);
5638         }
5639         if((opcode[i]&0x2f)==7) // BGTZ
5640         {
5641           emit_test(s1h,s1h);
5642           nottaken1=(int)out;
5643           emit_js(2);
5644           taken=(int)out;
5645           emit_jne(1);
5646         }
5647       } // if(!only32)
5648           
5649       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5650       assert(s1l>=0);
5651       if((opcode[i]&0x2f)==4) // BEQ
5652       {
5653         if(s2l>=0) emit_cmp(s1l,s2l);
5654         else emit_test(s1l,s1l);
5655         nottaken=(int)out;
5656         emit_jne(2);
5657       }
5658       if((opcode[i]&0x2f)==5) // BNE
5659       {
5660         if(s2l>=0) emit_cmp(s1l,s2l);
5661         else emit_test(s1l,s1l);
5662         nottaken=(int)out;
5663         emit_jeq(2);
5664       }
5665       if((opcode[i]&0x2f)==6) // BLEZ
5666       {
5667         emit_cmpimm(s1l,1);
5668         nottaken=(int)out;
5669         emit_jge(2);
5670       }
5671       if((opcode[i]&0x2f)==7) // BGTZ
5672       {
5673         emit_cmpimm(s1l,1);
5674         nottaken=(int)out;
5675         emit_jl(2);
5676       }
5677     } // if(!unconditional)
5678     int adj;
5679     uint64_t ds_unneeded=branch_regs[i].u;
5680     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5681     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5682     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5683     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5684     ds_unneeded|=1;
5685     ds_unneeded_upper|=1;
5686     // branch taken
5687     if(!nop) {
5688       if(taken) set_jump_target(taken,(int)out);
5689       assem_debug("1:\n");
5690       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5691                     ds_unneeded,ds_unneeded_upper);
5692       // load regs
5693       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5694       address_generation(i+1,&branch_regs[i],0);
5695       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5696       ds_assemble(i+1,&branch_regs[i]);
5697       cc=get_reg(branch_regs[i].regmap,CCREG);
5698       if(cc==-1) {
5699         emit_loadreg(CCREG,cc=HOST_CCREG);
5700         // CHECK: Is the following instruction (fall thru) allocated ok?
5701       }
5702       assert(cc==HOST_CCREG);
5703       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5704       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5705       assem_debug("cycle count (adj)\n");
5706       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5707       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5708       if(internal)
5709         assem_debug("branch: internal\n");
5710       else
5711         assem_debug("branch: external\n");
5712       if(internal&&is_ds[(ba[i]-start)>>2]) {
5713         ds_assemble_entry(i);
5714       }
5715       else {
5716         add_to_linker((int)out,ba[i],internal);
5717         emit_jmp(0);
5718       }
5719     }
5720     // branch not taken
5721     cop1_usable=prev_cop1_usable;
5722     if(!unconditional) {
5723       if(nottaken1) set_jump_target(nottaken1,(int)out);
5724       set_jump_target(nottaken,(int)out);
5725       assem_debug("2:\n");
5726       if(!likely[i]) {
5727         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5728                       ds_unneeded,ds_unneeded_upper);
5729         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5730         address_generation(i+1,&branch_regs[i],0);
5731         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5732         ds_assemble(i+1,&branch_regs[i]);
5733       }
5734       cc=get_reg(branch_regs[i].regmap,CCREG);
5735       if(cc==-1&&!likely[i]) {
5736         // Cycle count isn't in a register, temporarily load it then write it out
5737         emit_loadreg(CCREG,HOST_CCREG);
5738         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5739         int jaddr=(int)out;
5740         emit_jns(0);
5741         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5742         emit_storereg(CCREG,HOST_CCREG);
5743       }
5744       else{
5745         cc=get_reg(i_regmap,CCREG);
5746         assert(cc==HOST_CCREG);
5747         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5748         int jaddr=(int)out;
5749         emit_jns(0);
5750         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5751       }
5752     }
5753   }
5754 }
5755
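// Assemble a REGIMM branch on the sign of a register (BLTZ/BGEZ and their
// AL/"likely" variants).  The BxxAL forms write the return address to $ra
// even when the branch is not taken.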
5756 void sjump_assemble(int i,struct regstat *i_regs)
5757 {
5758   signed char *i_regmap=i_regs->regmap;
5759   int cc;
5760   int match;
5761   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5762   assem_debug("smatch=%d\n",match);
5763   int s1h,s1l;
5764   int prev_cop1_usable=cop1_usable;
5765   int unconditional=0,nevertaken=0;
5766   int only32=0;
5767   int invert=0;
5768   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5769   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5770   if(!match) invert=1;
5771   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5772   if(i>(ba[i]-start)>>2) invert=1;
5773   #endif
5774
5775   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5776   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5777
5778   if(ooo[i]) {
5779     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5780     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5781   }
5782   else {
5783     s1l=get_reg(i_regmap,rs1[i]);
5784     s1h=get_reg(i_regmap,rs1[i]|64);
5785   }
5786   if(rs1[i]==0)
5787   {
5788     if(opcode2[i]&1) unconditional=1;
5789     else nevertaken=1;
5790     // These are never taken (r0 is never less than zero)
5791     //assert(opcode2[i]!=0);
5792     //assert(opcode2[i]!=2);
5793     //assert(opcode2[i]!=0x10);
5794     //assert(opcode2[i]!=0x12);
5795   }
5796   else {
5797     only32=(regs[i].was32>>rs1[i])&1;
5798   }
5799
5800   if(ooo[i]) {
5801     // Out of order execution (delay slot first)
5802     //printf("OOOE\n");
5803     address_generation(i+1,i_regs,regs[i].regmap_entry);
5804     ds_assemble(i+1,i_regs);
5805     int adj;
5806     uint64_t bc_unneeded=branch_regs[i].u;
5807     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5808     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5809     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5810     bc_unneeded|=1;
5811     bc_unneeded_upper|=1;
5812     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5813                   bc_unneeded,bc_unneeded_upper);
5814     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5815     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5816     if(rt1[i]==31) {
5817       int rt,return_address;
5818       rt=get_reg(branch_regs[i].regmap,31);
5819       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5820       if(rt>=0) {
5821         // Save the PC even if the branch is not taken
5822         return_address=start+i*4+8;
5823         emit_movimm(return_address,rt); // PC into link register
5824         #ifdef IMM_PREFETCH
5825         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5826         #endif
5827       }
5828     }
5829     cc=get_reg(branch_regs[i].regmap,CCREG);
5830     assert(cc==HOST_CCREG);
5831     if(unconditional) 
5832       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5833     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5834     assem_debug("cycle count (adj)\n");
5835     if(unconditional) {
5836       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5837       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5838         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5839         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5840         if(internal)
5841           assem_debug("branch: internal\n");
5842         else
5843           assem_debug("branch: external\n");
5844         if(internal&&is_ds[(ba[i]-start)>>2]) {
5845           ds_assemble_entry(i);
5846         }
5847         else {
5848           add_to_linker((int)out,ba[i],internal);
5849           emit_jmp(0);
5850         }
5851         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5852         if(((u_int)out)&7) emit_addnop(0);
5853         #endif
5854       }
5855     }
5856     else if(nevertaken) {
5857       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5858       int jaddr=(int)out;
5859       emit_jns(0);
5860       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5861     }
5862     else {
5863       int nottaken=0;
5864       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5865       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5866       if(!only32)
5867       {
5868         assert(s1h>=0);
5869         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5870         {
5871           emit_test(s1h,s1h);
5872           if(invert){
5873             nottaken=(int)out;
5874             emit_jns(1);
5875           }else{
5876             add_to_linker((int)out,ba[i],internal);
5877             emit_js(0);
5878           }
5879         }
5880         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5881         {
5882           emit_test(s1h,s1h);
5883           if(invert){
5884             nottaken=(int)out;
5885             emit_js(1);
5886           }else{
5887             add_to_linker((int)out,ba[i],internal);
5888             emit_jns(0);
5889           }
5890         }
5891       } // if(!only32)
5892       else
5893       {
5894         assert(s1l>=0);
5895         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5896         {
5897           emit_test(s1l,s1l);
5898           if(invert){
5899             nottaken=(int)out;
5900             emit_jns(1);
5901           }else{
5902             add_to_linker((int)out,ba[i],internal);
5903             emit_js(0);
5904           }
5905         }
5906         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5907         {
5908           emit_test(s1l,s1l);
5909           if(invert){
5910             nottaken=(int)out;
5911             emit_js(1);
5912           }else{
5913             add_to_linker((int)out,ba[i],internal);
5914             emit_jns(0);
5915           }
5916         }
5917       } // if(!only32)
5918           
5919       if(invert) {
5920         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5921         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5922           if(adj) {
5923             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5924             add_to_linker((int)out,ba[i],internal);
5925           }else{
5926             emit_addnop(13);
5927             add_to_linker((int)out,ba[i],internal*2);
5928           }
5929           emit_jmp(0);
5930         }else
5931         #endif
5932         {
5933           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5934           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5935           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5936           if(internal)
5937             assem_debug("branch: internal\n");
5938           else
5939             assem_debug("branch: external\n");
5940           if(internal&&is_ds[(ba[i]-start)>>2]) {
5941             ds_assemble_entry(i);
5942           }
5943           else {
5944             add_to_linker((int)out,ba[i],internal);
5945             emit_jmp(0);
5946           }
5947         }
5948         set_jump_target(nottaken,(int)out);
5949       }
5950
5951       if(adj) {
5952         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5953       }
5954     } // (!unconditional)
5955   } // if(ooo)
5956   else
5957   {
5958     // In-order execution (branch first)
5959     //printf("IOE\n");
5960     int nottaken=0;
5961     if(rt1[i]==31) {
5962       int rt,return_address;
5963       rt=get_reg(branch_regs[i].regmap,31);
5964       if(rt>=0) {
5965         // Save the PC even if the branch is not taken
5966         return_address=start+i*4+8;
5967         emit_movimm(return_address,rt); // PC into link register
5968         #ifdef IMM_PREFETCH
5969         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5970         #endif
5971       }
5972     }
5973     if(!unconditional) {
5974       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5975       if(!only32)
5976       {
5977         assert(s1h>=0);
5978         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5979         {
5980           emit_test(s1h,s1h);
5981           nottaken=(int)out;
5982           emit_jns(1);
5983         }
5984         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5985         {
5986           emit_test(s1h,s1h);
5987           nottaken=(int)out;
5988           emit_js(1);
5989         }
5990       } // if(!only32)
5991       else
5992       {
5993         assert(s1l>=0);
5994         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5995         {
5996           emit_test(s1l,s1l);
5997           nottaken=(int)out;
5998           emit_jns(1);
5999         }
6000         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6001         {
6002           emit_test(s1l,s1l);
6003           nottaken=(int)out;
6004           emit_js(1);
6005         }
6006       }
6007     } // if(!unconditional)
6008     int adj;
6009     uint64_t ds_unneeded=branch_regs[i].u;
6010     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6011     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6012     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6013     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6014     ds_unneeded|=1;
6015     ds_unneeded_upper|=1;
6016     // branch taken
6017     if(!nevertaken) {
6018       //assem_debug("1:\n");
6019       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6020                     ds_unneeded,ds_unneeded_upper);
6021       // load regs
6022       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6023       address_generation(i+1,&branch_regs[i],0);
6024       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6025       ds_assemble(i+1,&branch_regs[i]);
6026       cc=get_reg(branch_regs[i].regmap,CCREG);
6027       if(cc==-1) {
6028         emit_loadreg(CCREG,cc=HOST_CCREG);
6029         // CHECK: Is the following instruction (fall thru) allocated ok?
6030       }
6031       assert(cc==HOST_CCREG);
6032       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6033       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6034       assem_debug("cycle count (adj)\n");
6035       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6036       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6037       if(internal)
6038         assem_debug("branch: internal\n");
6039       else
6040         assem_debug("branch: external\n");
6041       if(internal&&is_ds[(ba[i]-start)>>2]) {
6042         ds_assemble_entry(i);
6043       }
6044       else {
6045         add_to_linker((int)out,ba[i],internal);
6046         emit_jmp(0);
6047       }
6048     }
6049     // branch not taken
6050     cop1_usable=prev_cop1_usable;
6051     if(!unconditional) {
6052       set_jump_target(nottaken,(int)out);
6053       assem_debug("1:\n");
6054       if(!likely[i]) {
6055         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6056                       ds_unneeded,ds_unneeded_upper);
6057         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6058         address_generation(i+1,&branch_regs[i],0);
6059         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6060         ds_assemble(i+1,&branch_regs[i]);
6061       }
6062       cc=get_reg(branch_regs[i].regmap,CCREG);
6063       if(cc==-1&&!likely[i]) {
6064         // Cycle count isn't in a register, temporarily load it then write it out
6065         emit_loadreg(CCREG,HOST_CCREG);
6066         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6067         int jaddr=(int)out;
6068         emit_jns(0);
6069         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6070         emit_storereg(CCREG,HOST_CCREG);
6071       }
6072       else{
6073         cc=get_reg(i_regmap,CCREG);
6074         assert(cc==HOST_CCREG);
6075         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
6076         int jaddr=(int)out;
6077         emit_jns(0);
6078         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6079       }
6080     }
6081   }
6082 }
6083
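// Assemble a COP1 condition branch (BC1T/BC1F): emit the coprocessor-unusable
// check (FP_STUB) if needed, test the FP condition bit held in FSREG, and
// branch like the other conditional branches.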
6084 void fjump_assemble(int i,struct regstat *i_regs)
6085 {
6086   signed char *i_regmap=i_regs->regmap;
6087   int cc;
6088   int match;
6089   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6090   assem_debug("fmatch=%d\n",match);
6091   int fs,cs;
6092   int eaddr;
6093   int invert=0;
6094   int internal=internal_branch(branch_regs[i].is32,ba[i]);
6095   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
6096   if(!match) invert=1;
6097   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6098   if(i>(ba[i]-start)>>2) invert=1;
6099   #endif
6100
6101   if(ooo[i]) {
6102     fs=get_reg(branch_regs[i].regmap,FSREG);
6103     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
6104   }
6105   else {
6106     fs=get_reg(i_regmap,FSREG);
6107   }
6108
6109   // Check cop1 unusable
6110   if(!cop1_usable) {
6111     cs=get_reg(i_regmap,CSREG);
6112     assert(cs>=0);
6113     emit_testimm(cs,0x20000000);
6114     eaddr=(int)out;
6115     emit_jeq(0);
6116     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
6117     cop1_usable=1;
6118   }
6119
6120   if(ooo[i]) {
6121     // Out of order execution (delay slot first)
6122     //printf("OOOE\n");
6123     ds_assemble(i+1,i_regs);
6124     int adj;
6125     uint64_t bc_unneeded=branch_regs[i].u;
6126     uint64_t bc_unneeded_upper=branch_regs[i].uu;
6127     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6128     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
6129     bc_unneeded|=1;
6130     bc_unneeded_upper|=1;
6131     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6132                   bc_unneeded,bc_unneeded_upper);
6133     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
6134     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6135     cc=get_reg(branch_regs[i].regmap,CCREG);
6136     assert(cc==HOST_CCREG);
6137     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
6138     assem_debug("cycle count (adj)\n");
6139     if(1) {
6140       int nottaken=0;
6141       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6142       if(1) {
6143         assert(fs>=0);
6144         emit_testimm(fs,0x800000);
6145         if(source[i]&0x10000) // BC1T
6146         {
6147           if(invert){
6148             nottaken=(int)out;
6149             emit_jeq(1);
6150           }else{
6151             add_to_linker((int)out,ba[i],internal);
6152             emit_jne(0);
6153           }
6154         }
6155         else // BC1F
6156         {
6157           if(invert){
6158             nottaken=(int)out;
6159             emit_jne(1);
6160           }else{
6161             add_to_linker((int)out,ba[i],internal);
6162             emit_jeq(0);
6163           }
6164         }
6165       } // if(!only32)
6166           
6167       if(invert) {
6168         if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
6169         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6170         else if(match) emit_addnop(13);
6171         #endif
6172         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6173         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6174         if(internal)
6175           assem_debug("branch: internal\n");
6176         else
6177           assem_debug("branch: external\n");
6178         if(internal&&is_ds[(ba[i]-start)>>2]) {
6179           ds_assemble_entry(i);
6180         }
6181         else {
6182           add_to_linker((int)out,ba[i],internal);
6183           emit_jmp(0);
6184         }
6185         set_jump_target(nottaken,(int)out);
6186       }
6187
6188       if(adj) {
6189         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
6190       }
6191     } // (!unconditional)
6192   } // if(ooo)
6193   else
6194   {
6195     // In-order execution (branch first)
6196     //printf("IOE\n");
6197     int nottaken=0;
6198     if(1) {
6199       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6200       if(1) {
6201         assert(fs>=0);
6202         emit_testimm(fs,0x800000);
6203         if(source[i]&0x10000) // BC1T
6204         {
6205           nottaken=(int)out;
6206           emit_jeq(1);
6207         }
6208         else // BC1F
6209         {
6210           nottaken=(int)out;
6211           emit_jne(1);
6212         }
6213       }
6214     } // if(!unconditional)
6215     int adj;
6216     uint64_t ds_unneeded=branch_regs[i].u;
6217     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6218     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6219     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6220     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6221     ds_unneeded|=1;
6222     ds_unneeded_upper|=1;
6223     // branch taken
6224     //assem_debug("1:\n");
6225     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6226                   ds_unneeded,ds_unneeded_upper);
6227     // load regs
6228     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6229     address_generation(i+1,&branch_regs[i],0);
6230     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6231     ds_assemble(i+1,&branch_regs[i]);
6232     cc=get_reg(branch_regs[i].regmap,CCREG);
6233     if(cc==-1) {
6234       emit_loadreg(CCREG,cc=HOST_CCREG);
6235       // CHECK: Is the following instruction (fall thru) allocated ok?
6236     }
6237     assert(cc==HOST_CCREG);
6238     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6239     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6240     assem_debug("cycle count (adj)\n");
6241     if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6242     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6243     if(internal)
6244       assem_debug("branch: internal\n");
6245     else
6246       assem_debug("branch: external\n");
6247     if(internal&&is_ds[(ba[i]-start)>>2]) {
6248       ds_assemble_entry(i);
6249     }
6250     else {
6251       add_to_linker((int)out,ba[i],internal);
6252       emit_jmp(0);
6253     }
6254
6255     // branch not taken
6256     if(1) { // <- FIXME (don't need this)
6257       set_jump_target(nottaken,(int)out);
6258       assem_debug("1:\n");
6259       if(!likely[i]) {
6260         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6261                       ds_unneeded,ds_unneeded_upper);
6262         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6263         address_generation(i+1,&branch_regs[i],0);
6264         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6265         ds_assemble(i+1,&branch_regs[i]);
6266       }
6267       cc=get_reg(branch_regs[i].regmap,CCREG);
6268       if(cc==-1&&!likely[i]) {
6269         // Cycle count isn't in a register, temporarily load it then write it out
6270         emit_loadreg(CCREG,HOST_CCREG);
6271         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6272         int jaddr=(int)out;
6273         emit_jns(0);
6274         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6275         emit_storereg(CCREG,HOST_CCREG);
6276       }
6277       else{
6278         cc=get_reg(i_regmap,CCREG);
6279         assert(cc==HOST_CCREG);
6280         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
6281         int jaddr=(int)out;
6282         emit_jns(0);
6283         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6284       }
6285     }
6286   }
6287 }
6288
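// Assemble a branch whose delay slot falls on the next virtual page.  The
// branch target is computed into HOST_BTREG and the code jumps to a
// separately compiled copy of the delay slot (see pagespan_ds below).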
6289 static void pagespan_assemble(int i,struct regstat *i_regs)
6290 {
6291   int s1l=get_reg(i_regs->regmap,rs1[i]);
6292   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
6293   int s2l=get_reg(i_regs->regmap,rs2[i]);
6294   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
6295   void *nt_branch=NULL;
6296   int taken=0;
6297   int nottaken=0;
6298   int unconditional=0;
6299   if(rs1[i]==0)
6300   {
6301     s1l=s2l;s1h=s2h;
6302     s2l=s2h=-1;
6303   }
6304   else if(rs2[i]==0)
6305   {
6306     s2l=s2h=-1;
6307   }
6308   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
6309     s1h=s2h=-1;
6310   }
6311   int hr=0;
6312   int addr,alt,ntaddr;
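  // Pick host scratch registers to build the possible target addresses in:
  // addr receives the chosen target, alt the alternative, and ntaddr an extra
  // temporary for BLEZ/BGTZ.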
6313   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
6314   else {
6315     while(hr<HOST_REGS)
6316     {
6317       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
6318          (i_regs->regmap[hr]&63)!=rs1[i] &&
6319          (i_regs->regmap[hr]&63)!=rs2[i] )
6320       {
6321         addr=hr++;break;
6322       }
6323       hr++;
6324     }
6325   }
6326   while(hr<HOST_REGS)
6327   {
6328     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6329        (i_regs->regmap[hr]&63)!=rs1[i] &&
6330        (i_regs->regmap[hr]&63)!=rs2[i] )
6331     {
6332       alt=hr++;break;
6333     }
6334     hr++;
6335   }
6336   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
6337   {
6338     while(hr<HOST_REGS)
6339     {
6340       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6341          (i_regs->regmap[hr]&63)!=rs1[i] &&
6342          (i_regs->regmap[hr]&63)!=rs2[i] )
6343       {
6344         ntaddr=hr;break;
6345       }
6346       hr++;
6347     }
6348   }
6349   assert(hr<HOST_REGS);
6350   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6351     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6352   }
6353   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6354   if(opcode[i]==2) // J
6355   {
6356     unconditional=1;
6357   }
6358   if(opcode[i]==3) // JAL
6359   {
6360     // TODO: mini_ht
6361     int rt=get_reg(i_regs->regmap,31);
6362     emit_movimm(start+i*4+8,rt);
6363     unconditional=1;
6364   }
6365   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6366   {
6367     emit_mov(s1l,addr);
6368     if(opcode2[i]==9) // JALR
6369     {
6370       int rt=get_reg(i_regs->regmap,rt1[i]);
6371       emit_movimm(start+i*4+8,rt);
6372     }
6373   }
6374   if((opcode[i]&0x3f)==4) // BEQ
6375   {
6376     if(rs1[i]==rs2[i])
6377     {
6378       unconditional=1;
6379     }
6380     else
6381     #ifdef HAVE_CMOV_IMM
6382     if(s1h<0) {
6383       if(s2l>=0) emit_cmp(s1l,s2l);
6384       else emit_test(s1l,s1l);
6385       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6386     }
6387     else
6388     #endif
6389     {
6390       assert(s1l>=0);
6391       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6392       if(s1h>=0) {
6393         if(s2h>=0) emit_cmp(s1h,s2h);
6394         else emit_test(s1h,s1h);
6395         emit_cmovne_reg(alt,addr);
6396       }
6397       if(s2l>=0) emit_cmp(s1l,s2l);
6398       else emit_test(s1l,s1l);
6399       emit_cmovne_reg(alt,addr);
6400     }
6401   }
6402   if((opcode[i]&0x3f)==5) // BNE
6403   {
6404     #ifdef HAVE_CMOV_IMM
6405     if(s1h<0) {
6406       if(s2l>=0) emit_cmp(s1l,s2l);
6407       else emit_test(s1l,s1l);
6408       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6409     }
6410     else
6411     #endif
6412     {
6413       assert(s1l>=0);
6414       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6415       if(s1h>=0) {
6416         if(s2h>=0) emit_cmp(s1h,s2h);
6417         else emit_test(s1h,s1h);
6418         emit_cmovne_reg(alt,addr);
6419       }
6420       if(s2l>=0) emit_cmp(s1l,s2l);
6421       else emit_test(s1l,s1l);
6422       emit_cmovne_reg(alt,addr);
6423     }
6424   }
6425   if((opcode[i]&0x3f)==0x14) // BEQL
6426   {
6427     if(s1h>=0) {
6428       if(s2h>=0) emit_cmp(s1h,s2h);
6429       else emit_test(s1h,s1h);
6430       nottaken=(int)out;
6431       emit_jne(0);
6432     }
6433     if(s2l>=0) emit_cmp(s1l,s2l);
6434     else emit_test(s1l,s1l);
6435     if(nottaken) set_jump_target(nottaken,(int)out);
6436     nottaken=(int)out;
6437     emit_jne(0);
6438   }
6439   if((opcode[i]&0x3f)==0x15) // BNEL
6440   {
6441     if(s1h>=0) {
6442       if(s2h>=0) emit_cmp(s1h,s2h);
6443       else emit_test(s1h,s1h);
6444       taken=(int)out;
6445       emit_jne(0);
6446     }
6447     if(s2l>=0) emit_cmp(s1l,s2l);
6448     else emit_test(s1l,s1l);
6449     nottaken=(int)out;
6450     emit_jeq(0);
6451     if(taken) set_jump_target(taken,(int)out);
6452   }
6453   if((opcode[i]&0x3f)==6) // BLEZ
6454   {
6455     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6456     emit_cmpimm(s1l,1);
6457     if(s1h>=0) emit_mov(addr,ntaddr);
6458     emit_cmovl_reg(alt,addr);
6459     if(s1h>=0) {
6460       emit_test(s1h,s1h);
6461       emit_cmovne_reg(ntaddr,addr);
6462       emit_cmovs_reg(alt,addr);
6463     }
6464   }
6465   if((opcode[i]&0x3f)==7) // BGTZ
6466   {
6467     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6468     emit_cmpimm(s1l,1);
6469     if(s1h>=0) emit_mov(addr,alt);
6470     emit_cmovl_reg(ntaddr,addr);
6471     if(s1h>=0) {
6472       emit_test(s1h,s1h);
6473       emit_cmovne_reg(alt,addr);
6474       emit_cmovs_reg(ntaddr,addr);
6475     }
6476   }
6477   if((opcode[i]&0x3f)==0x16) // BLEZL
6478   {
6479     assert((opcode[i]&0x3f)!=0x16);
6480   }
6481   if((opcode[i]&0x3f)==0x17) // BGTZL
6482   {
6483     assert((opcode[i]&0x3f)!=0x17);
6484   }
6485   assert(opcode[i]!=1); // BLTZ/BGEZ
6486
6487   //FIXME: Check CSREG
6488   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6489     if((source[i]&0x30000)==0) // BC1F
6490     {
6491       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6492       emit_testimm(s1l,0x800000);
6493       emit_cmovne_reg(alt,addr);
6494     }
6495     if((source[i]&0x30000)==0x10000) // BC1T
6496     {
6497       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6498       emit_testimm(s1l,0x800000);
6499       emit_cmovne_reg(alt,addr);
6500     }
6501     if((source[i]&0x30000)==0x20000) // BC1FL
6502     {
6503       emit_testimm(s1l,0x800000);
6504       nottaken=(int)out;
6505       emit_jne(0);
6506     }
6507     if((source[i]&0x30000)==0x30000) // BC1TL
6508     {
6509       emit_testimm(s1l,0x800000);
6510       nottaken=(int)out;
6511       emit_jeq(0);
6512     }
6513   }
6514
6515   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6516   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6517   if(likely[i]||unconditional)
6518   {
6519     emit_movimm(ba[i],HOST_BTREG);
6520   }
6521   else if(addr!=HOST_BTREG)
6522   {
6523     emit_mov(addr,HOST_BTREG);
6524   }
6525   void *branch_addr=out;
6526   emit_jmp(0);
6527   int target_addr=start+i*4+5;
6528   void *stub=out;
6529   void *compiled_target_addr=check_addr(target_addr);
6530   emit_extjump_ds((int)branch_addr,target_addr);
6531   if(compiled_target_addr) {
6532     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6533     add_link(target_addr,stub);
6534   }
6535   else set_jump_target((int)branch_addr,(int)stub);
6536   if(likely[i]) {
6537     // Not-taken path
6538     set_jump_target((int)nottaken,(int)out);
6539     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6540     void *branch_addr=out;
6541     emit_jmp(0);
6542     int target_addr=start+i*4+8;
6543     void *stub=out;
6544     void *compiled_target_addr=check_addr(target_addr);
6545     emit_extjump_ds((int)branch_addr,target_addr);
6546     if(compiled_target_addr) {
6547       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6548       add_link(target_addr,stub);
6549     }
6550     else set_jump_target((int)branch_addr,(int)stub);
6551   }
6552 }
6553
6554 // Assemble the delay slot for the above
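// It runs when a block is entered at the delay slot of a page-spanning
// branch: the slot instruction is assembled, then control continues at the
// branch target kept in branch_target/BTREG, or falls through to start+4.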
6555 static void pagespan_ds()
6556 {
6557   assem_debug("initial delay slot:\n");
6558   u_int vaddr=start+1;
6559   u_int page=get_page(vaddr);
6560   u_int vpage=get_vpage(vaddr);
6561   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6562   do_dirty_stub_ds();
6563   ll_add(jump_in+page,vaddr,(void *)out);
6564   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6565   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6566     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6567   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6568     emit_writeword(HOST_BTREG,(int)&branch_target);
6569   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6570   address_generation(0,&regs[0],regs[0].regmap_entry);
6571   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6572     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6573   cop1_usable=0;
6574   is_delayslot=0;
6575   switch(itype[0]) {
6576     case ALU:
6577       alu_assemble(0,&regs[0]);break;
6578     case IMM16:
6579       imm16_assemble(0,&regs[0]);break;
6580     case SHIFT:
6581       shift_assemble(0,&regs[0]);break;
6582     case SHIFTIMM:
6583       shiftimm_assemble(0,&regs[0]);break;
6584     case LOAD:
6585       load_assemble(0,&regs[0]);break;
6586     case LOADLR:
6587       loadlr_assemble(0,&regs[0]);break;
6588     case STORE:
6589       store_assemble(0,&regs[0]);break;
6590     case STORELR:
6591       storelr_assemble(0,&regs[0]);break;
6592     case COP0:
6593       cop0_assemble(0,&regs[0]);break;
6594     case COP1:
6595       cop1_assemble(0,&regs[0]);break;
6596     case C1LS:
6597       c1ls_assemble(0,&regs[0]);break;
6598     case COP2:
6599       cop2_assemble(0,&regs[0]);break;
6600     case C2LS:
6601       c2ls_assemble(0,&regs[0]);break;
6602     case C2OP:
6603       c2op_assemble(0,&regs[0]);break;
6604     case FCONV:
6605       fconv_assemble(0,&regs[0]);break;
6606     case FLOAT:
6607       float_assemble(0,&regs[0]);break;
6608     case FCOMP:
6609       fcomp_assemble(0,&regs[0]);break;
6610     case MULTDIV:
6611       multdiv_assemble(0,&regs[0]);break;
6612     case MOV:
6613       mov_assemble(0,&regs[0]);break;
6614     case SYSCALL:
6615     case HLECALL:
6616     case INTCALL:
6617     case SPAN:
6618     case UJUMP:
6619     case RJUMP:
6620     case CJUMP:
6621     case SJUMP:
6622     case FJUMP:
6623       printf("Jump in the delay slot.  This is probably a bug.\n");
6624   }
6625   int btaddr=get_reg(regs[0].regmap,BTREG);
6626   if(btaddr<0) {
6627     btaddr=get_reg(regs[0].regmap,-1);
6628     emit_readword((int)&branch_target,btaddr);
6629   }
6630   assert(btaddr!=HOST_CCREG);
6631   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6632 #ifdef HOST_IMM8
6633   emit_movimm(start+4,HOST_TEMPREG);
6634   emit_cmp(btaddr,HOST_TEMPREG);
6635 #else
6636   emit_cmpimm(btaddr,start+4);
6637 #endif
6638   int branch=(int)out;
6639   emit_jeq(0);
6640   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6641   emit_jmp(jump_vaddr_reg[btaddr]);
6642   set_jump_target(branch,(int)out);
6643   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6644   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6645 }
6646
6647 // Basic liveness analysis for MIPS registers
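     // Walks the block backwards computing, per instruction, a bitmask of
     // MIPS registers whose current value is no longer needed (bit r set =
     // register r unneeded; bit 0 is always set since $zero never needs to
     // be kept).  unneeded_reg[] covers the 32-bit values, unneeded_reg_upper[]
     // the upper halves.  A register written by an instruction is unneeded
     // before it; a register it reads is needed.  For example, after scanning
     // "addu $v0,$a0,$a1" bit 2 ($v0) is set and bits 4/5 ($a0,$a1) are cleared.
     // Branches additionally merge in their delay slot and, for internal
     // branches, the mask at the target; recursion into backward branches is
     // depth-limited (r<2) to keep compile time reasonable.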
6648 void unneeded_registers(int istart,int iend,int r)
6649 {
6650   int i;
6651   uint64_t u,uu,b,bu;
6652   uint64_t temp_u,temp_uu;
6653   uint64_t tdep;
6654   if(iend==slen-1) {
6655     u=1;uu=1;
6656   }else{
6657     u=unneeded_reg[iend+1];
6658     uu=unneeded_reg_upper[iend+1];
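         // Note: the values read above are immediately overridden below;
         // the exit state is reset so that only $zero counts as unneeded.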
6659     u=1;uu=1;
6660   }
6661   for (i=iend;i>=istart;i--)
6662   {
6663     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6664     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6665     {
6666       // If subroutine call, flag return address as a possible branch target
6667       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6668       
6669       if(ba[i]<start || ba[i]>=(start+slen*4))
6670       {
6671         // Branch out of this block, flush all regs
6672         u=1;
6673         uu=1;
6674         /* Hexagon hack 
6675         if(itype[i]==UJUMP&&rt1[i]==31)
6676         {
6677           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6678         }
6679         if(itype[i]==RJUMP&&rs1[i]==31)
6680         {
6681           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6682         }
6683         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6684           if(itype[i]==UJUMP&&rt1[i]==31)
6685           {
6686             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6687             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6688           }
6689           if(itype[i]==RJUMP&&rs1[i]==31)
6690           {
6691             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6692             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6693           }
6694         }*/
6695         branch_unneeded_reg[i]=u;
6696         branch_unneeded_reg_upper[i]=uu;
6697         // Merge in delay slot
6698         tdep=(~uu>>rt1[i+1])&1;
6699         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6700         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6701         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6702         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6703         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6704         u|=1;uu|=1;
6705         // If branch is "likely" (and conditional)
6706         // then we skip the delay slot on the fall-thru path
6707         if(likely[i]) {
6708           if(i<slen-1) {
6709             u&=unneeded_reg[i+2];
6710             uu&=unneeded_reg_upper[i+2];
6711           }
6712           else
6713           {
6714             u=1;
6715             uu=1;
6716           }
6717         }
6718       }
6719       else
6720       {
6721         // Internal branch, flag target
6722         bt[(ba[i]-start)>>2]=1;
6723         if(ba[i]<=start+i*4) {
6724           // Backward branch
6725           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6726           {
6727             // Unconditional branch
6728             temp_u=1;temp_uu=1;
6729           } else {
6730             // Conditional branch (not taken case)
6731             temp_u=unneeded_reg[i+2];
6732             temp_uu=unneeded_reg_upper[i+2];
6733           }
6734           // Merge in delay slot
6735           tdep=(~temp_uu>>rt1[i+1])&1;
6736           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6737           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6738           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6739           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6740           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6741           temp_u|=1;temp_uu|=1;
6742           // If branch is "likely" (and conditional)
6743           // then we skip the delay slot on the fall-thru path
6744           if(likely[i]) {
6745             if(i<slen-1) {
6746               temp_u&=unneeded_reg[i+2];
6747               temp_uu&=unneeded_reg_upper[i+2];
6748             }
6749             else
6750             {
6751               temp_u=1;
6752               temp_uu=1;
6753             }
6754           }
6755           tdep=(~temp_uu>>rt1[i])&1;
6756           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6757           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6758           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6759           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6760           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6761           temp_u|=1;temp_uu|=1;
6762           unneeded_reg[i]=temp_u;
6763           unneeded_reg_upper[i]=temp_uu;
6764           // Only go three levels deep.  This recursion can take an
6765           // excessive amount of time if there are a lot of nested loops.
6766           if(r<2) {
6767             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6768           }else{
6769             unneeded_reg[(ba[i]-start)>>2]=1;
6770             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6771           }
6772         } /*else*/ if(1) {
6773           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6774           {
6775             // Unconditional branch
6776             u=unneeded_reg[(ba[i]-start)>>2];
6777             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6778             branch_unneeded_reg[i]=u;
6779             branch_unneeded_reg_upper[i]=uu;
6780         //u=1;
6781         //uu=1;
6782         //branch_unneeded_reg[i]=u;
6783         //branch_unneeded_reg_upper[i]=uu;
6784             // Merge in delay slot
6785             tdep=(~uu>>rt1[i+1])&1;
6786             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6787             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6788             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6789             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6790             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6791             u|=1;uu|=1;
6792           } else {
6793             // Conditional branch
6794             b=unneeded_reg[(ba[i]-start)>>2];
6795             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6796             branch_unneeded_reg[i]=b;
6797             branch_unneeded_reg_upper[i]=bu;
6798         //b=1;
6799         //bu=1;
6800         //branch_unneeded_reg[i]=b;
6801         //branch_unneeded_reg_upper[i]=bu;
6802             // Branch delay slot
6803             tdep=(~uu>>rt1[i+1])&1;
6804             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6805             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6806             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6807             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6808             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6809             b|=1;bu|=1;
6810             // If branch is "likely" then we skip the
6811             // delay slot on the fall-thru path
6812             if(likely[i]) {
6813               u=b;
6814               uu=bu;
6815               if(i<slen-1) {
6816                 u&=unneeded_reg[i+2];
6817                 uu&=unneeded_reg_upper[i+2];
6818         //u=1;
6819         //uu=1;
6820               }
6821             } else {
6822               u&=b;
6823               uu&=bu;
6824         //u=1;
6825         //uu=1;
6826             }
6827             if(i<slen-1) {
6828               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6829               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6830         //branch_unneeded_reg[i]=1;
6831         //branch_unneeded_reg_upper[i]=1;
6832             } else {
6833               branch_unneeded_reg[i]=1;
6834               branch_unneeded_reg_upper[i]=1;
6835             }
6836           }
6837         }
6838       }
6839     }
6840     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6841     {
6842       // SYSCALL instruction (software interrupt)
6843       u=1;
6844       uu=1;
6845     }
6846     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6847     {
6848       // ERET instruction (return from interrupt)
6849       u=1;
6850       uu=1;
6851     }
6852     //u=uu=1; // DEBUG
6853     tdep=(~uu>>rt1[i])&1;
6854     // Written registers are unneeded
6855     u|=1LL<<rt1[i];
6856     u|=1LL<<rt2[i];
6857     uu|=1LL<<rt1[i];
6858     uu|=1LL<<rt2[i];
6859     // Accessed registers are needed
6860     u&=~(1LL<<rs1[i]);
6861     u&=~(1LL<<rs2[i]);
6862     uu&=~(1LL<<us1[i]);
6863     uu&=~(1LL<<us2[i]);
6864     // Source-target dependencies
6865     uu&=~(tdep<<dep1[i]);
6866     uu&=~(tdep<<dep2[i]);
6867     // R0 is always unneeded
6868     u|=1;uu|=1;
6869     // Save it
6870     unneeded_reg[i]=u;
6871     unneeded_reg_upper[i]=uu;
6872     /*
6873     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6874     printf("U:");
6875     int r;
6876     for(r=1;r<=CCREG;r++) {
6877       if((unneeded_reg[i]>>r)&1) {
6878         if(r==HIREG) printf(" HI");
6879         else if(r==LOREG) printf(" LO");
6880         else printf(" r%d",r);
6881       }
6882     }
6883     printf(" UU:");
6884     for(r=1;r<=CCREG;r++) {
6885       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6886         if(r==HIREG) printf(" HI");
6887         else if(r==LOREG) printf(" LO");
6888         else printf(" r%d",r);
6889       }
6890     }
6891     printf("\n");*/
6892   }
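     // With FORCE32 (32-bit-only target) the upper register halves are never
     // live, so mark them all as unneeded regardless of the analysis above.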
6893 #ifdef FORCE32
6894   for (i=iend;i>=istart;i--)
6895   {
6896     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6897   }
6898 #endif
6899 }
6900
6901 // Identify registers which are likely to contain 32-bit values
6902 // This is used to predict whether any branches will jump to a
6903 // location with 64-bit values in registers.
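     // Forward scan producing p32[i]: a bitmask of registers believed to hold
     // sign-extended 32-bit values after instruction i.  32-bit results (loads
     // other than LWU/LD, ADDIU, LUI, 32-bit ALU/shift ops, MFC0/MFC1/CFC1...)
     // set the target's bit, 64-bit results (LD, DADD, DSLL, DMULT...) clear it,
     // and at branch targets the prediction is merged with every branch that
     // jumps there.  This is only a prediction made before register allocation;
     // the allocated state is tracked separately in regs[i].is32/was32.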
6904 static void provisional_32bit()
6905 {
6906   int i,j;
6907   uint64_t is32=1;
6908   uint64_t lastbranch=1;
6909   
6910   for(i=0;i<slen;i++)
6911   {
6912     if(i>0) {
6913       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
6914         if(i>1) is32=lastbranch;
6915         else is32=1;
6916       }
6917     }
6918     if(i>1)
6919     {
6920       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
6921         if(likely[i-2]) {
6922           if(i>2) is32=lastbranch;
6923           else is32=1;
6924         }
6925       }
6926       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
6927       {
6928         if(rs1[i-2]==0||rs2[i-2]==0)
6929         {
6930           if(rs1[i-2]) {
6931             is32|=1LL<<rs1[i-2];
6932           }
6933           if(rs2[i-2]) {
6934             is32|=1LL<<rs2[i-2];
6935           }
6936         }
6937       }
6938     }
6939     // If something jumps here with 64-bit values
6940     // then promote those registers to 64 bits
6941     if(bt[i])
6942     {
6943       uint64_t temp_is32=is32;
6944       for(j=i-1;j>=0;j--)
6945       {
6946         if(ba[j]==start+i*4) 
6947           //temp_is32&=branch_regs[j].is32;
6948           temp_is32&=p32[j];
6949       }
6950       for(j=i;j<slen;j++)
6951       {
6952         if(ba[j]==start+i*4) 
6953           temp_is32=1;
6954       }
6955       is32=temp_is32;
6956     }
6957     int type=itype[i];
6958     int op=opcode[i];
6959     int op2=opcode2[i];
6960     int rt=rt1[i];
6961     int s1=rs1[i];
6962     int s2=rs2[i];
6963     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
6964       // Branches don't write registers, consider the delay slot instead.
6965       type=itype[i+1];
6966       op=opcode[i+1];
6967       op2=opcode2[i+1];
6968       rt=rt1[i+1];
6969       s1=rs1[i+1];
6970       s2=rs2[i+1];
6971       lastbranch=is32;
6972     }
6973     switch(type) {
6974       case LOAD:
6975         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
6976            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
6977           is32&=~(1LL<<rt);
6978         else
6979           is32|=1LL<<rt;
6980         break;
6981       case STORE:
6982       case STORELR:
6983         break;
6984       case LOADLR:
6985         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
6986         if(op==0x22) is32|=1LL<<rt; // LWL
6987         break;
6988       case IMM16:
6989         if (op==0x08||op==0x09|| // ADDI/ADDIU
6990             op==0x0a||op==0x0b|| // SLTI/SLTIU
6991             op==0x0c|| // ANDI
6992             op==0x0f)  // LUI
6993         {
6994           is32|=1LL<<rt;
6995         }
6996         if(op==0x18||op==0x19) { // DADDI/DADDIU
6997           is32&=~(1LL<<rt);
6998           //if(imm[i]==0)
6999           //  is32|=((is32>>s1)&1LL)<<rt;
7000         }
7001         if(op==0x0d||op==0x0e) { // ORI/XORI
7002           uint64_t sr=((is32>>s1)&1LL);
7003           is32&=~(1LL<<rt);
7004           is32|=sr<<rt;
7005         }
7006         break;
7007       case UJUMP:
7008         break;
7009       case RJUMP:
7010         break;
7011       case CJUMP:
7012         break;
7013       case SJUMP:
7014         break;
7015       case FJUMP:
7016         break;
7017       case ALU:
7018         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
7019           is32|=1LL<<rt;
7020         }
7021         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7022           is32|=1LL<<rt;
7023         }
7024         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7025           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
7026           is32&=~(1LL<<rt);
7027           is32|=sr<<rt;
7028         }
7029         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
7030           if(s1==0&&s2==0) {
7031             is32|=1LL<<rt;
7032           }
7033           else if(s2==0) {
7034             uint64_t sr=((is32>>s1)&1LL);
7035             is32&=~(1LL<<rt);
7036             is32|=sr<<rt;
7037           }
7038           else if(s1==0) {
7039             uint64_t sr=((is32>>s2)&1LL);
7040             is32&=~(1LL<<rt);
7041             is32|=sr<<rt;
7042           }
7043           else {
7044             is32&=~(1LL<<rt);
7045           }
7046         }
7047         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
7048           if(s1==0&&s2==0) {
7049             is32|=1LL<<rt;
7050           }
7051           else if(s2==0) {
7052             uint64_t sr=((is32>>s1)&1LL);
7053             is32&=~(1LL<<rt);
7054             is32|=sr<<rt;
7055           }
7056           else {
7057             is32&=~(1LL<<rt);
7058           }
7059         }
7060         break;
7061       case MULTDIV:
7062         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7063           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
7064         }
7065         else {
7066           is32|=(1LL<<HIREG)|(1LL<<LOREG);
7067         }
7068         break;
7069       case MOV:
7070         {
7071           uint64_t sr=((is32>>s1)&1LL);
7072           is32&=~(1LL<<rt);
7073           is32|=sr<<rt;
7074         }
7075         break;
7076       case SHIFT:
7077         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
7078         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
7079         break;
7080       case SHIFTIMM:
7081         is32|=1LL<<rt;
7082         // DSLL/DSRL/DSRA/DSLL32/DSRL32 but not DSRA32 have 64-bit result
7083         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
7084         break;
7085       case COP0:
7086         if(op2==0) is32|=1LL<<rt; // MFC0
7087         break;
7088       case COP1:
7089       case COP2:
7090         if(op2==0) is32|=1LL<<rt; // MFC1
7091         if(op2==1) is32&=~(1LL<<rt); // DMFC1
7092         if(op2==2) is32|=1LL<<rt; // CFC1
7093         break;
7094       case C1LS:
7095       case C2LS:
7096         break;
7097       case FLOAT:
7098       case FCONV:
7099         break;
7100       case FCOMP:
7101         break;
7102       case C2OP:
7103       case SYSCALL:
7104       case HLECALL:
7105         break;
7106       default:
7107         break;
7108     }
7109     is32|=1;
7110     p32[i]=is32;
7111
7112     if(i>0)
7113     {
7114       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
7115       {
7116         if(rt1[i-1]==31) // JAL/JALR
7117         {
7118           // Subroutine call will return here, don't alloc any registers
7119           is32=1;
7120         }
7121         else if(i+1<slen)
7122         {
7123           // Internal branch will jump here, match registers to caller
7124           is32=0x3FFFFFFFFLL;
7125         }
7126       }
7127     }
7128   }
7129 }
7130
7131 // Identify registers which may be assumed to contain 32-bit values
7132 // and where optimizations will rely on this.
7133 // This is used to determine whether backward branches can safely
7134 // jump to a location with 64-bit values in registers.
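     // Backward scan producing pr32[i]: registers whose 32-bit assumption must
     // still hold at instruction i, either because a later instruction (or a
     // branch target reachable from here) relies on it, or because a dirty
     // host register holding a 32-bit value will be written back as 32 bits.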
7135 static void provisional_r32()
7136 {
7137   u_int r32=0;
7138   int i;
7139   
7140   for (i=slen-1;i>=0;i--)
7141   {
7142     int hr;
7143     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7144     {
7145       if(ba[i]<start || ba[i]>=(start+slen*4))
7146       {
7147         // Branch out of this block, don't need anything
7148         r32=0;
7149       }
7150       else
7151       {
7152         // Internal branch
7153         // Need whatever matches the target
7154         // (and doesn't get overwritten by the delay slot instruction)
7155         r32=0;
7156         int t=(ba[i]-start)>>2;
7157         if(ba[i]>start+i*4) {
7158           // Forward branch
7159           //if(!(requires_32bit[t]&~regs[i].was32))
7160           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7161           if(!(pr32[t]&~regs[i].was32))
7162             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7163         }else{
7164           // Backward branch
7165           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
7166             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7167         }
7168       }
7169       // Conditional branch may need registers for following instructions
7170       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7171       {
7172         if(i<slen-2) {
7173           //r32|=requires_32bit[i+2];
7174           r32|=pr32[i+2];
7175           r32&=regs[i].was32;
7176           // Mark this address as a branch target since it may be called
7177           // upon return from interrupt
7178           //bt[i+2]=1;
7179         }
7180       }
7181       // Merge in delay slot
7182       if(!likely[i]) {
7183         // These are overwritten unless the branch is "likely"
7184         // and the delay slot is nullified if not taken
7185         r32&=~(1LL<<rt1[i+1]);
7186         r32&=~(1LL<<rt2[i+1]);
7187       }
7188       // Assume these are needed (delay slot)
7189       if(us1[i+1]>0)
7190       {
7191         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
7192       }
7193       if(us2[i+1]>0)
7194       {
7195         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
7196       }
7197       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
7198       {
7199         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
7200       }
7201       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
7202       {
7203         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
7204       }
7205     }
7206     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7207     {
7208       // SYSCALL instruction (software interrupt)
7209       r32=0;
7210     }
7211     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7212     {
7213       // ERET instruction (return from interrupt)
7214       r32=0;
7215     }
7216     // Check 32 bits
7217     r32&=~(1LL<<rt1[i]);
7218     r32&=~(1LL<<rt2[i]);
7219     if(us1[i]>0)
7220     {
7221       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
7222     }
7223     if(us2[i]>0)
7224     {
7225       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
7226     }
7227     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
7228     {
7229       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
7230     }
7231     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
7232     {
7233       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
7234     }
7235     //requires_32bit[i]=r32;
7236     pr32[i]=r32;
7237     
7238     // Dirty registers which are 32-bit, require 32-bit input
7239     // as they will be written as 32-bit values
7240     for(hr=0;hr<HOST_REGS;hr++)
7241     {
7242       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
7243         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
7244           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
7245           pr32[i]|=1LL<<regs[i].regmap_entry[hr];
7246           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
7247         }
7248       }
7249     }
7250   }
7251 }
7252
7253 // Write back dirty registers as soon as we will no longer modify them,
7254 // so that we don't end up with lots of writes at the branches.
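     // Backward scan over HOST registers (the bitmasks are indexed by host
     // register, not MIPS register).  will_dirty[i]/wont_dirty[i] record which
     // host registers will, respectively will not, be dirtied again further
     // down the block, so writebacks can be scheduled as early as possible.
     // When wr is nonzero the result is folded into regs[i].dirty/wasdirty and
     // branch_regs[i].dirty; recursion into backward-branch targets passes
     // wr=0 and is cut short to keep compile time bounded.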
7255 void clean_registers(int istart,int iend,int wr)
7256 {
7257   int i;
7258   int r;
7259   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
7260   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
7261   if(iend==slen-1) {
7262     will_dirty_i=will_dirty_next=0;
7263     wont_dirty_i=wont_dirty_next=0;
7264   }else{
7265     will_dirty_i=will_dirty_next=will_dirty[iend+1];
7266     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
7267   }
7268   for (i=iend;i>=istart;i--)
7269   {
7270     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7271     {
7272       if(ba[i]<start || ba[i]>=(start+slen*4))
7273       {
7274         // Branch out of this block, flush all regs
7275         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7276         {
7277           // Unconditional branch
7278           will_dirty_i=0;
7279           wont_dirty_i=0;
7280           // Merge in delay slot (will dirty)
7281           for(r=0;r<HOST_REGS;r++) {
7282             if(r!=EXCLUDE_REG) {
7283               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7284               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7285               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7286               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7287               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7288               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7289               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7290               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7291               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7292               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7293               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7294               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7295               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7296               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7297             }
7298           }
7299         }
7300         else
7301         {
7302           // Conditional branch
7303           will_dirty_i=0;
7304           wont_dirty_i=wont_dirty_next;
7305           // Merge in delay slot (will dirty)
7306           for(r=0;r<HOST_REGS;r++) {
7307             if(r!=EXCLUDE_REG) {
7308               if(!likely[i]) {
7309                 // Might not dirty if likely branch is not taken
7310                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7311                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7312                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7313                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7314                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7315                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
7316                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7317                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7318                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7319                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7320                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7321                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7322                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7323                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7324               }
7325             }
7326           }
7327         }
7328         // Merge in delay slot (won't dirty)
7329         for(r=0;r<HOST_REGS;r++) {
7330           if(r!=EXCLUDE_REG) {
7331             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7332             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7333             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7334             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7335             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7336             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7337             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7338             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7339             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7340             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7341           }
7342         }
7343         if(wr) {
7344           #ifndef DESTRUCTIVE_WRITEBACK
7345           branch_regs[i].dirty&=wont_dirty_i;
7346           #endif
7347           branch_regs[i].dirty|=will_dirty_i;
7348         }
7349       }
7350       else
7351       {
7352         // Internal branch
7353         if(ba[i]<=start+i*4) {
7354           // Backward branch
7355           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7356           {
7357             // Unconditional branch
7358             temp_will_dirty=0;
7359             temp_wont_dirty=0;
7360             // Merge in delay slot (will dirty)
7361             for(r=0;r<HOST_REGS;r++) {
7362               if(r!=EXCLUDE_REG) {
7363                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7364                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7365                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7366                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7367                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7368                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7369                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7370                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7371                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7372                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7373                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7374                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7375                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7376                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7377               }
7378             }
7379           } else {
7380             // Conditional branch (not taken case)
7381             temp_will_dirty=will_dirty_next;
7382             temp_wont_dirty=wont_dirty_next;
7383             // Merge in delay slot (will dirty)
7384             for(r=0;r<HOST_REGS;r++) {
7385               if(r!=EXCLUDE_REG) {
7386                 if(!likely[i]) {
7387                   // Might not dirty if likely branch is not taken
7388                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7389                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7390                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7391                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7392                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7393                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7394                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7395                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7396                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7397                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7398                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7399                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7400                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7401                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7402                 }
7403               }
7404             }
7405           }
7406           // Merge in delay slot (won't dirty)
7407           for(r=0;r<HOST_REGS;r++) {
7408             if(r!=EXCLUDE_REG) {
7409               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7410               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7411               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7412               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7413               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7414               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7415               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7416               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7417               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7418               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7419             }
7420           }
7421           // Deal with changed mappings
7422           if(i<iend) {
7423             for(r=0;r<HOST_REGS;r++) {
7424               if(r!=EXCLUDE_REG) {
7425                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7426                   temp_will_dirty&=~(1<<r);
7427                   temp_wont_dirty&=~(1<<r);
7428                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7429                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7430                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7431                   } else {
7432                     temp_will_dirty|=1<<r;
7433                     temp_wont_dirty|=1<<r;
7434                   }
7435                 }
7436               }
7437             }
7438           }
7439           if(wr) {
7440             will_dirty[i]=temp_will_dirty;
7441             wont_dirty[i]=temp_wont_dirty;
7442             clean_registers((ba[i]-start)>>2,i-1,0);
7443           }else{
7444             // Limit recursion.  It can take an excessive amount
7445             // of time if there are a lot of nested loops.
7446             will_dirty[(ba[i]-start)>>2]=0;
7447             wont_dirty[(ba[i]-start)>>2]=-1;
7448           }
7449         }
7450         /*else*/ if(1)
7451         {
7452           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7453           {
7454             // Unconditional branch
7455             will_dirty_i=0;
7456             wont_dirty_i=0;
7457           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7458             for(r=0;r<HOST_REGS;r++) {
7459               if(r!=EXCLUDE_REG) {
7460                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7461                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7462                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7463                 }
7464                 if(branch_regs[i].regmap[r]>=0) {
7465                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7466                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7467                 }
7468               }
7469             }
7470           //}
7471             // Merge in delay slot
7472             for(r=0;r<HOST_REGS;r++) {
7473               if(r!=EXCLUDE_REG) {
7474                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7475                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7476                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7477                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7478                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7479                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7480                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7481                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7482                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7483                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7484                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7485                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7486                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7487                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7488               }
7489             }
7490           } else {
7491             // Conditional branch
7492             will_dirty_i=will_dirty_next;
7493             wont_dirty_i=wont_dirty_next;
7494           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7495             for(r=0;r<HOST_REGS;r++) {
7496               if(r!=EXCLUDE_REG) {
7497                 signed char target_reg=branch_regs[i].regmap[r];
7498                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7499                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7500                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7501                 }
7502                 else if(target_reg>=0) {
7503                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7504                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7505                 }
7506                 // Treat delay slot as part of branch too
7507                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7508                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7509                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7510                 }
7511                 else
7512                 {
7513                   will_dirty[i+1]&=~(1<<r);
7514                 }*/
7515               }
7516             }
7517           //}
7518             // Merge in delay slot
7519             for(r=0;r<HOST_REGS;r++) {
7520               if(r!=EXCLUDE_REG) {
7521                 if(!likely[i]) {
7522                   // Might not dirty if likely branch is not taken
7523                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7524                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7525                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7526                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7527                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7528                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7529                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7530                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7531                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7532                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7533                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7534                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7535                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7536                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7537                 }
7538               }
7539             }
7540           }
7541           // Merge in delay slot (won't dirty)
7542           for(r=0;r<HOST_REGS;r++) {
7543             if(r!=EXCLUDE_REG) {
7544               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7545               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7546               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7547               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7548               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7549               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7550               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7551               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7552               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7553               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7554             }
7555           }
7556           if(wr) {
7557             #ifndef DESTRUCTIVE_WRITEBACK
7558             branch_regs[i].dirty&=wont_dirty_i;
7559             #endif
7560             branch_regs[i].dirty|=will_dirty_i;
7561           }
7562         }
7563       }
7564     }
7565     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7566     {
7567       // SYSCALL instruction (software interrupt)
7568       will_dirty_i=0;
7569       wont_dirty_i=0;
7570     }
7571     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7572     {
7573       // ERET instruction (return from interrupt)
7574       will_dirty_i=0;
7575       wont_dirty_i=0;
7576     }
7577     will_dirty_next=will_dirty_i;
7578     wont_dirty_next=wont_dirty_i;
7579     for(r=0;r<HOST_REGS;r++) {
7580       if(r!=EXCLUDE_REG) {
7581         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7582         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7583         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7584         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7585         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7586         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7587         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7588         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7589         if(i>istart) {
7590           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7591           {
7592             // Don't store a register immediately after writing it,
7593             // may prevent dual-issue.
7594             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7595             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7596           }
7597         }
7598       }
7599     }
7600     // Save it
7601     will_dirty[i]=will_dirty_i;
7602     wont_dirty[i]=wont_dirty_i;
7603     // Mark registers that won't be dirtied as not dirty
7604     if(wr) {
7605       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7606       for(r=0;r<HOST_REGS;r++) {
7607         if((will_dirty_i>>r)&1) {
7608           printf(" r%d",r);
7609         }
7610       }
7611       printf("\n");*/
7612
7613       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7614         regs[i].dirty|=will_dirty_i;
7615         #ifndef DESTRUCTIVE_WRITEBACK
7616         regs[i].dirty&=wont_dirty_i;
7617         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7618         {
7619           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7620             for(r=0;r<HOST_REGS;r++) {
7621               if(r!=EXCLUDE_REG) {
7622                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7623                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7624                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r); assert(!((wont_dirty_i>>r)&1));*/}
7625               }
7626             }
7627           }
7628         }
7629         else
7630         {
7631           if(i<iend) {
7632             for(r=0;r<HOST_REGS;r++) {
7633               if(r!=EXCLUDE_REG) {
7634                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7635                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7636                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r); assert(!((wont_dirty_i>>r)&1));*/}
7637               }
7638             }
7639           }
7640         }
7641         #endif
7642       //}
7643     }
7644     // Deal with changed mappings
7645     temp_will_dirty=will_dirty_i;
7646     temp_wont_dirty=wont_dirty_i;
7647     for(r=0;r<HOST_REGS;r++) {
7648       if(r!=EXCLUDE_REG) {
7649         int nr;
7650         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7651           if(wr) {
7652             #ifndef DESTRUCTIVE_WRITEBACK
7653             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7654             #endif
7655             regs[i].wasdirty|=will_dirty_i&(1<<r);
7656           }
7657         }
7658         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7659           // Register moved to a different register
7660           will_dirty_i&=~(1<<r);
7661           wont_dirty_i&=~(1<<r);
7662           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7663           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7664           if(wr) {
7665             #ifndef DESTRUCTIVE_WRITEBACK
7666             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7667             #endif
7668             regs[i].wasdirty|=will_dirty_i&(1<<r);
7669           }
7670         }
7671         else {
7672           will_dirty_i&=~(1<<r);
7673           wont_dirty_i&=~(1<<r);
7674           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7675             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7676             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7677           } else {
7678             wont_dirty_i|=1<<r;
7679             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r); assert(!((will_dirty>>r)&1));*/
7680           }
7681         }
7682       }
7683     }
7684   }
7685 }
7686
7687   /* disassembly */
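     // Debug helper: prints one decoded instruction, prefixed with '*' when the
     // address is a known branch target (bt[i]).  The operand format depends on
     // itype[i]; unhandled types fall back to just the address and mnemonic.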
7688 void disassemble_inst(int i)
7689 {
7690     if (bt[i]) printf("*"); else printf(" ");
7691     switch(itype[i]) {
7692       case UJUMP:
7693         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7694       case CJUMP:
7695         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7696       case SJUMP:
7697         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7698       case FJUMP:
7699         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7700       case RJUMP:
7701         if (opcode[i]==0x9&&rt1[i]!=31)
7702           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
7703         else
7704           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7705         break;
7706       case SPAN:
7707         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7708       case IMM16:
7709         if(opcode[i]==0xf) //LUI
7710           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7711         else
7712           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7713         break;
7714       case LOAD:
7715       case LOADLR:
7716         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7717         break;
7718       case STORE:
7719       case STORELR:
7720         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7721         break;
7722       case ALU:
7723       case SHIFT:
7724         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7725         break;
7726       case MULTDIV:
7727         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7728         break;
7729       case SHIFTIMM:
7730         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7731         break;
7732       case MOV:
7733         if((opcode2[i]&0x1d)==0x10)
7734           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7735         else if((opcode2[i]&0x1d)==0x11)
7736           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7737         else
7738           printf (" %x: %s\n",start+i*4,insn[i]);
7739         break;
7740       case COP0:
7741         if(opcode2[i]==0)
7742           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7743         else if(opcode2[i]==4)
7744           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7745         else printf (" %x: %s\n",start+i*4,insn[i]);
7746         break;
7747       case COP1:
7748         if(opcode2[i]<3)
7749           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7750         else if(opcode2[i]>3)
7751           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7752         else printf (" %x: %s\n",start+i*4,insn[i]);
7753         break;
7754       case COP2:
7755         if(opcode2[i]<3)
7756           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7757         else if(opcode2[i]>3)
7758           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7759         else printf (" %x: %s\n",start+i*4,insn[i]);
7760         break;
7761       case C1LS:
7762         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7763         break;
7764       case C2LS:
7765         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7766         break;
7767       case INTCALL:
7768         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7769         break;
7770       default:
7771         //printf (" %s %8x\n",insn[i],source[i]);
7772         printf (" %x: %s\n",start+i*4,insn[i]);
7773     }
7774 }
7775
7776 // clear the state completely, instead of just marking
7777 // things invalid like invalidate_all_pages() does
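     // Resets everything: output pointer back to BASE_ADDR, code marked
     // invalid, hash table and mini hash table wiped, memory_map reduced to
     // just the RDRAM window at 0x80000000, and the jump_in/jump_out/jump_dirty
     // lists emptied.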
7778 void new_dynarec_clear_full()
7779 {
7780   int n;
7781   out=(u_char *)BASE_ADDR;
7782   memset(invalid_code,1,sizeof(invalid_code));
7783   memset(hash_table,0xff,sizeof(hash_table));
7784   memset(mini_ht,-1,sizeof(mini_ht));
7785   memset(restore_candidate,0,sizeof(restore_candidate));
7786   memset(shadow,0,sizeof(shadow));
7787   copy=shadow;
7788   expirep=16384; // Expiry pointer, +2 blocks
7789   pending_exception=0;
7790   literalcount=0;
7791   stop_after_jal=0;
7792   // TLB
7793 #ifndef DISABLE_TLB
7794   using_tlb=0;
7795 #endif
7796   sp_in_mirror=0;
7797   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7798     memory_map[n]=-1;
7799   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7800     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7801   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7802     memory_map[n]=-1;
7803   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7804   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7805   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7806 }
7807
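     // One-time setup: maps the translation cache at BASE_ADDR as a
     // read/write/execute region of 1<<TARGET_SIZE_2 bytes, resets all state
     // via new_dynarec_clear_full(), and (in the Mupen64 build) installs the
     // read/write memory handler tables before tlb_hacks() and arch_init().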
7808 void new_dynarec_init()
7809 {
7810   printf("Init new dynarec\n");
7811   out=(u_char *)BASE_ADDR;
7812   if (mmap (out, 1<<TARGET_SIZE_2,
7813             PROT_READ | PROT_WRITE | PROT_EXEC,
7814             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7815             -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
7816 #ifdef MUPEN64
7817   rdword=&readmem_dword;
7818   fake_pc.f.r.rs=&readmem_dword;
7819   fake_pc.f.r.rt=&readmem_dword;
7820   fake_pc.f.r.rd=&readmem_dword;
7821 #endif
7822   int n;
7823   new_dynarec_clear_full();
7824 #ifdef HOST_IMM8
7825   // Copy this into local area so we don't have to put it in every literal pool
7826   invc_ptr=invalid_code;
7827 #endif
7828 #ifdef MUPEN64
7829   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
7830     writemem[n] = write_nomem_new;
7831     writememb[n] = write_nomemb_new;
7832     writememh[n] = write_nomemh_new;
7833 #ifndef FORCE32
7834     writememd[n] = write_nomemd_new;
7835 #endif
7836     readmem[n] = read_nomem_new;
7837     readmemb[n] = read_nomemb_new;
7838     readmemh[n] = read_nomemh_new;
7839 #ifndef FORCE32
7840     readmemd[n] = read_nomemd_new;
7841 #endif
7842   }
7843   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
7844     writemem[n] = write_rdram_new;
7845     writememb[n] = write_rdramb_new;
7846     writememh[n] = write_rdramh_new;
7847 #ifndef FORCE32
7848     writememd[n] = write_rdramd_new;
7849 #endif
7850   }
7851   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
7852     writemem[n] = write_nomem_new;
7853     writememb[n] = write_nomemb_new;
7854     writememh[n] = write_nomemh_new;
7855 #ifndef FORCE32
7856     writememd[n] = write_nomemd_new;
7857 #endif
7858     readmem[n] = read_nomem_new;
7859     readmemb[n] = read_nomemb_new;
7860     readmemh[n] = read_nomemh_new;
7861 #ifndef FORCE32
7862     readmemd[n] = read_nomemd_new;
7863 #endif
7864   }
7865 #endif
7866   tlb_hacks();
7867   arch_init();
7868 }
7869
7870 void new_dynarec_cleanup()
7871 {
7872   int n;
7873   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
7874   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7875   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7876   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7877   #ifdef ROM_COPY
7878   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
7879   #endif
7880 }
7881
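     // Top-level compile entry point.  Selects source/pagelimit from the
     // address region (RAM mirrors and BIOS for PCSX, SP_DMEM for Mupen64,
     // TLB-mapped space otherwise), then runs the multi-pass compiler listed
     // below.  Returns 0 on success and -1 for an unmapped address so the
     // caller can raise an exception; a bogus address aborts the emulator.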
7882 int new_recompile_block(int addr)
7883 {
7884 /*
7885   if(addr==0x800cd050) {
7886     int block;
7887     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
7888     int n;
7889     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
7890   }
7891 */
7892   //if(Count==365117028) tracedebug=1;
7893   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7894   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7895   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7896   //if(debug) 
7897   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7898   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7899   /*if(Count>=312978186) {
7900     rlist();
7901   }*/
7902   //rlist();
7903   start = (u_int)addr&~3;
7904   //assert(((u_int)addr&1)==0);
7905 #ifdef PCSX
7906   if(!sp_in_mirror&&(signed int)(psxRegs.GPR.n.sp&0xffe00000)>0x80200000&&
7907      0x10000<=psxRegs.GPR.n.sp&&(psxRegs.GPR.n.sp&~0xe0e00000)<RAM_SIZE) {
7908     printf("SP hack enabled (%08x), @%08x\n", psxRegs.GPR.n.sp, psxRegs.pc);
7909     sp_in_mirror=1;
7910   }
7911   if (Config.HLE && start == 0x80001000) // hlecall
7912   {
7913     // XXX: is this enough? Maybe check hleSoftCall?
7914     u_int beginning=(u_int)out;
7915     u_int page=get_page(start);
7916     invalid_code[start>>12]=0;
7917     emit_movimm(start,0);
7918     emit_writeword(0,(int)&pcaddr);
7919     emit_jmp((int)new_dyna_leave);
7920 #ifdef __arm__
7921     __clear_cache((void *)beginning,out);
7922 #endif
7923     ll_add(jump_in+page,start,(void *)beginning);
7924     return 0;
7925   }
7926   else if ((u_int)addr < 0x00200000 ||
7927     (0xa0000000 <= addr && addr < 0xa0200000)) {
7928     // used for BIOS calls mostly?
7929     source = (u_int *)((u_int)rdram+(start&0x1fffff));
7930     pagelimit = (addr&0xa0000000)|0x00200000;
7931   }
7932   else if (!Config.HLE && (
7933 /*    (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7934     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7935     // BIOS
7936     source = (u_int *)((u_int)psxR+(start&0x7ffff));
7937     pagelimit = (addr&0xfff00000)|0x80000;
7938   }
7939   else
7940 #endif
7941 #ifdef MUPEN64
7942   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
7943     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
7944     pagelimit = 0xa4001000;
7945   }
7946   else
7947 #endif
7948   if ((int)addr >= 0x80000000 && (int)addr < 0x80000000+RAM_SIZE) {
7949     source = (u_int *)((u_int)rdram+start-0x80000000);
7950     pagelimit = 0x80000000+RAM_SIZE;
7951   }
7952 #ifndef DISABLE_TLB
7953   else if ((signed int)addr >= (signed int)0xC0000000) {
7954     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
7955     //if(tlb_LUT_r[start>>12])
7956       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
7957     if((signed int)memory_map[start>>12]>=0) {
7958       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
7959       pagelimit=(start+4096)&0xFFFFF000;
7960       int map=memory_map[start>>12];
7961       int i;
7962       for(i=0;i<5;i++) {
7963         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
7964         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
7965       }
7966       assem_debug("pagelimit=%x\n",pagelimit);
7967       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
7968     }
7969     else {
7970       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
7971       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
7972       return -1; // Caller will invoke exception handler
7973     }
7974     //printf("source= %x\n",(int)source);
7975   }
7976 #endif
7977   else {
7978     printf("Compile at bogus memory address: %x \n", (int)addr);
7979     exit(1);
7980   }
7981
7982   /* Pass 1: disassemble */
7983   /* Pass 2: register dependencies, branch targets */
7984   /* Pass 3: register allocation */
7985   /* Pass 4: branch dependencies */
7986   /* Pass 5: pre-alloc */
7987   /* Pass 6: optimize clean/dirty state */
7988   /* Pass 7: flag 32-bit registers */
7989   /* Pass 8: assembly */
7990   /* Pass 9: linker */
7991   /* Pass 10: garbage collection / free memory */
7992
7993   int i,j;
7994   int done=0;
7995   unsigned int type,op,op2;
7996
7997   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7998   
7999   /* Pass 1 disassembly */
8000
8001   for(i=0;!done;i++) {
8002     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
8003     minimum_free_regs[i]=0;
8004     opcode[i]=op=source[i]>>26;
8005     switch(op)
8006     {
8007       case 0x00: strcpy(insn[i],"special"); type=NI;
8008         op2=source[i]&0x3f;
8009         switch(op2)
8010         {
8011           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
8012           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
8013           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
8014           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
8015           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
8016           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
8017           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
8018           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
8019           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
8020           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
8021           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
8022           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
8023           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
8024           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
8025           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
8026           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
8027           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
8028           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
8029           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
8030           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
8031           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
8032           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
8033           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
8034           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
8035           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
8036           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
8037           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
8038           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
8039           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
8040           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
8041           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
8042           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
8043           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
8044           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
8045           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
8046 #ifndef FORCE32
8047           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
8048           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
8049           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
8050           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
8051           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
8052           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
8053           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
8054           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
8055           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
8056           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
8057           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
8058           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
8059           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
8060           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
8061           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
8062           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
8063           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
8064 #endif
8065         }
8066         break;
8067       case 0x01: strcpy(insn[i],"regimm"); type=NI;
8068         op2=(source[i]>>16)&0x1f;
8069         switch(op2)
8070         {
8071           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
8072           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
8073           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
8074           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
8075           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
8076           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
8077           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
8078           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
8079           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
8080           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
8081           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
8082           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
8083           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
8084           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
8085         }
8086         break;
8087       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
8088       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
8089       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
8090       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
8091       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
8092       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
8093       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
8094       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
8095       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
8096       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
8097       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
8098       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
8099       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
8100       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
8101       case 0x10: strcpy(insn[i],"cop0"); type=NI;
8102         op2=(source[i]>>21)&0x1f;
8103         switch(op2)
8104         {
8105           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
8106           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
8107           case 0x10: strcpy(insn[i],"tlb"); type=NI;
8108           switch(source[i]&0x3f)
8109           {
8110             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
8111             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
8112             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
8113             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
8114 #ifdef PCSX
8115             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
8116 #else
8117             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
8118 #endif
8119           }
8120         }
8121         break;
8122       case 0x11: strcpy(insn[i],"cop1"); type=NI;
8123         op2=(source[i]>>21)&0x1f;
8124         switch(op2)
8125         {
8126           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
8127           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
8128           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
8129           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
8130           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
8131           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
8132           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
8133           switch((source[i]>>16)&0x3)
8134           {
8135             case 0x00: strcpy(insn[i],"BC1F"); break;
8136             case 0x01: strcpy(insn[i],"BC1T"); break;
8137             case 0x02: strcpy(insn[i],"BC1FL"); break;
8138             case 0x03: strcpy(insn[i],"BC1TL"); break;
8139           }
8140           break;
8141           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
8142           switch(source[i]&0x3f)
8143           {
8144             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
8145             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
8146             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
8147             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
8148             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
8149             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
8150             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
8151             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
8152             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
8153             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
8154             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
8155             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
8156             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
8157             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
8158             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
8159             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
8160             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
8161             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
8162             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
8163             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
8164             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
8165             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
8166             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
8167             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
8168             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
8169             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
8170             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
8171             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
8172             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
8173             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
8174             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
8175             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
8176             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
8177             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
8178             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
8179           }
8180           break;
8181           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
8182           switch(source[i]&0x3f)
8183           {
8184             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
8185             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
8186             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
8187             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
8188             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
8189             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
8190             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
8191             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
8192             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
8193             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
8194             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
8195             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
8196             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
8197             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
8198             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
8199             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
8200             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
8201             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
8202             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
8203             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
8204             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
8205             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
8206             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
8207             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
8208             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
8209             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
8210             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
8211             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
8212             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
8213             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
8214             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
8215             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
8216             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
8217             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
8218             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
8219           }
8220           break;
8221           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
8222           switch(source[i]&0x3f)
8223           {
8224             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
8225             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
8226           }
8227           break;
8228           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
8229           switch(source[i]&0x3f)
8230           {
8231             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
8232             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
8233           }
8234           break;
8235         }
8236         break;
8237 #ifndef FORCE32
8238       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
8239       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
8240       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
8241       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
8242       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
8243       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
8244       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
8245       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
8246 #endif
8247       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
8248       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
8249       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
8250       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
8251       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
8252       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
8253       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
8254 #ifndef FORCE32
8255       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
8256 #endif
8257       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
8258       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
8259       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
8260       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
8261 #ifndef FORCE32
8262       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
8263       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
8264 #endif
8265       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
8266       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
8267       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
8268       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
8269 #ifndef FORCE32
8270       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
8271       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
8272       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
8273 #endif
8274       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
8275       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
8276 #ifndef FORCE32
8277       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
8278       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
8279       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
8280 #endif
8281 #ifdef PCSX
8282       case 0x12: strcpy(insn[i],"COP2"); type=NI;
8283         // note: COP MIPS-1 encoding differs from MIPS32
8284         op2=(source[i]>>21)&0x1f;
8285         if (source[i]&0x3f) {
8286           if (gte_handlers[source[i]&0x3f]!=NULL) {
8287             snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
8288             type=C2OP;
8289           }
8290         }
8291         else switch(op2)
8292         {
8293           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
8294           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
8295           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
8296           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
8297         }
8298         break;
8299       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
8300       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
8301       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
8302 #endif
8303       default: strcpy(insn[i],"???"); type=NI;
8304         printf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
8305         break;
8306     }
8307     itype[i]=type;
8308     opcode2[i]=op2;
8309     /* Get registers/immediates */
8310     lt1[i]=0;
8311     us1[i]=0;
8312     us2[i]=0;
8313     dep1[i]=0;
8314     dep2[i]=0;
8315     switch(type) {
8316       case LOAD:
8317         rs1[i]=(source[i]>>21)&0x1f;
8318         rs2[i]=0;
8319         rt1[i]=(source[i]>>16)&0x1f;
8320         rt2[i]=0;
8321         imm[i]=(short)source[i];
8322         break;
8323       case STORE:
8324       case STORELR:
8325         rs1[i]=(source[i]>>21)&0x1f;
8326         rs2[i]=(source[i]>>16)&0x1f;
8327         rt1[i]=0;
8328         rt2[i]=0;
8329         imm[i]=(short)source[i];
8330         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
8331         break;
8332       case LOADLR:
8333         // LWL/LWR only load part of the register,
8334         // therefore the target register must be treated as a source too
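        // Example (illustration): "LWR rt, off(base)" replaces only some of
        // rt's bytes with bytes loaded from memory, so the old value of rt
        // flows into the result; rt is therefore recorded both as a source
        // (rs2) and as a destination (rt1) here.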
8335         rs1[i]=(source[i]>>21)&0x1f;
8336         rs2[i]=(source[i]>>16)&0x1f;
8337         rt1[i]=(source[i]>>16)&0x1f;
8338         rt2[i]=0;
8339         imm[i]=(short)source[i];
8340         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
8341         if(op==0x26) dep1[i]=rt1[i]; // LWR
8342         break;
8343       case IMM16:
8344         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
8345         else rs1[i]=(source[i]>>21)&0x1f;
8346         rs2[i]=0;
8347         rt1[i]=(source[i]>>16)&0x1f;
8348         rt2[i]=0;
8349         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
8350           imm[i]=(unsigned short)source[i];
8351         }else{
8352           imm[i]=(short)source[i];
8353         }
8354         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
8355         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
8356         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
8357         break;
8358       case UJUMP:
8359         rs1[i]=0;
8360         rs2[i]=0;
8361         rt1[i]=0;
8362         rt2[i]=0;
8363         // The JAL instruction writes to r31.
8364         if (op&1) {
8365           rt1[i]=31;
8366         }
8367         rs2[i]=CCREG;
8368         break;
8369       case RJUMP:
8370         rs1[i]=(source[i]>>21)&0x1f;
8371         rs2[i]=0;
8372         rt1[i]=0;
8373         rt2[i]=0;
8374         // The JALR instruction writes to rd.
8375         if (op2&1) {
8376           rt1[i]=(source[i]>>11)&0x1f;
8377         }
8378         rs2[i]=CCREG;
8379         break;
8380       case CJUMP:
8381         rs1[i]=(source[i]>>21)&0x1f;
8382         rs2[i]=(source[i]>>16)&0x1f;
8383         rt1[i]=0;
8384         rt2[i]=0;
8385         if(op&2) { // BGTZ/BLEZ
8386           rs2[i]=0;
8387         }
8388         us1[i]=rs1[i];
8389         us2[i]=rs2[i];
8390         likely[i]=op>>4;
8391         break;
8392       case SJUMP:
8393         rs1[i]=(source[i]>>21)&0x1f;
8394         rs2[i]=CCREG;
8395         rt1[i]=0;
8396         rt2[i]=0;
8397         us1[i]=rs1[i];
8398         if(op2&0x10) { // BxxAL
8399           rt1[i]=31;
8400           // NOTE: If the branch is not taken, r31 is still overwritten
8401         }
8402         likely[i]=(op2&2)>>1;
8403         break;
8404       case FJUMP:
8405         rs1[i]=FSREG;
8406         rs2[i]=CSREG;
8407         rt1[i]=0;
8408         rt2[i]=0;
8409         likely[i]=((source[i])>>17)&1; // bit 17 of BC1x is the "likely" (nullify delay slot) flag
8410         break;
8411       case ALU:
8412         rs1[i]=(source[i]>>21)&0x1f; // source
8413         rs2[i]=(source[i]>>16)&0x1f; // second source operand (subtrahend for SUB)
8414         rt1[i]=(source[i]>>11)&0x1f; // destination
8415         rt2[i]=0;
8416         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
8417           us1[i]=rs1[i];us2[i]=rs2[i];
8418         }
8419         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
8420           dep1[i]=rs1[i];dep2[i]=rs2[i];
8421         }
8422         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
8423           dep1[i]=rs1[i];dep2[i]=rs2[i];
8424         }
8425         break;
8426       case MULTDIV:
8427         rs1[i]=(source[i]>>21)&0x1f; // source
8428         rs2[i]=(source[i]>>16)&0x1f; // second operand (multiplier/divisor)
8429         rt1[i]=HIREG;
8430         rt2[i]=LOREG;
8431         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
8432           us1[i]=rs1[i];us2[i]=rs2[i];
8433         }
8434         break;
8435       case MOV:
8436         rs1[i]=0;
8437         rs2[i]=0;
8438         rt1[i]=0;
8439         rt2[i]=0;
8440         if(op2==0x10) rs1[i]=HIREG; // MFHI
8441         if(op2==0x11) rt1[i]=HIREG; // MTHI
8442         if(op2==0x12) rs1[i]=LOREG; // MFLO
8443         if(op2==0x13) rt1[i]=LOREG; // MTLO
8444         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
8445         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
8446         dep1[i]=rs1[i];
8447         break;
8448       case SHIFT:
8449         rs1[i]=(source[i]>>16)&0x1f; // value to be shifted
8450         rs2[i]=(source[i]>>21)&0x1f; // shift amount
8451         rt1[i]=(source[i]>>11)&0x1f; // destination
8452         rt2[i]=0;
8453         // DSLLV/DSRLV/DSRAV are 64-bit
8454         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
8455         break;
8456       case SHIFTIMM:
8457         rs1[i]=(source[i]>>16)&0x1f;
8458         rs2[i]=0;
8459         rt1[i]=(source[i]>>11)&0x1f;
8460         rt2[i]=0;
8461         imm[i]=(source[i]>>6)&0x1f;
8462         // DSxx32 instructions
8463         if(op2>=0x3c) imm[i]|=0x20;
8464         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
8465         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
8466         break;
8467       case COP0:
8468         rs1[i]=0;
8469         rs2[i]=0;
8470         rt1[i]=0;
8471         rt2[i]=0;
8472         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
8473         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
8474         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8475         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8476         break;
8477       case COP1:
8478       case COP2:
8479         rs1[i]=0;
8480         rs2[i]=0;
8481         rt1[i]=0;
8482         rt2[i]=0;
8483         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8484         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8485         if(op2==5) us1[i]=rs1[i]; // DMTC1
8486         rs2[i]=CSREG;
8487         break;
8488       case C1LS:
8489         rs1[i]=(source[i]>>21)&0x1F;
8490         rs2[i]=CSREG;
8491         rt1[i]=0;
8492         rt2[i]=0;
8493         imm[i]=(short)source[i];
8494         break;
8495       case C2LS:
8496         rs1[i]=(source[i]>>21)&0x1F;
8497         rs2[i]=0;
8498         rt1[i]=0;
8499         rt2[i]=0;
8500         imm[i]=(short)source[i];
8501         break;
8502       case FLOAT:
8503       case FCONV:
8504         rs1[i]=0;
8505         rs2[i]=CSREG;
8506         rt1[i]=0;
8507         rt2[i]=0;
8508         break;
8509       case FCOMP:
8510         rs1[i]=FSREG;
8511         rs2[i]=CSREG;
8512         rt1[i]=FSREG;
8513         rt2[i]=0;
8514         break;
8515       case SYSCALL:
8516       case HLECALL:
8517       case INTCALL:
8518         rs1[i]=CCREG;
8519         rs2[i]=0;
8520         rt1[i]=0;
8521         rt2[i]=0;
8522         break;
8523       default:
8524         rs1[i]=0;
8525         rs2[i]=0;
8526         rt1[i]=0;
8527         rt2[i]=0;
8528     }
8529     /* Calculate branch target addresses */
8530     if(type==UJUMP)
8531       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8532     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8533       ba[i]=start+i*4+8; // Ignore never taken branch
8534     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8535       ba[i]=start+i*4+8; // Ignore never taken branch
8536     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8537       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8538     else ba[i]=-1;
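    // Worked example (illustration only): for a J encoded as 0x0800400c at
    // start+i*4 = 0x80010008, (source<<6)>>4 recovers the 26-bit index
    // shifted left by 2 (0x400c<<2 = 0x10030), and OR-ing with the upper
    // bits of the delay-slot PC gives ba[i] = 0x80010030.
    // For conditional branches, (source<<16)>>14 sign-extends the 16-bit
    // immediate and multiplies it by 4 before adding it to PC+4.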
8539 #ifdef PCSX
8540     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
8541       int do_in_intrp=0;
8542       // branch in delay slot?
8543       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
8544         // not handled - make the first branch call the interpreter if it's hit
8545         printf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
8546         do_in_intrp=1;
8547       }
8548       // basic load delay detection
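      // Illustration of the pattern this heuristic catches (a backward
      // branch within the block whose target consumes the delay-slot load):
      //   target:
      //     addu a1, v0, a2      ; uses v0
      //     ...
      //     bne  a3, zero, target
      //     lw   v0, 0(a0)       ; delay slot loads v0
      // Because of the R3000A load delay slot, the instruction at the target
      // could legitimately rely on the pre-load value of v0, so this case is
      // handed to the interpreter instead of being recompiled.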
8549       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
8550         int t=(ba[i-1]-start)/4;
8551         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
8552           // jump target wants DS result - potential load delay effect
8553           printf("load delay @%08x (%08x)\n", addr + i*4, addr);
8554           do_in_intrp=1;
8555           bt[t+1]=1; // expected return from interpreter
8556         }
8557         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
8558               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
8559           // v0 overwrite like this is a sign of trouble, bail out
8560           printf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
8561           do_in_intrp=1;
8562         }
8563       }
8564       if(do_in_intrp) {
8565         rs1[i-1]=CCREG;
8566         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
8567         ba[i-1]=-1;
8568         itype[i-1]=INTCALL;
8569         done=2;
8570         i--; // don't compile the DS
8571       }
8572     }
8573 #endif
8574     /* Is this the end of the block? */
8575     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8576       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
8577         done=2;
8578       }
8579       else {
8580         if(stop_after_jal) done=1;
8581         // Stop on BREAK
8582         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8583       }
8584       // Don't recompile stuff that's already compiled
8585       if(check_addr(start+i*4+4)) done=1;
8586       // Don't get too close to the limit
8587       if(i>MAXBLOCK/2) done=1;
8588     }
8589     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
8590     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
8591     if(done==2) {
8592       // Does the block continue due to a branch?
8593       for(j=i-1;j>=0;j--)
8594       {
8595         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
8596         if(ba[j]==start+i*4+4) done=j=0;
8597         if(ba[j]==start+i*4+8) done=j=0;
8598       }
8599     }
8600     //assert(i<MAXBLOCK-1);
8601     if(start+i*4==pagelimit-4) done=1;
8602     assert(start+i*4<pagelimit);
8603     if (i==MAXBLOCK-1) done=1;
8604     // Stop if we're compiling junk
8605     if(itype[i]==NI&&opcode[i]==0x11) {
8606       done=stop_after_jal=1;
8607       printf("Disabled speculative precompilation\n");
8608     }
8609   }
8610   slen=i;
8611   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8612     if(start+i*4==pagelimit) {
8613       itype[i-1]=SPAN;
8614     }
8615   }
8616   assert(slen>0);
8617
8618   /* Pass 2 - Register dependencies and branch targets */
8619
8620   unneeded_registers(0,slen-1,0);
8621   
8622   /* Pass 3 - Register allocation */
8623
8624   struct regstat current; // Current register allocations/status
8625   current.is32=1;
8626   current.dirty=0;
8627   current.u=unneeded_reg[0];
8628   current.uu=unneeded_reg_upper[0];
8629   clear_all_regs(current.regmap);
8630   alloc_reg(&current,0,CCREG);
8631   dirty_reg(&current,CCREG);
8632   current.isconst=0;
8633   current.wasconst=0;
8634   int ds=0;
8635   int cc=0;
8636   int hr=-1;
8637
8638 #ifndef FORCE32
8639   provisional_32bit();
8640 #endif
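  // A block address with bit 0 set appears to mark a block that begins in a
  // branch delay slot; in that case the first instruction is treated as a
  // lone delay slot entry and the BTREG mapping is set up for the branch
  // target.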
8641   if((u_int)addr&1) {
8642     // First instruction is delay slot
8643     cc=-1;
8644     bt[1]=1;
8645     ds=1;
8646     unneeded_reg[0]=1;
8647     unneeded_reg_upper[0]=1;
8648     current.regmap[HOST_BTREG]=BTREG;
8649   }
8650   
8651   for(i=0;i<slen;i++)
8652   {
8653     if(bt[i])
8654     {
8655       int hr;
8656       for(hr=0;hr<HOST_REGS;hr++)
8657       {
8658         // Is this really necessary?
8659         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8660       }
8661       current.isconst=0;
8662     }
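    // If two instructions back was a BNE/BNEL against $zero and execution
    // fell through to here (branch not taken, past its delay slot), the
    // tested register must have held zero, so it can be marked 32-bit and
    // any stale upper-half mapping dropped.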
8663     if(i>1)
8664     {
8665       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8666       {
8667         if(rs1[i-2]==0||rs2[i-2]==0)
8668         {
8669           if(rs1[i-2]) {
8670             current.is32|=1LL<<rs1[i-2];
8671             int hr=get_reg(current.regmap,rs1[i-2]|64);
8672             if(hr>=0) current.regmap[hr]=-1;
8673           }
8674           if(rs2[i-2]) {
8675             current.is32|=1LL<<rs2[i-2];
8676             int hr=get_reg(current.regmap,rs2[i-2]|64);
8677             if(hr>=0) current.regmap[hr]=-1;
8678           }
8679         }
8680       }
8681     }
8682 #ifndef FORCE32
8683     // If something jumps here with 64-bit values
8684     // then promote those registers to 64 bits
8685     if(bt[i])
8686     {
8687       uint64_t temp_is32=current.is32;
8688       for(j=i-1;j>=0;j--)
8689       {
8690         if(ba[j]==start+i*4) 
8691           temp_is32&=branch_regs[j].is32;
8692       }
8693       for(j=i;j<slen;j++)
8694       {
8695         if(ba[j]==start+i*4) 
8696           //temp_is32=1;
8697           temp_is32&=p32[j];
8698       }
8699       if(temp_is32!=current.is32) {
8700         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8701         #ifndef DESTRUCTIVE_WRITEBACK
8702         if(ds)
8703         #endif
8704         for(hr=0;hr<HOST_REGS;hr++)
8705         {
8706           int r=current.regmap[hr];
8707           if(r>0&&r<64)
8708           {
8709             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8710               temp_is32|=1LL<<r;
8711               //printf("restore %d\n",r);
8712             }
8713           }
8714         }
8715         current.is32=temp_is32;
8716       }
8717     }
8718 #else
8719     current.is32=-1LL;
8720 #endif
8721
8722     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8723     regs[i].wasconst=current.isconst;
8724     regs[i].was32=current.is32;
8725     regs[i].wasdirty=current.dirty;
8726     #if defined(DESTRUCTIVE_WRITEBACK) && !defined(FORCE32)
8727     // To change a dirty register from 32 to 64 bits, we must write
8728     // it out during the previous cycle (for branches, 2 cycles)
8729     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8730     {
8731       uint64_t temp_is32=current.is32;
8732       for(j=i-1;j>=0;j--)
8733       {
8734         if(ba[j]==start+i*4+4) 
8735           temp_is32&=branch_regs[j].is32;
8736       }
8737       for(j=i;j<slen;j++)
8738       {
8739         if(ba[j]==start+i*4+4) 
8740           //temp_is32=1;
8741           temp_is32&=p32[j];
8742       }
8743       if(temp_is32!=current.is32) {
8744         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8745         for(hr=0;hr<HOST_REGS;hr++)
8746         {
8747           int r=current.regmap[hr];
8748           if(r>0)
8749           {
8750             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8751               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
8752               {
8753                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
8754                 {
8755                   //printf("dump %d/r%d\n",hr,r);
8756                   current.regmap[hr]=-1;
8757                   if(get_reg(current.regmap,r|64)>=0) 
8758                     current.regmap[get_reg(current.regmap,r|64)]=-1;
8759                 }
8760               }
8761             }
8762           }
8763         }
8764       }
8765     }
8766     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
8767     {
8768       uint64_t temp_is32=current.is32;
8769       for(j=i-1;j>=0;j--)
8770       {
8771         if(ba[j]==start+i*4+8) 
8772           temp_is32&=branch_regs[j].is32;
8773       }
8774       for(j=i;j<slen;j++)
8775       {
8776         if(ba[j]==start+i*4+8) 
8777           //temp_is32=1;
8778           temp_is32&=p32[j];
8779       }
8780       if(temp_is32!=current.is32) {
8781         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8782         for(hr=0;hr<HOST_REGS;hr++)
8783         {
8784           int r=current.regmap[hr];
8785           if(r>0)
8786           {
8787             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8788               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
8789               {
8790                 //printf("dump %d/r%d\n",hr,r);
8791                 current.regmap[hr]=-1;
8792                 if(get_reg(current.regmap,r|64)>=0) 
8793                   current.regmap[get_reg(current.regmap,r|64)]=-1;
8794               }
8795             }
8796           }
8797         }
8798       }
8799     }
8800     #endif
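    // Liveness bookkeeping (roughly): a set bit r in current.u marks
    // register r as unneeded at and after this instruction, and current.uu
    // does the same for the upper halves of 64-bit values, based on the
    // masks computed by unneeded_registers() in pass 2.  The sources of this
    // instruction (and, for branches, of the delay slot) are cleared from
    // the mask, and bit 0 ($zero) is always kept set.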
8801     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8802       if(i+1<slen) {
8803         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8804         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8805         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8806         current.u|=1;
8807         current.uu|=1;
8808       } else {
8809         current.u=1;
8810         current.uu=1;
8811       }
8812     } else {
8813       if(i+1<slen) {
8814         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8815         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8816         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8817         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8818         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8819         current.u|=1;
8820         current.uu|=1;
8821       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
8822     }
8823     is_ds[i]=ds;
8824     if(ds) {
8825       ds=0; // Skip delay slot, already allocated as part of branch
8826       // ...but we need to alloc it in case something jumps here
8827       if(i+1<slen) {
8828         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8829         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8830       }else{
8831         current.u=branch_unneeded_reg[i-1];
8832         current.uu=branch_unneeded_reg_upper[i-1];
8833       }
8834       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8835       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8836       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8837       current.u|=1;
8838       current.uu|=1;
8839       struct regstat temp;
8840       memcpy(&temp,&current,sizeof(current));
8841       temp.wasdirty=temp.dirty;
8842       temp.was32=temp.is32;
8843       // TODO: Take into account unconditional branches, as below
8844       delayslot_alloc(&temp,i);
8845       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8846       regs[i].wasdirty=temp.wasdirty;
8847       regs[i].was32=temp.was32;
8848       regs[i].dirty=temp.dirty;
8849       regs[i].is32=temp.is32;
8850       regs[i].isconst=0;
8851       regs[i].wasconst=0;
8852       current.isconst=0;
8853       // Create entry (branch target) regmap
8854       for(hr=0;hr<HOST_REGS;hr++)
8855       {
8856         int r=temp.regmap[hr];
8857         if(r>=0) {
8858           if(r!=regmap_pre[i][hr]) {
8859             regs[i].regmap_entry[hr]=-1;
8860           }
8861           else
8862           {
8863             if(r<64){
8864               if((current.u>>r)&1) {
8865                 regs[i].regmap_entry[hr]=-1;
8866                 regs[i].regmap[hr]=-1;
8867                 //Don't clear regs in the delay slot as the branch might need them
8868                 //current.regmap[hr]=-1;
8869               }else
8870                 regs[i].regmap_entry[hr]=r;
8871             }
8872             else {
8873               if((current.uu>>(r&63))&1) {
8874                 regs[i].regmap_entry[hr]=-1;
8875                 regs[i].regmap[hr]=-1;
8876                 //Don't clear regs in the delay slot as the branch might need them
8877                 //current.regmap[hr]=-1;
8878               }else
8879                 regs[i].regmap_entry[hr]=r;
8880             }
8881           }
8882         } else {
8883           // First instruction expects CCREG to be allocated
8884           if(i==0&&hr==HOST_CCREG) 
8885             regs[i].regmap_entry[hr]=CCREG;
8886           else
8887             regs[i].regmap_entry[hr]=-1;
8888         }
8889       }
8890     }
8891     else { // Not delay slot
8892       switch(itype[i]) {
8893         case UJUMP:
8894           //current.isconst=0; // DEBUG
8895           //current.wasconst=0; // DEBUG
8896           //regs[i].wasconst=0; // DEBUG
8897           clear_const(&current,rt1[i]);
8898           alloc_cc(&current,i);
8899           dirty_reg(&current,CCREG);
8900           ooo[i]=1;
8901           delayslot_alloc(&current,i+1);
8902           if (rt1[i]==31) {
8903             alloc_reg(&current,i,31);
8904             dirty_reg(&current,31);
8905             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8906             //assert(rt1[i+1]!=rt1[i]);
8907             #ifdef REG_PREFETCH
8908             alloc_reg(&current,i,PTEMP);
8909             #endif
8910             //current.is32|=1LL<<rt1[i];
8911           }
8912           //current.isconst=0; // DEBUG
8913           ds=1;
8914           //printf("i=%d, isconst=%x\n",i,current.isconst);
8915           break;
8916         case RJUMP:
8917           //current.isconst=0;
8918           //current.wasconst=0;
8919           //regs[i].wasconst=0;
8920           clear_const(&current,rs1[i]);
8921           clear_const(&current,rt1[i]);
8922           alloc_cc(&current,i);
8923           dirty_reg(&current,CCREG);
8924           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8925             alloc_reg(&current,i,rs1[i]);
8926             if (rt1[i]!=0) {
8927               alloc_reg(&current,i,rt1[i]);
8928               dirty_reg(&current,rt1[i]);
8929               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8930               assert(rt1[i+1]!=rt1[i]);
8931               #ifdef REG_PREFETCH
8932               alloc_reg(&current,i,PTEMP);
8933               #endif
8934             }
8935             #ifdef USE_MINI_HT
8936             if(rs1[i]==31) { // JALR
8937               alloc_reg(&current,i,RHASH);
8938               #ifndef HOST_IMM_ADDR32
8939               alloc_reg(&current,i,RHTBL);
8940               #endif
8941             }
8942             #endif
8943             delayslot_alloc(&current,i+1);
8944           } else {
8945             // The delay slot overwrites our source register,
8946             // allocate a temporary register to hold the old value.
8947             current.isconst=0;
8948             current.wasconst=0;
8949             regs[i].wasconst=0;
8950             delayslot_alloc(&current,i+1);
8951             current.isconst=0;
8952             alloc_reg(&current,i,RTEMP);
8953           }
8954           //current.isconst=0; // DEBUG
8955           ooo[i]=1;
8956           ds=1;
8957           break;
8958         case CJUMP:
8959           //current.isconst=0;
8960           //current.wasconst=0;
8961           //regs[i].wasconst=0;
8962           clear_const(&current,rs1[i]);
8963           clear_const(&current,rs2[i]);
8964           if((opcode[i]&0x3E)==4) // BEQ/BNE
8965           {
8966             alloc_cc(&current,i);
8967             dirty_reg(&current,CCREG);
8968             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8969             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8970             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8971             {
8972               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8973               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8974             }
8975             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8976                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8977               // The delay slot overwrites one of our conditions.
8978               // Allocate the branch condition registers instead.
8979               current.isconst=0;
8980               current.wasconst=0;
8981               regs[i].wasconst=0;
8982               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8983               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8984               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8985               {
8986                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8987                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8988               }
8989             }
8990             else
8991             {
8992               ooo[i]=1;
8993               delayslot_alloc(&current,i+1);
8994             }
8995           }
8996           else
8997           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8998           {
8999             alloc_cc(&current,i);
9000             dirty_reg(&current,CCREG);
9001             alloc_reg(&current,i,rs1[i]);
9002             if(!(current.is32>>rs1[i]&1))
9003             {
9004               alloc_reg64(&current,i,rs1[i]);
9005             }
9006             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
9007               // The delay slot overwrites one of our conditions.
9008               // Allocate the branch condition registers instead.
9009               current.isconst=0;
9010               current.wasconst=0;
9011               regs[i].wasconst=0;
9012               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9013               if(!((current.is32>>rs1[i])&1))
9014               {
9015                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9016               }
9017             }
9018             else
9019             {
9020               ooo[i]=1;
9021               delayslot_alloc(&current,i+1);
9022             }
9023           }
9024           else
9025           // Don't alloc the delay slot yet because we might not execute it
9026           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
9027           {
9028             current.isconst=0;
9029             current.wasconst=0;
9030             regs[i].wasconst=0;
9031             alloc_cc(&current,i);
9032             dirty_reg(&current,CCREG);
9033             alloc_reg(&current,i,rs1[i]);
9034             alloc_reg(&current,i,rs2[i]);
9035             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9036             {
9037               alloc_reg64(&current,i,rs1[i]);
9038               alloc_reg64(&current,i,rs2[i]);
9039             }
9040           }
9041           else
9042           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
9043           {
9044             current.isconst=0;
9045             current.wasconst=0;
9046             regs[i].wasconst=0;
9047             alloc_cc(&current,i);
9048             dirty_reg(&current,CCREG);
9049             alloc_reg(&current,i,rs1[i]);
9050             if(!(current.is32>>rs1[i]&1))
9051             {
9052               alloc_reg64(&current,i,rs1[i]);
9053             }
9054           }
9055           ds=1;
9056           //current.isconst=0;
9057           break;
9058         case SJUMP:
9059           //current.isconst=0;
9060           //current.wasconst=0;
9061           //regs[i].wasconst=0;
9062           clear_const(&current,rs1[i]);
9063           clear_const(&current,rt1[i]);
9064           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
9065           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
9066           {
9067             alloc_cc(&current,i);
9068             dirty_reg(&current,CCREG);
9069             alloc_reg(&current,i,rs1[i]);
9070             if(!(current.is32>>rs1[i]&1))
9071             {
9072               alloc_reg64(&current,i,rs1[i]);
9073             }
9074             if (rt1[i]==31) { // BLTZAL/BGEZAL
9075               alloc_reg(&current,i,31);
9076               dirty_reg(&current,31);
9077               //#ifdef REG_PREFETCH
9078               //alloc_reg(&current,i,PTEMP);
9079               //#endif
9080               //current.is32|=1LL<<rt1[i];
9081             }
9082             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
9083                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
9084               // Allocate the branch condition registers instead.
9085               current.isconst=0;
9086               current.wasconst=0;
9087               regs[i].wasconst=0;
9088               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9089               if(!((current.is32>>rs1[i])&1))
9090               {
9091                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9092               }
9093             }
9094             else
9095             {
9096               ooo[i]=1;
9097               delayslot_alloc(&current,i+1);
9098             }
9099           }
9100           else
9101           // Don't alloc the delay slot yet because we might not execute it
9102           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
9103           {
9104             current.isconst=0;
9105             current.wasconst=0;
9106             regs[i].wasconst=0;
9107             alloc_cc(&current,i);
9108             dirty_reg(&current,CCREG);
9109             alloc_reg(&current,i,rs1[i]);
9110             if(!(current.is32>>rs1[i]&1))
9111             {
9112               alloc_reg64(&current,i,rs1[i]);
9113             }
9114           }
9115           ds=1;
9116           //current.isconst=0;
9117           break;
9118         case FJUMP:
9119           current.isconst=0;
9120           current.wasconst=0;
9121           regs[i].wasconst=0;
9122           if(likely[i]==0) // BC1F/BC1T
9123           {
9124             // TODO: Theoretically we can run out of registers here on x86.
9125             // The delay slot can allocate up to six, and we need to check
9126             // CSREG before executing the delay slot.  Possibly we can drop
9127             // the cycle count and then reload it after checking that the
9128             // FPU is in a usable state, or don't do out-of-order execution.
9129             alloc_cc(&current,i);
9130             dirty_reg(&current,CCREG);
9131             alloc_reg(&current,i,FSREG);
9132             alloc_reg(&current,i,CSREG);
9133             if(itype[i+1]==FCOMP) {
9134               // The delay slot overwrites the branch condition.
9135               // Allocate the branch condition registers instead.
9136               alloc_cc(&current,i);
9137               dirty_reg(&current,CCREG);
9138               alloc_reg(&current,i,CSREG);
9139               alloc_reg(&current,i,FSREG);
9140             }
9141             else {
9142               ooo[i]=1;
9143               delayslot_alloc(&current,i+1);
9144               alloc_reg(&current,i+1,CSREG);
9145             }
9146           }
9147           else
9148           // Don't alloc the delay slot yet because we might not execute it
9149           if(likely[i]) // BC1FL/BC1TL
9150           {
9151             alloc_cc(&current,i);
9152             dirty_reg(&current,CCREG);
9153             alloc_reg(&current,i,CSREG);
9154             alloc_reg(&current,i,FSREG);
9155           }
9156           ds=1;
9157           current.isconst=0;
9158           break;
9159         case IMM16:
9160           imm16_alloc(&current,i);
9161           break;
9162         case LOAD:
9163         case LOADLR:
9164           load_alloc(&current,i);
9165           break;
9166         case STORE:
9167         case STORELR:
9168           store_alloc(&current,i);
9169           break;
9170         case ALU:
9171           alu_alloc(&current,i);
9172           break;
9173         case SHIFT:
9174           shift_alloc(&current,i);
9175           break;
9176         case MULTDIV:
9177           multdiv_alloc(&current,i);
9178           break;
9179         case SHIFTIMM:
9180           shiftimm_alloc(&current,i);
9181           break;
9182         case MOV:
9183           mov_alloc(&current,i);
9184           break;
9185         case COP0:
9186           cop0_alloc(&current,i);
9187           break;
9188         case COP1:
9189         case COP2:
9190           cop1_alloc(&current,i);
9191           break;
9192         case C1LS:
9193           c1ls_alloc(&current,i);
9194           break;
9195         case C2LS:
9196           c2ls_alloc(&current,i);
9197           break;
9198         case C2OP:
9199           c2op_alloc(&current,i);
9200           break;
9201         case FCONV:
9202           fconv_alloc(&current,i);
9203           break;
9204         case FLOAT:
9205           float_alloc(&current,i);
9206           break;
9207         case FCOMP:
9208           fcomp_alloc(&current,i);
9209           break;
9210         case SYSCALL:
9211         case HLECALL:
9212         case INTCALL:
9213           syscall_alloc(&current,i);
9214           break;
9215         case SPAN:
9216           pagespan_alloc(&current,i);
9217           break;
9218       }
9219       
9220       // Drop the upper half of registers that have become 32-bit
9221       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
9222       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9223         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9224         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9225         current.uu|=1;
9226       } else {
9227         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
9228         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9229         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9230         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9231         current.uu|=1;
9232       }
9233
9234       // Create entry (branch target) regmap
9235       for(hr=0;hr<HOST_REGS;hr++)
9236       {
9237         int r,or,er;
9238         r=current.regmap[hr];
9239         if(r>=0) {
9240           if(r!=regmap_pre[i][hr]) {
9241             // TODO: delay slot (?)
9242             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
9243             if(or<0||(r&63)>=TEMPREG){
9244               regs[i].regmap_entry[hr]=-1;
9245             }
9246             else
9247             {
9248               // Just move it to a different register
9249               regs[i].regmap_entry[hr]=r;
9250               // If it was dirty before, it's still dirty
9251               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
9252             }
9253           }
9254           else
9255           {
9256             // Unneeded
9257             if(r==0){
9258               regs[i].regmap_entry[hr]=0;
9259             }
9260             else
9261             if(r<64){
9262               if((current.u>>r)&1) {
9263                 regs[i].regmap_entry[hr]=-1;
9264                 //regs[i].regmap[hr]=-1;
9265                 current.regmap[hr]=-1;
9266               }else
9267                 regs[i].regmap_entry[hr]=r;
9268             }
9269             else {
9270               if((current.uu>>(r&63))&1) {
9271                 regs[i].regmap_entry[hr]=-1;
9272                 //regs[i].regmap[hr]=-1;
9273                 current.regmap[hr]=-1;
9274               }else
9275                 regs[i].regmap_entry[hr]=r;
9276             }
9277           }
9278         } else {
9279           // Branches expect CCREG to be allocated at the target
9280           if(regmap_pre[i][hr]==CCREG) 
9281             regs[i].regmap_entry[hr]=CCREG;
9282           else
9283             regs[i].regmap_entry[hr]=-1;
9284         }
9285       }
9286       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
9287     }
9288     /* Branch post-alloc */
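    // For the branch at i-1 (whose delay slot is the current instruction),
    // fill in branch_regs[i-1]: roughly, the register allocation state the
    // taken path of the branch sees, after the branch and its delay slot
    // have been allocated.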
9289     if(i>0)
9290     {
9291       current.was32=current.is32;
9292       current.wasdirty=current.dirty;
9293       switch(itype[i-1]) {
9294         case UJUMP:
9295           memcpy(&branch_regs[i-1],&current,sizeof(current));
9296           branch_regs[i-1].isconst=0;
9297           branch_regs[i-1].wasconst=0;
9298           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9299           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9300           alloc_cc(&branch_regs[i-1],i-1);
9301           dirty_reg(&branch_regs[i-1],CCREG);
9302           if(rt1[i-1]==31) { // JAL
9303             alloc_reg(&branch_regs[i-1],i-1,31);
9304             dirty_reg(&branch_regs[i-1],31);
9305             branch_regs[i-1].is32|=1LL<<31;
9306           }
9307           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9308           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9309           break;
9310         case RJUMP:
9311           memcpy(&branch_regs[i-1],&current,sizeof(current));
9312           branch_regs[i-1].isconst=0;
9313           branch_regs[i-1].wasconst=0;
9314           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9315           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9316           alloc_cc(&branch_regs[i-1],i-1);
9317           dirty_reg(&branch_regs[i-1],CCREG);
9318           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
9319           if(rt1[i-1]!=0) { // JALR
9320             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
9321             dirty_reg(&branch_regs[i-1],rt1[i-1]);
9322             branch_regs[i-1].is32|=1LL<<rt1[i-1];
9323           }
9324           #ifdef USE_MINI_HT
9325           if(rs1[i-1]==31) { // JALR
9326             alloc_reg(&branch_regs[i-1],i-1,RHASH);
9327             #ifndef HOST_IMM_ADDR32
9328             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
9329             #endif
9330           }
9331           #endif
9332           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9333           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9334           break;
9335         case CJUMP:
9336           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
9337           {
9338             alloc_cc(&current,i-1);
9339             dirty_reg(&current,CCREG);
9340             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
9341                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
9342               // The delay slot overwrote one of our conditions
9343               // Delay slot goes after the test (in order)
9344               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9345               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9346               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9347               current.u|=1;
9348               current.uu|=1;
9349               delayslot_alloc(&current,i);
9350               current.isconst=0;
9351             }
9352             else
9353             {
9354               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9355               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9356               // Alloc the branch condition registers
9357               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
9358               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
9359               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
9360               {
9361                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
9362                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
9363               }
9364             }
9365             memcpy(&branch_regs[i-1],&current,sizeof(current));
9366             branch_regs[i-1].isconst=0;
9367             branch_regs[i-1].wasconst=0;
9368             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9369             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9370           }
9371           else
9372           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
9373           {
9374             alloc_cc(&current,i-1);
9375             dirty_reg(&current,CCREG);
9376             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9377               // The delay slot overwrote the branch condition
9378               // Delay slot goes after the test (in order)
9379               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9380               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9381               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9382               current.u|=1;
9383               current.uu|=1;
9384               delayslot_alloc(&current,i);
9385               current.isconst=0;
9386             }
9387             else
9388             {
9389               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9390               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9391               // Alloc the branch condition register
9392               alloc_reg(&current,i-1,rs1[i-1]);
9393               if(!(current.is32>>rs1[i-1]&1))
9394               {
9395                 alloc_reg64(&current,i-1,rs1[i-1]);
9396               }
9397             }
9398             memcpy(&branch_regs[i-1],&current,sizeof(current));
9399             branch_regs[i-1].isconst=0;
9400             branch_regs[i-1].wasconst=0;
9401             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9402             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9403           }
9404           else
9405           // Alloc the delay slot in case the branch is taken
9406           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
9407           {
9408             memcpy(&branch_regs[i-1],&current,sizeof(current));
9409             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9410             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9411             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9412             alloc_cc(&branch_regs[i-1],i);
9413             dirty_reg(&branch_regs[i-1],CCREG);
9414             delayslot_alloc(&branch_regs[i-1],i);
9415             branch_regs[i-1].isconst=0;
9416             alloc_reg(&current,i,CCREG); // Not taken path
9417             dirty_reg(&current,CCREG);
9418             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9419           }
9420           else
9421           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
9422           {
9423             memcpy(&branch_regs[i-1],&current,sizeof(current));
9424             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9425             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9426             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9427             alloc_cc(&branch_regs[i-1],i);
9428             dirty_reg(&branch_regs[i-1],CCREG);
9429             delayslot_alloc(&branch_regs[i-1],i);
9430             branch_regs[i-1].isconst=0;
9431             alloc_reg(&current,i,CCREG); // Not taken path
9432             dirty_reg(&current,CCREG);
9433             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9434           }
9435           break;
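          // Note: ordinary branches always execute their delay slot, so the
          // slot is allocated in 'current' (the fall-through state).  The
          // "likely" variants (BEQL/BNEL and BLEZL/BGTZL above, BLTZL/BGEZL
          // and BC1FL/BC1TL below) nullify the slot when the branch is not
          // taken, so it is allocated only in branch_regs[i-1] (the taken
          // path) and the fall-through path just keeps the cycle count.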
9436         case SJUMP:
9437           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
9438           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
9439           {
9440             alloc_cc(&current,i-1);
9441             dirty_reg(&current,CCREG);
9442             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9443               // The delay slot overwrote the branch condition
9444               // Delay slot goes after the test (in order)
9445               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9446               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9447               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9448               current.u|=1;
9449               current.uu|=1;
9450               delayslot_alloc(&current,i);
9451               current.isconst=0;
9452             }
9453             else
9454             {
9455               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9456               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9457               // Alloc the branch condition register
9458               alloc_reg(&current,i-1,rs1[i-1]);
9459               if(!(current.is32>>rs1[i-1]&1))
9460               {
9461                 alloc_reg64(&current,i-1,rs1[i-1]);
9462               }
9463             }
9464             memcpy(&branch_regs[i-1],&current,sizeof(current));
9465             branch_regs[i-1].isconst=0;
9466             branch_regs[i-1].wasconst=0;
9467             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9468             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9469           }
9470           else
9471           // Alloc the delay slot in case the branch is taken
9472           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
9473           {
9474             memcpy(&branch_regs[i-1],&current,sizeof(current));
9475             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9476             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9477             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9478             alloc_cc(&branch_regs[i-1],i);
9479             dirty_reg(&branch_regs[i-1],CCREG);
9480             delayslot_alloc(&branch_regs[i-1],i);
9481             branch_regs[i-1].isconst=0;
9482             alloc_reg(&current,i,CCREG); // Not taken path
9483             dirty_reg(&current,CCREG);
9484             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9485           }
9486           // FIXME: BLTZAL/BGEZAL
9487           if(opcode2[i-1]&0x10) { // BxxZAL
9488             alloc_reg(&branch_regs[i-1],i-1,31);
9489             dirty_reg(&branch_regs[i-1],31);
9490             branch_regs[i-1].is32|=1LL<<31;
9491           }
9492           break;
9493         case FJUMP:
9494           if(likely[i-1]==0) // BC1F/BC1T
9495           {
9496             alloc_cc(&current,i-1);
9497             dirty_reg(&current,CCREG);
9498             if(itype[i]==FCOMP) {
9499               // The delay slot overwrote the branch condition
9500               // Delay slot goes after the test (in order)
9501               delayslot_alloc(&current,i);
9502               current.isconst=0;
9503             }
9504             else
9505             {
9506               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9507               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9508               // Alloc the branch condition register
9509               alloc_reg(&current,i-1,FSREG);
9510             }
9511             memcpy(&branch_regs[i-1],&current,sizeof(current));
9512             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9513           }
9514           else // BC1FL/BC1TL
9515           {
9516             // Alloc the delay slot in case the branch is taken
9517             memcpy(&branch_regs[i-1],&current,sizeof(current));
9518             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9519             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9520             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9521             alloc_cc(&branch_regs[i-1],i);
9522             dirty_reg(&branch_regs[i-1],CCREG);
9523             delayslot_alloc(&branch_regs[i-1],i);
9524             branch_regs[i-1].isconst=0;
9525             alloc_reg(&current,i,CCREG); // Not taken path
9526             dirty_reg(&current,CCREG);
9527             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9528           }
9529           break;
9530       }
9531
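      // After an unconditional jump the next instruction is only reached as a
      // jump target.  For subroutine calls (rt1==31) the return point is
      // entered from outside this block, so start from an empty register map
      // (only CCREG); otherwise inherit whatever the internal branch(es) that
      // jump here have allocated, dropping registers the targets disagree on.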
9532       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
9533       {
9534         if(rt1[i-1]==31) // JAL/JALR
9535         {
9536           // Subroutine call will return here, don't alloc any registers
9537           current.is32=1;
9538           current.dirty=0;
9539           clear_all_regs(current.regmap);
9540           alloc_reg(&current,i,CCREG);
9541           dirty_reg(&current,CCREG);
9542         }
9543         else if(i+1<slen)
9544         {
9545           // Internal branch will jump here, match registers to caller
9546           current.is32=0x3FFFFFFFFLL;
9547           current.dirty=0;
9548           clear_all_regs(current.regmap);
9549           alloc_reg(&current,i,CCREG);
9550           dirty_reg(&current,CCREG);
9551           for(j=i-1;j>=0;j--)
9552           {
9553             if(ba[j]==start+i*4+4) {
9554               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9555               current.is32=branch_regs[j].is32;
9556               current.dirty=branch_regs[j].dirty;
9557               break;
9558             }
9559           }
9560           while(j>=0) {
9561             if(ba[j]==start+i*4+4) {
9562               for(hr=0;hr<HOST_REGS;hr++) {
9563                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9564                   current.regmap[hr]=-1;
9565                 }
9566                 current.is32&=branch_regs[j].is32;
9567                 current.dirty&=branch_regs[j].dirty;
9568               }
9569             }
9570             j--;
9571           }
9572         }
9573       }
9574     }
9575
9576     // Count cycles in between branches
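    // ccadj[i] records how many cycles have accumulated since the last
    // branch; cc restarts at 0 after branches and syscalls.  On PCSX, stores
    // and C1LS add 2 and C2LS (GTE) adds 4 extra cycles (applied after
    // CLOCK_DIVIDER) as a rough memory-access penalty.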
9577     ccadj[i]=cc;
9578     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
9579     {
9580       cc=0;
9581     }
9582 #ifdef PCSX
9583     else if(/*itype[i]==LOAD||*/itype[i]==STORE||itype[i]==C1LS) // load causes weird timing issues
9584     {
9585       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
9586     }
9587     else if(itype[i]==C2LS)
9588     {
9589       cc+=4;
9590     }
9591 #endif
9592     else
9593     {
9594       cc++;
9595     }
9596
9597     flush_dirty_uppers(&current);
9598     if(!is_ds[i]) {
9599       regs[i].is32=current.is32;
9600       regs[i].dirty=current.dirty;
9601       regs[i].isconst=current.isconst;
9602       memcpy(constmap[i],current.constmap,sizeof(current.constmap));
9603     }
9604     for(hr=0;hr<HOST_REGS;hr++) {
9605       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9606         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9607           regs[i].wasconst&=~(1<<hr);
9608         }
9609       }
9610     }
9611     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9612   }
9613   
9614   /* Pass 4 - Cull unused host registers */
9615   
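  // This pass walks the block backwards maintaining 'nr', a bitmask of host
  // registers whose current contents are still needed.  Roughly:
  //   nr |=  1<<hr   when hr holds a source register of this instruction
  //                  (or of the delay slot, for branches)
  //   nr &= ~(1<<hr) when the guest register in hr is overwritten, or the
  //                  mapping changes before the value is read again
  // The result is saved in needed_reg[i], and mappings for host registers
  // not in nr are released further below.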
9616   uint64_t nr=0;
9617   
9618   for (i=slen-1;i>=0;i--)
9619   {
9620     int hr;
9621     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9622     {
9623       if(ba[i]<start || ba[i]>=(start+slen*4))
9624       {
9625         // Branch out of this block, don't need anything
9626         nr=0;
9627       }
9628       else
9629       {
9630         // Internal branch
9631         // Need whatever matches the target
9632         nr=0;
9633         int t=(ba[i]-start)>>2;
9634         for(hr=0;hr<HOST_REGS;hr++)
9635         {
9636           if(regs[i].regmap_entry[hr]>=0) {
9637             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9638           }
9639         }
9640       }
9641       // Conditional branch may need registers for following instructions
9642       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9643       {
9644         if(i<slen-2) {
9645           nr|=needed_reg[i+2];
9646           for(hr=0;hr<HOST_REGS;hr++)
9647           {
9648             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9649             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9650           }
9651         }
9652       }
9653       // Don't need stuff which is overwritten
9654       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9655       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9656       // Merge in delay slot
9657       for(hr=0;hr<HOST_REGS;hr++)
9658       {
9659         if(!likely[i]) {
9660           // These are overwritten unless the branch is "likely"
9661           // and the delay slot is nullified if not taken
9662           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9663           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9664         }
9665         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9666         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9667         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9668         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9669         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9670         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9671         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9672         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9673         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9674           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9675           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9676         }
9677         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9678           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9679           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9680         }
9681         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
9682           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9683           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9684         }
9685       }
9686     }
9687     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
9688     {
9689       // SYSCALL instruction (software interrupt)
9690       nr=0;
9691     }
9692     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9693     {
9694       // ERET instruction (return from interrupt)
9695       nr=0;
9696     }
9697     else // Non-branch
9698     {
9699       if(i<slen-1) {
9700         for(hr=0;hr<HOST_REGS;hr++) {
9701           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9702           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9703           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9704           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9705         }
9706       }
9707     }
9708     for(hr=0;hr<HOST_REGS;hr++)
9709     {
9710       // Overwritten registers are not needed
9711       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9712       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9713       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9714       // Source registers are needed
9715       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9716       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9717       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9718       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9719       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9720       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9721       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9722       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9723       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9724         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9725         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9726       }
9727       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9728         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9729         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9730       }
9731       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
9732         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9733         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9734       }
9735       // Don't store a register immediately after writing it, since that
9736       // may prevent dual-issue.
9737       // But do store it if this is a branch target; otherwise we
9738       // might have to load the register before the branch.
9739       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9740         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9741            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9742           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9743           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9744         }
9745         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9746            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9747           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9748           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9749         }
9750       }
9751     }
9752     // Cycle count is needed at branches.  Assume it is needed at the target too.
9753     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9754       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9755       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9756     }
9757     // Save it
9758     needed_reg[i]=nr;
9759     
9760     // Deallocate unneeded registers
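    // A register is only released if it is not a source or target of this
    // instruction (or, for branches, of the delay slot) and is not one of the
    // special temporaries the assembler still expects (PTEMP/FTEMP/CCREG,
    // RHASH/RHTBL, the TLB/invalid-code map pointer).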
9761     for(hr=0;hr<HOST_REGS;hr++)
9762     {
9763       if(!((nr>>hr)&1)) {
9764         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9765         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9766            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9767            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9768         {
9769           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9770           {
9771             if(likely[i]) {
9772               regs[i].regmap[hr]=-1;
9773               regs[i].isconst&=~(1<<hr);
9774               if(i<slen-2) {
9775                 regmap_pre[i+2][hr]=-1;
9776                 regs[i+2].wasconst&=~(1<<hr);
9777               }
9778             }
9779           }
9780         }
9781         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9782         {
9783           int d1=0,d2=0,map=0,temp=0;
9784           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9785           {
9786             d1=dep1[i+1];
9787             d2=dep2[i+1];
9788           }
9789           if(using_tlb) {
9790             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
9791                itype[i+1]==STORE || itype[i+1]==STORELR ||
9792                itype[i+1]==C1LS || itype[i+1]==C2LS)
9793             map=TLREG;
9794           } else
9795           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
9796              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9797             map=INVCP;
9798           }
9799           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9800              itype[i+1]==C1LS || itype[i+1]==C2LS)
9801             temp=FTEMP;
9802           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9803              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9804              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9805              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9806              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9807              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9808              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9809              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9810              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9811              regs[i].regmap[hr]!=map )
9812           {
9813             regs[i].regmap[hr]=-1;
9814             regs[i].isconst&=~(1<<hr);
9815             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9816                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9817                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9818                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9819                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9820                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9821                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9822                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9823                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9824                branch_regs[i].regmap[hr]!=map)
9825             {
9826               branch_regs[i].regmap[hr]=-1;
9827               branch_regs[i].regmap_entry[hr]=-1;
9828               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9829               {
9830                 if(!likely[i]&&i<slen-2) {
9831                   regmap_pre[i+2][hr]=-1;
9832                   regs[i+2].wasconst&=~(1<<hr);
9833                 }
9834               }
9835             }
9836           }
9837         }
9838         else
9839         {
9840           // Non-branch
9841           if(i>0)
9842           {
9843             int d1=0,d2=0,map=-1,temp=-1;
9844             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9845             {
9846               d1=dep1[i];
9847               d2=dep2[i];
9848             }
9849             if(using_tlb) {
9850               if(itype[i]==LOAD || itype[i]==LOADLR ||
9851                  itype[i]==STORE || itype[i]==STORELR ||
9852                  itype[i]==C1LS || itype[i]==C2LS)
9853               map=TLREG;
9854             } else if(itype[i]==STORE || itype[i]==STORELR ||
9855                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9856               map=INVCP;
9857             }
9858             if(itype[i]==LOADLR || itype[i]==STORELR ||
9859                itype[i]==C1LS || itype[i]==C2LS)
9860               temp=FTEMP;
9861             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9862                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9863                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9864                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9865                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9866                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9867             {
9868               if(i<slen-1&&!is_ds[i]) {
9869                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9870                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9871                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9872                 {
9873                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9874                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9875                 }
9876                 regmap_pre[i+1][hr]=-1;
9877                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9878                 regs[i+1].wasconst&=~(1<<hr);
9879               }
9880               regs[i].regmap[hr]=-1;
9881               regs[i].isconst&=~(1<<hr);
9882             }
9883           }
9884         }
9885       }
9886     }
9887   }
9888   
9889   /* Pass 5 - Pre-allocate registers */
9890   
9891   // If a register is allocated during a loop, try to allocate it for the
9892   // entire loop, if possible.  This avoids loading/storing registers
9893   // inside of the loop.
9894   
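  // f_regmap[] is the candidate per-host-register mapping to extend across a
  // loop: when a backward branch with a simple delay slot is found, the pass
  // tries to keep that mapping live from the branch target all the way to the
  // branch, filling in regmap_entry/regmap for every instruction in between.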
9895   signed char f_regmap[HOST_REGS];
9896   clear_all_regs(f_regmap);
9897   for(i=0;i<slen-1;i++)
9898   {
9899     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9900     {
9901       if(ba[i]>=start && ba[i]<(start+i*4)) 
9902       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9903       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9904       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9905       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9906       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9907       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9908       {
9909         int t=(ba[i]-start)>>2;
9910         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9911         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
9912         for(hr=0;hr<HOST_REGS;hr++)
9913         {
9914           if(regs[i].regmap[hr]>64) {
9915             if(!((regs[i].dirty>>hr)&1))
9916               f_regmap[hr]=regs[i].regmap[hr];
9917             else f_regmap[hr]=-1;
9918           }
9919           else if(regs[i].regmap[hr]>=0) {
9920             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9921               // dealloc old register
9922               int n;
9923               for(n=0;n<HOST_REGS;n++)
9924               {
9925                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9926               }
9927               // and alloc new one
9928               f_regmap[hr]=regs[i].regmap[hr];
9929             }
9930           }
9931           if(branch_regs[i].regmap[hr]>64) {
9932             if(!((branch_regs[i].dirty>>hr)&1))
9933               f_regmap[hr]=branch_regs[i].regmap[hr];
9934             else f_regmap[hr]=-1;
9935           }
9936           else if(branch_regs[i].regmap[hr]>=0) {
9937             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9938               // dealloc old register
9939               int n;
9940               for(n=0;n<HOST_REGS;n++)
9941               {
9942                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9943               }
9944               // and alloc new one
9945               f_regmap[hr]=branch_regs[i].regmap[hr];
9946             }
9947           }
9948           if(ooo[i]) {
9949             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) 
9950               f_regmap[hr]=branch_regs[i].regmap[hr];
9951           }else{
9952             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) 
9953               f_regmap[hr]=branch_regs[i].regmap[hr];
9954           }
9955           // Avoid dirty->clean transition
9956           #ifdef DESTRUCTIVE_WRITEBACK
9957           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9958           #endif
9959           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9960           // case above; however, it's always a good idea.  We can't hoist the
9961           // load if the register was already allocated, so there's no point
9962           // wasting time analyzing most of these cases.  It only "succeeds"
9963           // when the mapping was different and the load can be replaced with
9964           // a mov, which is of negligible benefit.  So such cases are
9965           // skipped below.
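          // Scan from the loop head t up to the branch at i: abort if the
          // guest register becomes unneeded, gets mapped elsewhere, changes
          // 32/64-bit width, or a branch/lack of free registers gets in the
          // way; otherwise back-fill the mapping over [t,j) (and across the
          // branch itself) so the value stays in the same host register for
          // the whole loop.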
9966           if(f_regmap[hr]>0) {
9967             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
9968               int r=f_regmap[hr];
9969               for(j=t;j<=i;j++)
9970               {
9971                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9972                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9973                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9974                 if(r>63) {
9975                   // NB This can exclude the case where the upper-half
9976                   // register is lower numbered than the lower-half
9977                   // register.  Not sure if it's worth fixing...
9978                   if(get_reg(regs[j].regmap,r&63)<0) break;
9979                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9980                   if(regs[j].is32&(1LL<<(r&63))) break;
9981                 }
9982                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9983                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9984                   int k;
9985                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9986                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9987                     if(r>63) {
9988                       if(get_reg(regs[i].regmap,r&63)<0) break;
9989                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9990                     }
9991                     k=i;
9992                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9993                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9994                         //printf("no free regs for store %x\n",start+(k-1)*4);
9995                         break;
9996                       }
9997                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9998                         //printf("no-match due to different register\n");
9999                         break;
10000                       }
10001                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
10002                         //printf("no-match due to branch\n");
10003                         break;
10004                       }
10005                       // call/ret fast path assumes no registers allocated
10006                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
10007                         break;
10008                       }
10009                       if(r>63) {
10010                         // NB This can exclude the case where the upper-half
10011                         // register is lower numbered than the lower-half
10012                         // register.  Not sure if it's worth fixing...
10013                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
10014                         if(regs[k-1].is32&(1LL<<(r&63))) break;
10015                       }
10016                       k--;
10017                     }
10018                     if(i<slen-1) {
10019                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
10020                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
10021                         //printf("bad match after branch\n");
10022                         break;
10023                       }
10024                     }
10025                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
10026                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
10027                       while(k<i) {
10028                         regs[k].regmap_entry[hr]=f_regmap[hr];
10029                         regs[k].regmap[hr]=f_regmap[hr];
10030                         regmap_pre[k+1][hr]=f_regmap[hr];
10031                         regs[k].wasdirty&=~(1<<hr);
10032                         regs[k].dirty&=~(1<<hr);
10033                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
10034                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
10035                         regs[k].wasconst&=~(1<<hr);
10036                         regs[k].isconst&=~(1<<hr);
10037                         k++;
10038                       }
10039                     }
10040                     else {
10041                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
10042                       break;
10043                     }
10044                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
10045                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
10046                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
10047                       regs[i].regmap_entry[hr]=f_regmap[hr];
10048                       regs[i].regmap[hr]=f_regmap[hr];
10049                       regs[i].wasdirty&=~(1<<hr);
10050                       regs[i].dirty&=~(1<<hr);
10051                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
10052                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
10053                       regs[i].wasconst&=~(1<<hr);
10054                       regs[i].isconst&=~(1<<hr);
10055                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
10056                       branch_regs[i].wasdirty&=~(1<<hr);
10057                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
10058                       branch_regs[i].regmap[hr]=f_regmap[hr];
10059                       branch_regs[i].dirty&=~(1<<hr);
10060                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
10061                       branch_regs[i].wasconst&=~(1<<hr);
10062                       branch_regs[i].isconst&=~(1<<hr);
10063                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
10064                         regmap_pre[i+2][hr]=f_regmap[hr];
10065                         regs[i+2].wasdirty&=~(1<<hr);
10066                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
10067                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
10068                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
10069                       }
10070                     }
10071                   }
10072                   for(k=t;k<j;k++) {
10073                     // Alloc register clean at beginning of loop,
10074                     // but may dirty it in pass 6
10075                     regs[k].regmap_entry[hr]=f_regmap[hr];
10076                     regs[k].regmap[hr]=f_regmap[hr];
10077                     regs[k].dirty&=~(1<<hr);
10078                     regs[k].wasconst&=~(1<<hr);
10079                     regs[k].isconst&=~(1<<hr);
10080                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
10081                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
10082                       branch_regs[k].regmap[hr]=f_regmap[hr];
10083                       branch_regs[k].dirty&=~(1<<hr);
10084                       branch_regs[k].wasconst&=~(1<<hr);
10085                       branch_regs[k].isconst&=~(1<<hr);
10086                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
10087                         regmap_pre[k+2][hr]=f_regmap[hr];
10088                         regs[k+2].wasdirty&=~(1<<hr);
10089                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
10090                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
10091                       }
10092                     }
10093                     else
10094                     {
10095                       regmap_pre[k+1][hr]=f_regmap[hr];
10096                       regs[k+1].wasdirty&=~(1<<hr);
10097                     }
10098                   }
10099                   if(regs[j].regmap[hr]==f_regmap[hr])
10100                     regs[j].regmap_entry[hr]=f_regmap[hr];
10101                   break;
10102                 }
10103                 if(j==i) break;
10104                 if(regs[j].regmap[hr]>=0)
10105                   break;
10106                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
10107                   //printf("no-match due to different register\n");
10108                   break;
10109                 }
10110                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
10111                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
10112                   break;
10113                 }
10114                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10115                 {
10116                   // Stop on unconditional branch
10117                   break;
10118                 }
10119                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
10120                 {
10121                   if(ooo[j]) {
10122                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) 
10123                       break;
10124                   }else{
10125                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) 
10126                       break;
10127                   }
10128                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
10129                     //printf("no-match due to different register (branch)\n");
10130                     break;
10131                   }
10132                 }
10133                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10134                   //printf("No free regs for store %x\n",start+j*4);
10135                   break;
10136                 }
10137                 if(f_regmap[hr]>=64) {
10138                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
10139                     break;
10140                   }
10141                   else
10142                   {
10143                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
10144                       break;
10145                     }
10146                   }
10147                 }
10148               }
10149             }
10150           }
10151         }
10152       }
10153     }else{
10154       // Non branch or undetermined branch target
10155       for(hr=0;hr<HOST_REGS;hr++)
10156       {
10157         if(hr!=EXCLUDE_REG) {
10158           if(regs[i].regmap[hr]>64) {
10159             if(!((regs[i].dirty>>hr)&1))
10160               f_regmap[hr]=regs[i].regmap[hr];
10161           }
10162           else if(regs[i].regmap[hr]>=0) {
10163             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10164               // dealloc old register
10165               int n;
10166               for(n=0;n<HOST_REGS;n++)
10167               {
10168                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10169               }
10170               // and alloc new one
10171               f_regmap[hr]=regs[i].regmap[hr];
10172             }
10173           }
10174         }
10175       }
10176       // Try to restore cycle count at branch targets
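      // If CCREG gets allocated again a few instructions after this target,
      // extend the allocation back to (and before) the target so the cycle
      // counter does not have to be spilled and reloaded around it.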
10177       if(bt[i]) {
10178         for(j=i;j<slen-1;j++) {
10179           if(regs[j].regmap[HOST_CCREG]!=-1) break;
10180           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10181             //printf("no free regs for store %x\n",start+j*4);
10182             break;
10183           }
10184         }
10185         if(regs[j].regmap[HOST_CCREG]==CCREG) {
10186           int k=i;
10187           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
10188           while(k<j) {
10189             regs[k].regmap_entry[HOST_CCREG]=CCREG;
10190             regs[k].regmap[HOST_CCREG]=CCREG;
10191             regmap_pre[k+1][HOST_CCREG]=CCREG;
10192             regs[k+1].wasdirty|=1<<HOST_CCREG;
10193             regs[k].dirty|=1<<HOST_CCREG;
10194             regs[k].wasconst&=~(1<<HOST_CCREG);
10195             regs[k].isconst&=~(1<<HOST_CCREG);
10196             k++;
10197           }
10198           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
10199         }
10200         // Work backwards from the branch target
10201         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
10202         {
10203           //printf("Extend backwards\n");
10204           int k;
10205           k=i;
10206           while(regs[k-1].regmap[HOST_CCREG]==-1) {
10207             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10208               //printf("no free regs for store %x\n",start+(k-1)*4);
10209               break;
10210             }
10211             k--;
10212           }
10213           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
10214             //printf("Extend CC, %x ->\n",start+k*4);
10215             while(k<=i) {
10216               regs[k].regmap_entry[HOST_CCREG]=CCREG;
10217               regs[k].regmap[HOST_CCREG]=CCREG;
10218               regmap_pre[k+1][HOST_CCREG]=CCREG;
10219               regs[k+1].wasdirty|=1<<HOST_CCREG;
10220               regs[k].dirty|=1<<HOST_CCREG;
10221               regs[k].wasconst&=~(1<<HOST_CCREG);
10222               regs[k].isconst&=~(1<<HOST_CCREG);
10223               k++;
10224             }
10225           }
10226           else {
10227             //printf("Fail Extend CC, %x ->\n",start+k*4);
10228           }
10229         }
10230       }
10231       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
10232          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
10233          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
10234          itype[i]!=FCONV&&itype[i]!=FCOMP)
10235       {
10236         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
10237       }
10238     }
10239   }
10240   
10241   // Cache memory offset or tlb map pointer if a register is available
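  // Heuristic: for each host register, score how many nearby loads/stores
  // (and loop branches back to an earlier target) could reuse a cached map
  // pointer (MMREG when using the TLB, otherwise ROREG).  earliest_available
  // tracks where the register becomes free, loop_start the loop head it could
  // be preloaded at; the highest-scoring register is allocated over that
  // whole range.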
10242   #ifndef HOST_IMM_ADDR32
10243   #ifndef RAM_OFFSET
10244   if(using_tlb)
10245   #endif
10246   {
10247     int earliest_available[HOST_REGS];
10248     int loop_start[HOST_REGS];
10249     int score[HOST_REGS];
10250     int end[HOST_REGS];
10251     int reg=using_tlb?MMREG:ROREG;
10252
10253     // Init
10254     for(hr=0;hr<HOST_REGS;hr++) {
10255       score[hr]=0;earliest_available[hr]=0;
10256       loop_start[hr]=MAXBLOCK;
10257     }
10258     for(i=0;i<slen-1;i++)
10259     {
10260       // Can't do anything if no registers are available
10261       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
10262         for(hr=0;hr<HOST_REGS;hr++) {
10263           score[hr]=0;earliest_available[hr]=i+1;
10264           loop_start[hr]=MAXBLOCK;
10265         }
10266       }
10267       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10268         if(!ooo[i]) {
10269           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
10270             for(hr=0;hr<HOST_REGS;hr++) {
10271               score[hr]=0;earliest_available[hr]=i+1;
10272               loop_start[hr]=MAXBLOCK;
10273             }
10274           }
10275         }else{
10276           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
10277             for(hr=0;hr<HOST_REGS;hr++) {
10278               score[hr]=0;earliest_available[hr]=i+1;
10279               loop_start[hr]=MAXBLOCK;
10280             }
10281           }
10282         }
10283       }
10284       // Mark unavailable registers
10285       for(hr=0;hr<HOST_REGS;hr++) {
10286         if(regs[i].regmap[hr]>=0) {
10287           score[hr]=0;earliest_available[hr]=i+1;
10288           loop_start[hr]=MAXBLOCK;
10289         }
10290         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10291           if(branch_regs[i].regmap[hr]>=0) {
10292             score[hr]=0;earliest_available[hr]=i+2;
10293             loop_start[hr]=MAXBLOCK;
10294           }
10295         }
10296       }
10297       // No register allocations after unconditional jumps
10298       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10299       {
10300         for(hr=0;hr<HOST_REGS;hr++) {
10301           score[hr]=0;earliest_available[hr]=i+2;
10302           loop_start[hr]=MAXBLOCK;
10303         }
10304         i++; // Skip delay slot too
10305         //printf("skip delay slot: %x\n",start+i*4);
10306       }
10307       else
10308       // Possible match
10309       if(itype[i]==LOAD||itype[i]==LOADLR||
10310          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
10311         for(hr=0;hr<HOST_REGS;hr++) {
10312           if(hr!=EXCLUDE_REG) {
10313             end[hr]=i-1;
10314             for(j=i;j<slen-1;j++) {
10315               if(regs[j].regmap[hr]>=0) break;
10316               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10317                 if(branch_regs[j].regmap[hr]>=0) break;
10318                 if(ooo[j]) {
10319                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
10320                 }else{
10321                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
10322                 }
10323               }
10324               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
10325               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10326                 int t=(ba[j]-start)>>2;
10327                 if(t<j&&t>=earliest_available[hr]) {
10328                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
10329                     // Score a point for hoisting loop invariant
10330                     if(t<loop_start[hr]) loop_start[hr]=t;
10331                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
10332                     score[hr]++;
10333                     end[hr]=j;
10334                   }
10335                 }
10336                 else if(t<j) {
10337                   if(regs[t].regmap[hr]==reg) {
10338                     // Score a point if the branch target matches this register
10339                     score[hr]++;
10340                     end[hr]=j;
10341                   }
10342                 }
10343                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
10344                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
10345                   score[hr]++;
10346                   end[hr]=j;
10347                 }
10348               }
10349               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10350               {
10351                 // Stop on unconditional branch
10352                 break;
10353               }
10354               else
10355               if(itype[j]==LOAD||itype[j]==LOADLR||
10356                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
10357                 score[hr]++;
10358                 end[hr]=j;
10359               }
10360             }
10361           }
10362         }
10363         // Find highest score and allocate that register
10364         int maxscore=0;
10365         for(hr=0;hr<HOST_REGS;hr++) {
10366           if(hr!=EXCLUDE_REG) {
10367             if(score[hr]>score[maxscore]) {
10368               maxscore=hr;
10369               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
10370             }
10371           }
10372         }
10373         if(score[maxscore]>1)
10374         {
10375           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
10376           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
10377             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
10378             assert(regs[j].regmap[maxscore]<0);
10379             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
10380             regs[j].regmap[maxscore]=reg;
10381             regs[j].dirty&=~(1<<maxscore);
10382             regs[j].wasconst&=~(1<<maxscore);
10383             regs[j].isconst&=~(1<<maxscore);
10384             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10385               branch_regs[j].regmap[maxscore]=reg;
10386               branch_regs[j].wasdirty&=~(1<<maxscore);
10387               branch_regs[j].dirty&=~(1<<maxscore);
10388               branch_regs[j].wasconst&=~(1<<maxscore);
10389               branch_regs[j].isconst&=~(1<<maxscore);
10390               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
10391                 regmap_pre[j+2][maxscore]=reg;
10392                 regs[j+2].wasdirty&=~(1<<maxscore);
10393               }
10394               // loop optimization (loop_preload)
10395               int t=(ba[j]-start)>>2;
10396               if(t==loop_start[maxscore]) {
10397                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
10398                   regs[t].regmap_entry[maxscore]=reg;
10399               }
10400             }
10401             else
10402             {
10403               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
10404                 regmap_pre[j+1][maxscore]=reg;
10405                 regs[j+1].wasdirty&=~(1<<maxscore);
10406               }
10407             }
10408           }
10409           i=j-1;
10410           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
10411           for(hr=0;hr<HOST_REGS;hr++) {
10412             score[hr]=0;earliest_available[hr]=i+1;
10413             loop_start[hr]=MAXBLOCK;
10414           }
10415         }
10416       }
10417     }
10418   }
10419   #endif
10420   
10421   // This allocates registers (if possible) one instruction prior
10422   // to use, which can avoid a load-use penalty on certain CPUs.
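  // If the next instruction wants guest register rs1/rs2 (or an address or
  // FTEMP temporary) in host register hr, and hr is free here, pull the
  // mapping back one instruction so the load is issued a cycle early.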
10423   for(i=0;i<slen-1;i++)
10424   {
10425     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
10426     {
10427       if(!bt[i+1])
10428       {
10429         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
10430            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
10431         {
10432           if(rs1[i+1]) {
10433             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
10434             {
10435               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10436               {
10437                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10438                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10439                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10440                 regs[i].isconst&=~(1<<hr);
10441                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10442                 constmap[i][hr]=constmap[i+1][hr];
10443                 regs[i+1].wasdirty&=~(1<<hr);
10444                 regs[i].dirty&=~(1<<hr);
10445               }
10446             }
10447           }
10448           if(rs2[i+1]) {
10449             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
10450             {
10451               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10452               {
10453                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10454                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10455                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10456                 regs[i].isconst&=~(1<<hr);
10457                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10458                 constmap[i][hr]=constmap[i+1][hr];
10459                 regs[i+1].wasdirty&=~(1<<hr);
10460                 regs[i].dirty&=~(1<<hr);
10461               }
10462             }
10463           }
10464           // Preload target address for load instruction (non-constant)
10465           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10466             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10467             {
10468               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10469               {
10470                 regs[i].regmap[hr]=rs1[i+1];
10471                 regmap_pre[i+1][hr]=rs1[i+1];
10472                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10473                 regs[i].isconst&=~(1<<hr);
10474                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10475                 constmap[i][hr]=constmap[i+1][hr];
10476                 regs[i+1].wasdirty&=~(1<<hr);
10477                 regs[i].dirty&=~(1<<hr);
10478               }
10479             }
10480           }
10481           // Load source into target register 
10482           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10483             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10484             {
10485               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10486               {
10487                 regs[i].regmap[hr]=rs1[i+1];
10488                 regmap_pre[i+1][hr]=rs1[i+1];
10489                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10490                 regs[i].isconst&=~(1<<hr);
10491                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10492                 constmap[i][hr]=constmap[i+1][hr];
10493                 regs[i+1].wasdirty&=~(1<<hr);
10494                 regs[i].dirty&=~(1<<hr);
10495               }
10496             }
10497           }
10498           // Preload map address
10499           #ifndef HOST_IMM_ADDR32
10500           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
10501             hr=get_reg(regs[i+1].regmap,TLREG);
10502             if(hr>=0) {
10503               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
10504               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
10505                 int nr;
10506                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10507                 {
10508                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
10509                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
10510                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
10511                   regs[i].isconst&=~(1<<hr);
10512                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10513                   constmap[i][hr]=constmap[i+1][hr];
10514                   regs[i+1].wasdirty&=~(1<<hr);
10515                   regs[i].dirty&=~(1<<hr);
10516                 }
10517                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10518                 {
10519                   // move it to another register
10520                   regs[i+1].regmap[hr]=-1;
10521                   regmap_pre[i+2][hr]=-1;
10522                   regs[i+1].regmap[nr]=TLREG;
10523                   regmap_pre[i+2][nr]=TLREG;
10524                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
10525                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
10526                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
10527                   regs[i].isconst&=~(1<<nr);
10528                   regs[i+1].isconst&=~(1<<nr);
10529                   regs[i].dirty&=~(1<<nr);
10530                   regs[i+1].wasdirty&=~(1<<nr);
10531                   regs[i+1].dirty&=~(1<<nr);
10532                   regs[i+2].wasdirty&=~(1<<nr);
10533                 }
10534               }
10535             }
10536           }
10537           #endif
10538           // Address for store instruction (non-constant)
10539           if(itype[i+1]==STORE||itype[i+1]==STORELR
10540              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
10541             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10542               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
10543               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10544               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
10545               assert(hr>=0);
10546               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10547               {
10548                 regs[i].regmap[hr]=rs1[i+1];
10549                 regmap_pre[i+1][hr]=rs1[i+1];
10550                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10551                 regs[i].isconst&=~(1<<hr);
10552                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10553                 constmap[i][hr]=constmap[i+1][hr];
10554                 regs[i+1].wasdirty&=~(1<<hr);
10555                 regs[i].dirty&=~(1<<hr);
10556               }
10557             }
10558           }
10559           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
10560             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10561               int nr;
10562               hr=get_reg(regs[i+1].regmap,FTEMP);
10563               assert(hr>=0);
10564               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10565               {
10566                 regs[i].regmap[hr]=rs1[i+1];
10567                 regmap_pre[i+1][hr]=rs1[i+1];
10568                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10569                 regs[i].isconst&=~(1<<hr);
10570                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10571                 constmap[i][hr]=constmap[i+1][hr];
10572                 regs[i+1].wasdirty&=~(1<<hr);
10573                 regs[i].dirty&=~(1<<hr);
10574               }
10575               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10576               {
10577                 // move it to another register
10578                 regs[i+1].regmap[hr]=-1;
10579                 regmap_pre[i+2][hr]=-1;
10580                 regs[i+1].regmap[nr]=FTEMP;
10581                 regmap_pre[i+2][nr]=FTEMP;
10582                 regs[i].regmap[nr]=rs1[i+1];
10583                 regmap_pre[i+1][nr]=rs1[i+1];
10584                 regs[i+1].regmap_entry[nr]=rs1[i+1];
10585                 regs[i].isconst&=~(1<<nr);
10586                 regs[i+1].isconst&=~(1<<nr);
10587                 regs[i].dirty&=~(1<<nr);
10588                 regs[i+1].wasdirty&=~(1<<nr);
10589                 regs[i+1].dirty&=~(1<<nr);
10590                 regs[i+2].wasdirty&=~(1<<nr);
10591               }
10592             }
10593           }
10594           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
10595             if(itype[i+1]==LOAD) 
10596               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
10597             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
10598               hr=get_reg(regs[i+1].regmap,FTEMP);
10599             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
10600               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
10601               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10602             }
10603             if(hr>=0&&regs[i].regmap[hr]<0) {
10604               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
10605               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
10606                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
10607                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
10608                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
10609                 regs[i].isconst&=~(1<<hr);
10610                 regs[i+1].wasdirty&=~(1<<hr);
10611                 regs[i].dirty&=~(1<<hr);
10612               }
10613             }
10614           }
10615         }
10616       }
10617     }
10618   }
10619   
10620   /* Pass 6 - Optimize clean/dirty state */
10621   clean_registers(0,slen-1,1);
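        // (clean_registers() appears to propagate clean/dirty state across the block so
        //  that Pass 8 can skip write-backs for registers that still match memory.)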
10622   
10623   /* Pass 7 - Identify 32-bit registers */
10624 #ifndef FORCE32
10625   provisional_r32();
10626
10627   u_int r32=0;
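        // Walking backwards, r32 collects the registers whose 32-bit (sign-extended)
        // state the remainder of the block depends on; the per-instruction result is
        // stored in requires_32bit[] and later restricts external entry points
        // (see the ll_add_32 calls below).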
10628   
10629   for (i=slen-1;i>=0;i--)
10630   {
10631     int hr;
10632     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10633     {
10634       if(ba[i]<start || ba[i]>=(start+slen*4))
10635       {
10636         // Branch out of this block, don't need anything
10637         r32=0;
10638       }
10639       else
10640       {
10641         // Internal branch
10642         // Need whatever matches the target
10643         // (and doesn't get overwritten by the delay slot instruction)
10644         r32=0;
10645         int t=(ba[i]-start)>>2;
10646         if(ba[i]>start+i*4) {
10647           // Forward branch
10648           if(!(requires_32bit[t]&~regs[i].was32))
10649             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10650         }else{
10651           // Backward branch
10652           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
10653           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10654           if(!(pr32[t]&~regs[i].was32))
10655             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10656         }
10657       }
10658       // Conditional branch may need registers for following instructions
10659       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10660       {
10661         if(i<slen-2) {
10662           r32|=requires_32bit[i+2];
10663           r32&=regs[i].was32;
10664           // Mark this address as a branch target since it may be called
10665           // upon return from interrupt
10666           bt[i+2]=1;
10667         }
10668       }
10669       // Merge in delay slot
10670       if(!likely[i]) {
10671         // For a non-likely branch the delay slot always executes, so its targets
10672         // are unconditionally overwritten ("likely" branches nullify the slot when not taken)
10673         r32&=~(1LL<<rt1[i+1]);
10674         r32&=~(1LL<<rt2[i+1]);
10675       }
10676       // Assume these are needed (delay slot)
10677       if(us1[i+1]>0)
10678       {
10679         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
10680       }
10681       if(us2[i+1]>0)
10682       {
10683         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
10684       }
10685       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
10686       {
10687         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
10688       }
10689       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
10690       {
10691         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
10692       }
10693     }
10694     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
10695     {
10696       // SYSCALL instruction (software interrupt)
10697       r32=0;
10698     }
10699     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
10700     {
10701       // ERET instruction (return from interrupt)
10702       r32=0;
10703     }
10704     // Check 32 bits
10705     r32&=~(1LL<<rt1[i]);
10706     r32&=~(1LL<<rt2[i]);
10707     if(us1[i]>0)
10708     {
10709       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
10710     }
10711     if(us2[i]>0)
10712     {
10713       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
10714     }
10715     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
10716     {
10717       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
10718     }
10719     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
10720     {
10721       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
10722     }
10723     requires_32bit[i]=r32;
10724     
10725     // Dirty registers that are 32-bit require 32-bit input,
10726     // since they will be written back as 32-bit values
10727     for(hr=0;hr<HOST_REGS;hr++)
10728     {
10729       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
10730         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
10731           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
10732           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
10733         }
10734       }
10735     }
10736     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
10737   }
10738 #else
10739   for (i=slen-1;i>=0;i--)
10740   {
10741     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10742     {
10743       // Conditional branch
10744       if((source[i]>>16)!=0x1000&&i<slen-2) {
10745         // Mark this address as a branch target since it may be called
10746         // upon return from interrupt
10747         bt[i+2]=1;
10748       }
10749     }
10750   }
10751 #endif
10752
10753   if(itype[slen-1]==SPAN) {
10754     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
10755   }
10756   
10757   /* Debug/disassembly */
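        // (Only active when assem_debug is routed to printf: dumps the unneeded /
        //  32-bit register sets, the pre/entry register maps, dirty bits and known
        //  constants for every instruction in the block.)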
10758   if((void*)assem_debug==(void*)printf) 
10759   for(i=0;i<slen;i++)
10760   {
10761     printf("U:");
10762     int r;
10763     for(r=1;r<=CCREG;r++) {
10764       if((unneeded_reg[i]>>r)&1) {
10765         if(r==HIREG) printf(" HI");
10766         else if(r==LOREG) printf(" LO");
10767         else printf(" r%d",r);
10768       }
10769     }
10770 #ifndef FORCE32
10771     printf(" UU:");
10772     for(r=1;r<=CCREG;r++) {
10773       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
10774         if(r==HIREG) printf(" HI");
10775         else if(r==LOREG) printf(" LO");
10776         else printf(" r%d",r);
10777       }
10778     }
10779     printf(" 32:");
10780     for(r=0;r<=CCREG;r++) {
10781       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10782       if((regs[i].was32>>r)&1) {
10783         if(r==CCREG) printf(" CC");
10784         else if(r==HIREG) printf(" HI");
10785         else if(r==LOREG) printf(" LO");
10786         else printf(" r%d",r);
10787       }
10788     }
10789 #endif
10790     printf("\n");
10791     #if defined(__i386__) || defined(__x86_64__)
10792     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
10793     #endif
10794     #ifdef __arm__
10795     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
10796     #endif
10797     printf("needs: ");
10798     if(needed_reg[i]&1) printf("eax ");
10799     if((needed_reg[i]>>1)&1) printf("ecx ");
10800     if((needed_reg[i]>>2)&1) printf("edx ");
10801     if((needed_reg[i]>>3)&1) printf("ebx ");
10802     if((needed_reg[i]>>5)&1) printf("ebp ");
10803     if((needed_reg[i]>>6)&1) printf("esi ");
10804     if((needed_reg[i]>>7)&1) printf("edi ");
10805     printf("r:");
10806     for(r=0;r<=CCREG;r++) {
10807       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10808       if((requires_32bit[i]>>r)&1) {
10809         if(r==CCREG) printf(" CC");
10810         else if(r==HIREG) printf(" HI");
10811         else if(r==LOREG) printf(" LO");
10812         else printf(" r%d",r);
10813       }
10814     }
10815     printf("\n");
10816     /*printf("pr:");
10817     for(r=0;r<=CCREG;r++) {
10818       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10819       if((pr32[i]>>r)&1) {
10820         if(r==CCREG) printf(" CC");
10821         else if(r==HIREG) printf(" HI");
10822         else if(r==LOREG) printf(" LO");
10823         else printf(" r%d",r);
10824       }
10825     }
10826     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
10827     printf("\n");*/
10828     #if defined(__i386__) || defined(__x86_64__)
10829     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
10830     printf("dirty: ");
10831     if(regs[i].wasdirty&1) printf("eax ");
10832     if((regs[i].wasdirty>>1)&1) printf("ecx ");
10833     if((regs[i].wasdirty>>2)&1) printf("edx ");
10834     if((regs[i].wasdirty>>3)&1) printf("ebx ");
10835     if((regs[i].wasdirty>>5)&1) printf("ebp ");
10836     if((regs[i].wasdirty>>6)&1) printf("esi ");
10837     if((regs[i].wasdirty>>7)&1) printf("edi ");
10838     #endif
10839     #ifdef __arm__
10840     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
10841     printf("dirty: ");
10842     if(regs[i].wasdirty&1) printf("r0 ");
10843     if((regs[i].wasdirty>>1)&1) printf("r1 ");
10844     if((regs[i].wasdirty>>2)&1) printf("r2 ");
10845     if((regs[i].wasdirty>>3)&1) printf("r3 ");
10846     if((regs[i].wasdirty>>4)&1) printf("r4 ");
10847     if((regs[i].wasdirty>>5)&1) printf("r5 ");
10848     if((regs[i].wasdirty>>6)&1) printf("r6 ");
10849     if((regs[i].wasdirty>>7)&1) printf("r7 ");
10850     if((regs[i].wasdirty>>8)&1) printf("r8 ");
10851     if((regs[i].wasdirty>>9)&1) printf("r9 ");
10852     if((regs[i].wasdirty>>10)&1) printf("r10 ");
10853     if((regs[i].wasdirty>>12)&1) printf("r12 ");
10854     #endif
10855     printf("\n");
10856     disassemble_inst(i);
10857     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
10858     #if defined(__i386__) || defined(__x86_64__)
10859     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
10860     if(regs[i].dirty&1) printf("eax ");
10861     if((regs[i].dirty>>1)&1) printf("ecx ");
10862     if((regs[i].dirty>>2)&1) printf("edx ");
10863     if((regs[i].dirty>>3)&1) printf("ebx ");
10864     if((regs[i].dirty>>5)&1) printf("ebp ");
10865     if((regs[i].dirty>>6)&1) printf("esi ");
10866     if((regs[i].dirty>>7)&1) printf("edi ");
10867     #endif
10868     #ifdef __arm__
10869     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
10870     if(regs[i].dirty&1) printf("r0 ");
10871     if((regs[i].dirty>>1)&1) printf("r1 ");
10872     if((regs[i].dirty>>2)&1) printf("r2 ");
10873     if((regs[i].dirty>>3)&1) printf("r3 ");
10874     if((regs[i].dirty>>4)&1) printf("r4 ");
10875     if((regs[i].dirty>>5)&1) printf("r5 ");
10876     if((regs[i].dirty>>6)&1) printf("r6 ");
10877     if((regs[i].dirty>>7)&1) printf("r7 ");
10878     if((regs[i].dirty>>8)&1) printf("r8 ");
10879     if((regs[i].dirty>>9)&1) printf("r9 ");
10880     if((regs[i].dirty>>10)&1) printf("r10 ");
10881     if((regs[i].dirty>>12)&1) printf("r12 ");
10882     #endif
10883     printf("\n");
10884     if(regs[i].isconst) {
10885       printf("constants: ");
10886       #if defined(__i386__) || defined(__x86_64__)
10887       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
10888       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
10889       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
10890       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
10891       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
10892       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
10893       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
10894       #endif
10895       #ifdef __arm__
10896       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
10897       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
10898       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
10899       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
10900       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
10901       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
10902       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
10903       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
10904       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
10905       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
10906       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
10907       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
10908       #endif
10909       printf("\n");
10910     }
10911 #ifndef FORCE32
10912     printf(" 32:");
10913     for(r=0;r<=CCREG;r++) {
10914       if((regs[i].is32>>r)&1) {
10915         if(r==CCREG) printf(" CC");
10916         else if(r==HIREG) printf(" HI");
10917         else if(r==LOREG) printf(" LO");
10918         else printf(" r%d",r);
10919       }
10920     }
10921     printf("\n");
10922 #endif
10923     /*printf(" p32:");
10924     for(r=0;r<=CCREG;r++) {
10925       if((p32[i]>>r)&1) {
10926         if(r==CCREG) printf(" CC");
10927         else if(r==HIREG) printf(" HI");
10928         else if(r==LOREG) printf(" LO");
10929         else printf(" r%d",r);
10930       }
10931     }
10932     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
10933     else printf("\n");*/
10934     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10935       #if defined(__i386__) || defined(__x86_64__)
10936       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10937       if(branch_regs[i].dirty&1) printf("eax ");
10938       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10939       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10940       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10941       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10942       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10943       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10944       #endif
10945       #ifdef __arm__
10946       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10947       if(branch_regs[i].dirty&1) printf("r0 ");
10948       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10949       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10950       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10951       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10952       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10953       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10954       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10955       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10956       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10957       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10958       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10959       #endif
10960 #ifndef FORCE32
10961       printf(" 32:");
10962       for(r=0;r<=CCREG;r++) {
10963         if((branch_regs[i].is32>>r)&1) {
10964           if(r==CCREG) printf(" CC");
10965           else if(r==HIREG) printf(" HI");
10966           else if(r==LOREG) printf(" LO");
10967           else printf(" r%d",r);
10968         }
10969       }
10970       printf("\n");
10971 #endif
10972     }
10973   }
10974
10975   /* Pass 8 - Assembly */
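        // For each instruction: write back registers that were dirty on entry but are
        // no longer mapped, record its native entry point in instr_addr[], load the
        // registers it (and, for branches, its delay slot) needs, then dispatch to the
        // per-type assembler.  Branch types set ds=1 so the delay slot that follows is
        // not emitted a second time.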
10976   linkcount=0;stubcount=0;
10977   ds=0;is_delayslot=0;
10978   cop1_usable=0;
10979   uint64_t is32_pre=0;
10980   u_int dirty_pre=0;
10981   u_int beginning=(u_int)out;
10982   if((u_int)addr&1) {
10983     ds=1;
10984     pagespan_ds();
10985   }
10986   u_int instr_addr0_override=0;
10987
10988 #ifdef PCSX
10989   if (start == 0x80030000) {
10990     // nasty hack for fastbios thing
10991     // override block entry to this code
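          // (the prologue emitted below writes the block start to pcaddr/address and
          //  exits through new_dyna_leave unless address already held that value,
          //  i.e. the first pass through bails out, apparently so the frontend can
          //  apply its BIOS-skip handling before this code actually runs)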
10992     instr_addr0_override=(u_int)out;
10993     emit_movimm(start,0);
10994     // abuse io address var as a flag that we
10995     // have already returned here once
10996     emit_readword((int)&address,1);
10997     emit_writeword(0,(int)&pcaddr);
10998     emit_writeword(0,(int)&address);
10999     emit_cmp(0,1);
11000     emit_jne((int)new_dyna_leave);
11001   }
11002 #endif
11003   for(i=0;i<slen;i++)
11004   {
11005     //if(ds) printf("ds: ");
11006     if((void*)assem_debug==(void*)printf) disassemble_inst(i);
11007     if(ds) {
11008       ds=0; // Skip delay slot
11009       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
11010       instr_addr[i]=0;
11011     } else {
11012       #ifndef DESTRUCTIVE_WRITEBACK
11013       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11014       {
11015         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
11016               unneeded_reg[i],unneeded_reg_upper[i]);
11017         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
11018               unneeded_reg[i],unneeded_reg_upper[i]);
11019       }
11020       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
11021         is32_pre=branch_regs[i].is32;
11022         dirty_pre=branch_regs[i].dirty;
11023       }else{
11024         is32_pre=regs[i].is32;
11025         dirty_pre=regs[i].dirty;
11026       }
11027       #endif
11028       // write back
11029       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11030       {
11031         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
11032                       unneeded_reg[i],unneeded_reg_upper[i]);
11033         loop_preload(regmap_pre[i],regs[i].regmap_entry);
11034       }
11035       // branch target entry point
11036       instr_addr[i]=(u_int)out;
11037       assem_debug("<->\n");
11038       // load regs
11039       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
11040         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
11041       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
11042       address_generation(i,&regs[i],regs[i].regmap_entry);
11043       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
11044       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
11045       {
11046         // Load the delay slot registers if necessary
11047         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
11048           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11049         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
11050           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11051         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
11052           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11053       }
11054       else if(i+1<slen)
11055       {
11056         // Preload registers for following instruction
11057         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
11058           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
11059             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11060         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
11061           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
11062             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11063       }
11064       // TODO: if(is_ooo(i)) address_generation(i+1);
11065       if(itype[i]==CJUMP||itype[i]==FJUMP)
11066         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
11067       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
11068         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11069       if(bt[i]) cop1_usable=0;
11070       // assemble
11071       switch(itype[i]) {
11072         case ALU:
11073           alu_assemble(i,&regs[i]);break;
11074         case IMM16:
11075           imm16_assemble(i,&regs[i]);break;
11076         case SHIFT:
11077           shift_assemble(i,&regs[i]);break;
11078         case SHIFTIMM:
11079           shiftimm_assemble(i,&regs[i]);break;
11080         case LOAD:
11081           load_assemble(i,&regs[i]);break;
11082         case LOADLR:
11083           loadlr_assemble(i,&regs[i]);break;
11084         case STORE:
11085           store_assemble(i,&regs[i]);break;
11086         case STORELR:
11087           storelr_assemble(i,&regs[i]);break;
11088         case COP0:
11089           cop0_assemble(i,&regs[i]);break;
11090         case COP1:
11091           cop1_assemble(i,&regs[i]);break;
11092         case C1LS:
11093           c1ls_assemble(i,&regs[i]);break;
11094         case COP2:
11095           cop2_assemble(i,&regs[i]);break;
11096         case C2LS:
11097           c2ls_assemble(i,&regs[i]);break;
11098         case C2OP:
11099           c2op_assemble(i,&regs[i]);break;
11100         case FCONV:
11101           fconv_assemble(i,&regs[i]);break;
11102         case FLOAT:
11103           float_assemble(i,&regs[i]);break;
11104         case FCOMP:
11105           fcomp_assemble(i,&regs[i]);break;
11106         case MULTDIV:
11107           multdiv_assemble(i,&regs[i]);break;
11108         case MOV:
11109           mov_assemble(i,&regs[i]);break;
11110         case SYSCALL:
11111           syscall_assemble(i,&regs[i]);break;
11112         case HLECALL:
11113           hlecall_assemble(i,&regs[i]);break;
11114         case INTCALL:
11115           intcall_assemble(i,&regs[i]);break;
11116         case UJUMP:
11117           ujump_assemble(i,&regs[i]);ds=1;break;
11118         case RJUMP:
11119           rjump_assemble(i,&regs[i]);ds=1;break;
11120         case CJUMP:
11121           cjump_assemble(i,&regs[i]);ds=1;break;
11122         case SJUMP:
11123           sjump_assemble(i,&regs[i]);ds=1;break;
11124         case FJUMP:
11125           fjump_assemble(i,&regs[i]);ds=1;break;
11126         case SPAN:
11127           pagespan_assemble(i,&regs[i]);break;
11128       }
11129       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
11130         literal_pool(1024);
11131       else
11132         literal_pool_jumpover(256);
11133     }
11134   }
11135   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
11136   // If the block did not end with an unconditional branch,
11137   // add a jump to the next instruction.
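        // Dirty registers are written back for the fall-through target, cycle counts
        // are charged to CCREG (scaled by CLOCK_DIVIDER) where needed, and the jump
        // itself is left as a placeholder for Pass 9 (the linker) to patch.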
11138   if(i>1) {
11139     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
11140       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11141       assert(i==slen);
11142       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
11143         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11144         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11145           emit_loadreg(CCREG,HOST_CCREG);
11146         emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
11147       }
11148       else if(!likely[i-2])
11149       {
11150         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
11151         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
11152       }
11153       else
11154       {
11155         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
11156         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
11157       }
11158       add_to_linker((int)out,start+i*4,0);
11159       emit_jmp(0);
11160     }
11161   }
11162   else
11163   {
11164     assert(i>0);
11165     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11166     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11167     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11168       emit_loadreg(CCREG,HOST_CCREG);
11169     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
11170     add_to_linker((int)out,start+i*4,0);
11171     emit_jmp(0);
11172   }
11173
11174   // TODO: delay slot stubs?
11175   // Stubs
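        // Emit the out-of-line code queued during assembly: slow-path memory reads and
        // writes, cycle-count checks (CC_STUB), invalid-code checks (INVCODE_STUB),
        // coprocessor-unusable exceptions (FP_STUB) and unaligned store handling.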
11176   for(i=0;i<stubcount;i++)
11177   {
11178     switch(stubs[i][0])
11179     {
11180       case LOADB_STUB:
11181       case LOADH_STUB:
11182       case LOADW_STUB:
11183       case LOADD_STUB:
11184       case LOADBU_STUB:
11185       case LOADHU_STUB:
11186         do_readstub(i);break;
11187       case STOREB_STUB:
11188       case STOREH_STUB:
11189       case STOREW_STUB:
11190       case STORED_STUB:
11191         do_writestub(i);break;
11192       case CC_STUB:
11193         do_ccstub(i);break;
11194       case INVCODE_STUB:
11195         do_invstub(i);break;
11196       case FP_STUB:
11197         do_cop1stub(i);break;
11198       case STORELR_STUB:
11199         do_unalignedwritestub(i);break;
11200     }
11201   }
11202
11203   if (instr_addr0_override)
11204     instr_addr[0] = instr_addr0_override;
11205
11206   /* Pass 9 - Linker */
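        // link_addr[i] holds {address of the emitted branch, target PC, internal flag}.
        // Internal branches are patched straight to instr_addr[]; for external targets an
        // extjump stub is emitted and, if the target is already compiled (check_addr),
        // the branch is pointed at it directly, with add_link recording the stub
        // (presumably so the link can be undone when the target is invalidated).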
11207   for(i=0;i<linkcount;i++)
11208   {
11209     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
11210     literal_pool(64);
11211     if(!link_addr[i][2])
11212     {
11213       void *stub=out;
11214       void *addr=check_addr(link_addr[i][1]);
11215       emit_extjump(link_addr[i][0],link_addr[i][1]);
11216       if(addr) {
11217         set_jump_target(link_addr[i][0],(int)addr);
11218         add_link(link_addr[i][1],stub);
11219       }
11220       else set_jump_target(link_addr[i][0],(int)stub);
11221     }
11222     else
11223     {
11224       // Internal branch
11225       int target=(link_addr[i][1]-start)>>2;
11226       assert(target>=0&&target<slen);
11227       assert(instr_addr[target]);
11228       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11229       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
11230       //#else
11231       set_jump_target(link_addr[i][0],instr_addr[target]);
11232       //#endif
11233     }
11234   }
11235   // External Branch Targets (jump_in)
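        // Register every branch target (and the block entry) as externally reachable:
        // a dirty-check stub goes into jump_dirty and its entry point into jump_in,
        // restricted by requires_32bit[] unless FORCE32 is set; any existing hash-table
        // entry for the address is refreshed, but no new entries are added here.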
11236   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
11237   for(i=0;i<slen;i++)
11238   {
11239     if(bt[i]||i==0)
11240     {
11241       if(instr_addr[i]) // TODO - delay slots (=null)
11242       {
11243         u_int vaddr=start+i*4;
11244         u_int page=get_page(vaddr);
11245         u_int vpage=get_vpage(vaddr);
11246         literal_pool(256);
11247         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
11248 #ifndef FORCE32
11249         if(!requires_32bit[i])
11250 #else
11251         if(1)
11252 #endif
11253         {
11254           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11255           assem_debug("jump_in: %x\n",start+i*4);
11256           ll_add(jump_dirty+vpage,vaddr,(void *)out);
11257           int entry_point=do_dirty_stub(i);
11258           ll_add(jump_in+page,vaddr,(void *)entry_point);
11259           // If there was an existing entry in the hash table,
11260           // replace it with the new address.
11261           // Don't add new entries.  We'll insert the
11262           // ones that actually get used in check_addr().
11263           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
11264           if(ht_bin[0]==vaddr) {
11265             ht_bin[1]=entry_point;
11266           }
11267           if(ht_bin[2]==vaddr) {
11268             ht_bin[3]=entry_point;
11269           }
11270         }
11271         else
11272         {
11273           u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
11274           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11275           assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
11276           //int entry_point=(int)out;
11277           ////assem_debug("entry_point: %x\n",entry_point);
11278           //load_regs_entry(i);
11279           //if(entry_point==(int)out)
11280           //  entry_point=instr_addr[i];
11281           //else
11282           //  emit_jmp(instr_addr[i]);
11283           //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11284           ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
11285           int entry_point=do_dirty_stub(i);
11286           ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11287         }
11288       }
11289     }
11290   }
11291   // Write out the literal pool if necessary
11292   literal_pool(0);
11293   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11294   // Align code
11295   if(((u_int)out)&7) emit_addnop(13);
11296   #endif
11297   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
11298   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
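        // Keep a pristine copy of the source MIPS code in the shadow buffer so the
        // dirty stubs can later compare it against RAM and detect modified code.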
11299   memcpy(copy,source,slen*4);
11300   copy+=slen*4;
11301   
11302   #ifdef __arm__
11303   __clear_cache((void *)beginning,out);
11304   #endif
11305   
11306   // If we're within 256K of the end of the buffer,
11307   // start over from the beginning. (Is 256K enough?)
11308   if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
11309   
11310   // Trap writes to any of the pages we compiled
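        // (invalid_code is cleared so the page counts as compiled; on the TLB path,
        //  setting bit 30 of memory_map appears to mark the page write-protected so
        //  that stores through the map invalidate this code.)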
11311   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
11312     invalid_code[i]=0;
11313 #ifndef DISABLE_TLB
11314     memory_map[i]|=0x40000000;
11315     if((signed int)start>=(signed int)0xC0000000) {
11316       assert(using_tlb);
11317       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
11318       invalid_code[j]=0;
11319       memory_map[j]|=0x40000000;
11320       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
11321     }
11322 #endif
11323   }
11324 #ifdef PCSX
11325   // PCSX maps invalid_code tests for all RAM mirrors to 0x80000000..0x80000000+RAM_SIZE
11326   if(get_page(start)<(RAM_SIZE>>12))
11327     for(i=start>>12;i<=(start+slen*4)>>12;i++)
11328       invalid_code[((u_int)0x80000000>>12)|i]=0;
11329 #endif
11330   
11331   /* Pass 10 - Free memory by expiring oldest blocks */
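        // expirep is a 16-bit cursor: bits 13-15 select which eighth of the translation
        // buffer is being reclaimed, bits 11-12 select the cleanup phase below, and the
        // low 11 bits walk the 2048-entry jump tables (and hash-table groups).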
11332   
11333   int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
11334   while(expirep!=end)
11335   {
11336     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
11337     int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
11338     inv_debug("EXP: Phase %d\n",expirep);
11339     switch((expirep>>11)&3)
11340     {
11341       case 0:
11342         // Clear jump_in and jump_dirty
11343         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
11344         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
11345         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
11346         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
11347         break;
11348       case 1:
11349         // Clear pointers
11350         ll_kill_pointers(jump_out[expirep&2047],base,shift);
11351         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
11352         break;
11353       case 2:
11354         // Clear hash table
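              // Each bucket holds two (vaddr, entry) pairs: {vaddr0, addr0, vaddr1, addr1}.
              // Drop the second pair if its code lies in the expiring region; if the first
              // pair does, promote the second pair into its place.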
11355         for(i=0;i<32;i++) {
11356           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
11357           if((ht_bin[3]>>shift)==(base>>shift) ||
11358              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11359             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
11360             ht_bin[2]=ht_bin[3]=-1;
11361           }
11362           if((ht_bin[1]>>shift)==(base>>shift) ||
11363              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11364             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
11365             ht_bin[0]=ht_bin[2];
11366             ht_bin[1]=ht_bin[3];
11367             ht_bin[2]=ht_bin[3]=-1;
11368           }
11369         }
11370         break;
11371       case 3:
11372         // Clear jump_out
11373         #ifdef __arm__
11374         if((expirep&2047)==0) 
11375           do_clear_cache();
11376         #endif
11377         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
11378         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
11379         break;
11380     }
11381     expirep=(expirep+1)&65535;
11382   }
11383   return 0;
11384 }
11385
11386 // vim:shiftwidth=2:expandtab