pcsx_rearmed.git / libpcsxcore/new_dynarec/new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24
25 #include "emu_if.h" //emulator interface
26
27 #include <sys/mman.h>
28
29 #ifdef __i386__
30 #include "assem_x86.h"
31 #endif
32 #ifdef __x86_64__
33 #include "assem_x64.h"
34 #endif
35 #ifdef __arm__
36 #include "assem_arm.h"
37 #endif
38
39 #define MAXBLOCK 4096
40 #define MAX_OUTPUT_BLOCK_SIZE 262144
41 #define CLOCK_DIVIDER 2
42
43 struct regstat
44 {
45   signed char regmap_entry[HOST_REGS]; // guest reg expected in each host reg at entry to the op
46   signed char regmap[HOST_REGS];       // guest reg held in each host reg after the op
47   uint64_t was32;                      // guest regs known 32-bit (sign-extended) before the op
48   uint64_t is32;                       // ...and after the op
49   uint64_t wasdirty;                   // host regs holding values not yet written back, before the op
50   uint64_t dirty;                      // ...and after the op
51   uint64_t u;                          // guest regs whose value is not needed afterwards
52   uint64_t uu;                         // same, for the upper halves of 64-bit regs
53   u_int wasconst;                      // host regs holding known constants before the op
54   u_int isconst;                       // ...and after the op
55   uint64_t constmap[HOST_REGS];        // the constant values themselves
56 };
57
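// Linked-list node for the code lookup tables below: maps one guest virtual
// address (vaddr) to a piece of compiled code (addr).  reg32 carries 32/64-bit
// register-state constraints on 64-bit-capable builds; under FORCE32 (PCSX)
// it is never set and stays 0.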
58 struct ll_entry
59 {
60   u_int vaddr;
61   u_int reg32;
62   void *addr;
63   struct ll_entry *next;
64 };
65
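  // Per-block compile-time state, filled in by new_recompile_block() and
  // indexed by instruction within the block (0..slen-1): decoded operands
  // (opcode/opcode2, rs1/rs2/rt1/rt2, imm), instruction class (itype),
  // branch targets (ba), liveness information (unneeded_reg*), and the
  // register allocation chosen for each instruction (regs/branch_regs).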
66   u_int start;
67   u_int *source;
68   u_int pagelimit;
69   char insn[MAXBLOCK][10];
70   u_char itype[MAXBLOCK];
71   u_char opcode[MAXBLOCK];
72   u_char opcode2[MAXBLOCK];
73   u_char bt[MAXBLOCK];
74   u_char rs1[MAXBLOCK];
75   u_char rs2[MAXBLOCK];
76   u_char rt1[MAXBLOCK];
77   u_char rt2[MAXBLOCK];
78   u_char us1[MAXBLOCK];
79   u_char us2[MAXBLOCK];
80   u_char dep1[MAXBLOCK];
81   u_char dep2[MAXBLOCK];
82   u_char lt1[MAXBLOCK];
83   int imm[MAXBLOCK];
84   u_int ba[MAXBLOCK];
85   char likely[MAXBLOCK];
86   char is_ds[MAXBLOCK];
87   char ooo[MAXBLOCK];
88   uint64_t unneeded_reg[MAXBLOCK];
89   uint64_t unneeded_reg_upper[MAXBLOCK];
90   uint64_t branch_unneeded_reg[MAXBLOCK];
91   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
92   uint64_t p32[MAXBLOCK];
93   uint64_t pr32[MAXBLOCK];
94   signed char regmap_pre[MAXBLOCK][HOST_REGS];
95   signed char regmap[MAXBLOCK][HOST_REGS];
96   signed char regmap_entry[MAXBLOCK][HOST_REGS];
97   uint64_t constmap[MAXBLOCK][HOST_REGS];
98   struct regstat regs[MAXBLOCK];
99   struct regstat branch_regs[MAXBLOCK];
100   signed char minimum_free_regs[MAXBLOCK];
101   u_int needed_reg[MAXBLOCK];
102   uint64_t requires_32bit[MAXBLOCK];
103   u_int wont_dirty[MAXBLOCK];
104   u_int will_dirty[MAXBLOCK];
105   int ccadj[MAXBLOCK];
106   int slen;
107   u_int instr_addr[MAXBLOCK];
108   u_int link_addr[MAXBLOCK][3];
109   int linkcount;
110   u_int stubs[MAXBLOCK*3][8];
111   int stubcount;
112   u_int literals[1024][2];
113   int literalcount;
114   int is_delayslot;
115   int cop1_usable;
116   u_char *out;
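  // Compiled-code lookup structures:
  //  jump_in[page]     - blocks that can be entered in a given guest page
  //  jump_out[page]    - locations of direct jumps whose target lies in that
  //                      page, so they can be unlinked on invalidation
  //  jump_dirty[vpage] - compiled blocks kept even after their page is
  //                      invalidated, so an unmodified block can be revived
  //                      (see verify_dirty/clean_blocks)
  //  hash_table        - 65536 bins of two (vaddr,addr) pairs each:
  //                      bin[0]/bin[1] and bin[2]/bin[3], indexed by
  //                      ((vaddr>>16)^vaddr)&0xFFFF (see get_addr_ht)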
117   struct ll_entry *jump_in[4096];
118   struct ll_entry *jump_out[4096];
119   struct ll_entry *jump_dirty[4096];
120   u_int hash_table[65536][4]  __attribute__((aligned(16)));
121   char shadow[1048576]  __attribute__((aligned(16)));
122   void *copy;
123   int expirep;
124 #ifndef PCSX
125   u_int using_tlb;
126 #else
127   static const u_int using_tlb=0;
128 #endif
129   static u_int sp_in_mirror;
130   u_int stop_after_jal;
131   extern u_char restore_candidate[512];
132   extern int cycle_count;
133
134   /* registers that may be allocated */
135   /* 1-31 gpr */
136 #define HIREG 32 // hi
137 #define LOREG 33 // lo
138 #define FSREG 34 // FPU status (FCSR)
139 #define CSREG 35 // Coprocessor status
140 #define CCREG 36 // Cycle count
141 #define INVCP 37 // Pointer to invalid_code
142 #define MMREG 38 // Pointer to memory_map
143 #define ROREG 39 // ram offset (if rdram!=0x80000000)
144 #define TEMPREG 40
145 #define FTEMP 40 // FPU temporary register
146 #define PTEMP 41 // Prefetch temporary register
147 #define TLREG 42 // TLB mapping offset
148 #define RHASH 43 // Return address hash
149 #define RHTBL 44 // Return address hash table address
150 #define RTEMP 45 // JR/JALR address register
151 #define MAXREG 45
152 #define AGEN1 46 // Address generation temporary register
153 #define AGEN2 47 // Address generation temporary register
154 #define MGEN1 48 // Maptable address generation temporary register
155 #define MGEN2 49 // Maptable address generation temporary register
156 #define BTREG 50 // Branch target temporary register
157
158   /* instruction types */
159 #define NOP 0     // No operation
160 #define LOAD 1    // Load
161 #define STORE 2   // Store
162 #define LOADLR 3  // Unaligned load
163 #define STORELR 4 // Unaligned store
164 #define MOV 5     // Move 
165 #define ALU 6     // Arithmetic/logic
166 #define MULTDIV 7 // Multiply/divide
167 #define SHIFT 8   // Shift by register
168 #define SHIFTIMM 9// Shift by immediate
169 #define IMM16 10  // 16-bit immediate
170 #define RJUMP 11  // Unconditional jump to register
171 #define UJUMP 12  // Unconditional jump
172 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
173 #define SJUMP 14  // Conditional branch (regimm format)
174 #define COP0 15   // Coprocessor 0
175 #define COP1 16   // Coprocessor 1
176 #define C1LS 17   // Coprocessor 1 load/store
177 #define FJUMP 18  // Conditional branch (floating point)
178 #define FLOAT 19  // Floating point unit
179 #define FCONV 20  // Convert integer to float
180 #define FCOMP 21  // Floating point compare (sets FSREG)
181 #define SYSCALL 22// SYSCALL
182 #define OTHER 23  // Other
183 #define SPAN 24   // Branch/delay slot spans 2 pages
184 #define NI 25     // Not implemented
185 #define HLECALL 26// PCSX fake opcodes for HLE
186 #define COP2 27   // Coprocessor 2 move
187 #define C2LS 28   // Coprocessor 2 load/store
188 #define C2OP 29   // Coprocessor 2 operation
189 #define INTCALL 30// Call interpreter to handle rare corner cases
190
191   /* stubs */
192 #define CC_STUB 1
193 #define FP_STUB 2
194 #define LOADB_STUB 3
195 #define LOADH_STUB 4
196 #define LOADW_STUB 5
197 #define LOADD_STUB 6
198 #define LOADBU_STUB 7
199 #define LOADHU_STUB 8
200 #define STOREB_STUB 9
201 #define STOREH_STUB 10
202 #define STOREW_STUB 11
203 #define STORED_STUB 12
204 #define STORELR_STUB 13
205 #define INVCODE_STUB 14
206
207   /* branch codes */
208 #define TAKEN 1
209 #define NOTTAKEN 2
210 #define NULLDS 3
211
212 // asm linkage
213 int new_recompile_block(int addr);
214 void *get_addr_ht(u_int vaddr);
215 void invalidate_block(u_int block);
216 void invalidate_addr(u_int addr);
217 void remove_hash(int vaddr);
218 void jump_vaddr();
219 void dyna_linker();
220 void dyna_linker_ds();
221 void verify_code();
222 void verify_code_vm();
223 void verify_code_ds();
224 void cc_interrupt();
225 void fp_exception();
226 void fp_exception_ds();
227 void jump_syscall();
228 void jump_syscall_hle();
229 void jump_eret();
230 void jump_hlecall();
231 void jump_intcall();
232 void new_dyna_leave();
233
234 // TLB
235 void TLBWI_new();
236 void TLBWR_new();
237 void read_nomem_new();
238 void read_nomemb_new();
239 void read_nomemh_new();
240 void read_nomemd_new();
241 void write_nomem_new();
242 void write_nomemb_new();
243 void write_nomemh_new();
244 void write_nomemd_new();
245 void write_rdram_new();
246 void write_rdramb_new();
247 void write_rdramh_new();
248 void write_rdramd_new();
249 extern u_int memory_map[1048576];
250
251 // Needed by assembler
252 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
253 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
254 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
255 void load_all_regs(signed char i_regmap[]);
256 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
257 void load_regs_entry(int t);
258 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
259
260 int tracedebug=0;
261
262 //#define DEBUG_CYCLE_COUNT 1
263
264 void nullf() {}
265 //#define assem_debug printf
266 //#define inv_debug printf
267 #define assem_debug nullf
268 #define inv_debug nullf
269
270 static void tlb_hacks()
271 {
272 #ifndef DISABLE_TLB
273   // Goldeneye hack
274   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
275   {
276     u_int addr;
277     int n;
278     switch (ROM_HEADER->Country_code&0xFF) 
279     {
280       case 0x45: // U
281         addr=0x34b30;
282         break;                   
283       case 0x4A: // J 
284         addr=0x34b70;    
285         break;    
286       case 0x50: // E 
287         addr=0x329f0;
288         break;                        
289       default: 
290         // Unknown country code
291         addr=0;
292         break;
293     }
294     u_int rom_addr=(u_int)rom;
295     #ifdef ROM_COPY
296     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
297     // in the lower 4G of memory to use this hack.  Copy it if necessary.
298     if((void *)rom>(void *)0xffffffff) {
299       munmap(ROM_COPY, 67108864);
300       if(mmap(ROM_COPY, 12582912,
301               PROT_READ | PROT_WRITE,
302               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
303               -1, 0) <= 0) {printf("mmap() failed\n");}
304       memcpy(ROM_COPY,rom,12582912);
305       rom_addr=(u_int)ROM_COPY;
306     }
307     #endif
308     if(addr) {
309       for(n=0x7F000;n<0x80000;n++) {
310         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
311       }
312     }
313   }
314 #endif
315 }
316
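// get_page(): map a guest virtual address to one of 4096 lookup pages.
// For PCSX the segment bits (0xe0000000) and the RAM mirrors are stripped
// first, so RAM maps directly onto the low pages; everything else (BIOS etc.)
// is folded into pages 2048-4095.  get_vpage() below is the same idea but is
// only used to index jump_dirty, effectively as a hash of the virtual address.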
317 static u_int get_page(u_int vaddr)
318 {
319 #ifndef PCSX
320   u_int page=(vaddr^0x80000000)>>12;
321 #else
322   u_int page=vaddr&~0xe0000000;
323   if (page < 0x1000000)
324     page &= ~0x0e00000; // RAM mirrors
325   page>>=12;
326 #endif
327 #ifndef DISABLE_TLB
328   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
329 #endif
330   if(page>2048) page=2048+(page&2047);
331   return page;
332 }
333
334 static u_int get_vpage(u_int vaddr)
335 {
336   u_int vpage=(vaddr^0x80000000)>>12;
337 #ifndef DISABLE_TLB
338   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
339 #endif
340   if(vpage>2048) vpage=2048+(vpage&2047);
341   return vpage;
342 }
343
344 // Get address from virtual address
345 // This is called from the recompiled JR/JALR instructions
346 void *get_addr(u_int vaddr)
347 {
348   u_int page=get_page(vaddr);
349   u_int vpage=get_vpage(vaddr);
350   struct ll_entry *head;
351   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
352   head=jump_in[page];
353   while(head!=NULL) {
354     if(head->vaddr==vaddr&&head->reg32==0) {
355   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
356       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
357       ht_bin[3]=ht_bin[1];
358       ht_bin[2]=ht_bin[0];
359       ht_bin[1]=(int)head->addr;
360       ht_bin[0]=vaddr;
361       return head->addr;
362     }
363     head=head->next;
364   }
365   head=jump_dirty[vpage];
366   while(head!=NULL) {
367     if(head->vaddr==vaddr&&head->reg32==0) {
368       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
369       // Don't restore blocks which are about to expire from the cache
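      // (The translation cache is a circular buffer of 1<<TARGET_SIZE_2 bytes;
      // shifting the distance from the current output pointer into the top
      // bits compares it modulo the cache size, so the block is only reused if
      // it lies more than roughly 3/8 of the cache, plus one maximum block
      // size, ahead of the write pointer.)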
370       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
371       if(verify_dirty(head->addr)) {
372         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
373         invalid_code[vaddr>>12]=0;
374         memory_map[vaddr>>12]|=0x40000000;
375         if(vpage<2048) {
376 #ifndef DISABLE_TLB
377           if(tlb_LUT_r[vaddr>>12]) {
378             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
379             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
380           }
381 #endif
382           restore_candidate[vpage>>3]|=1<<(vpage&7);
383         }
384         else restore_candidate[page>>3]|=1<<(page&7);
385         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
386         if(ht_bin[0]==vaddr) {
387           ht_bin[1]=(int)head->addr; // Replace existing entry
388         }
389         else
390         {
391           ht_bin[3]=ht_bin[1];
392           ht_bin[2]=ht_bin[0];
393           ht_bin[1]=(int)head->addr;
394           ht_bin[0]=vaddr;
395         }
396         return head->addr;
397       }
398     }
399     head=head->next;
400   }
401   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
402   int r=new_recompile_block(vaddr);
403   if(r==0) return get_addr(vaddr);
404 // Execute in unmapped page, generate pagefault exception
405   Status|=2;
406   Cause=(vaddr<<31)|0x8;
407   EPC=(vaddr&1)?vaddr-5:vaddr;
408   BadVAddr=(vaddr&~1);
409   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
410   EntryHi=BadVAddr&0xFFFFE000;
411   return get_addr_ht(0x80000000);
412 }
413 // Look up address in hash table first
414 void *get_addr_ht(u_int vaddr)
415 {
416   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
417   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
418   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
419   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
420   return get_addr(vaddr);
421 }
422
423 void *get_addr_32(u_int vaddr,u_int flags)
424 {
425 #ifdef FORCE32
426   return get_addr(vaddr);
427 #else
428   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
429   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
430   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
431   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
432   u_int page=get_page(vaddr);
433   u_int vpage=get_vpage(vaddr);
434   struct ll_entry *head;
435   head=jump_in[page];
436   while(head!=NULL) {
437     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
438       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
439       if(head->reg32==0) {
440         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
441         if(ht_bin[0]==-1) {
442           ht_bin[1]=(int)head->addr;
443           ht_bin[0]=vaddr;
444         }else if(ht_bin[2]==-1) {
445           ht_bin[3]=(int)head->addr;
446           ht_bin[2]=vaddr;
447         }
448         //ht_bin[3]=ht_bin[1];
449         //ht_bin[2]=ht_bin[0];
450         //ht_bin[1]=(int)head->addr;
451         //ht_bin[0]=vaddr;
452       }
453       return head->addr;
454     }
455     head=head->next;
456   }
457   head=jump_dirty[vpage];
458   while(head!=NULL) {
459     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
460       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
461       // Don't restore blocks which are about to expire from the cache
462       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
463       if(verify_dirty(head->addr)) {
464         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
465         invalid_code[vaddr>>12]=0;
466         memory_map[vaddr>>12]|=0x40000000;
467         if(vpage<2048) {
468 #ifndef DISABLE_TLB
469           if(tlb_LUT_r[vaddr>>12]) {
470             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
471             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
472           }
473 #endif
474           restore_candidate[vpage>>3]|=1<<(vpage&7);
475         }
476         else restore_candidate[page>>3]|=1<<(page&7);
477         if(head->reg32==0) {
478           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
479           if(ht_bin[0]==-1) {
480             ht_bin[1]=(int)head->addr;
481             ht_bin[0]=vaddr;
482           }else if(ht_bin[2]==-1) {
483             ht_bin[3]=(int)head->addr;
484             ht_bin[2]=vaddr;
485           }
486           //ht_bin[3]=ht_bin[1];
487           //ht_bin[2]=ht_bin[0];
488           //ht_bin[1]=(int)head->addr;
489           //ht_bin[0]=vaddr;
490         }
491         return head->addr;
492       }
493     }
494     head=head->next;
495   }
496   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
497   int r=new_recompile_block(vaddr);
498   if(r==0) return get_addr(vaddr);
499 // Execute in unmapped page, generate pagefault exception
500   Status|=2;
501   Cause=(vaddr<<31)|0x8;
502   EPC=(vaddr&1)?vaddr-5:vaddr;
503   BadVAddr=(vaddr&~1);
504   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
505   EntryHi=BadVAddr&0xFFFFE000;
506   return get_addr_ht(0x80000000);
507 #endif
508 }
509
510 void clear_all_regs(signed char regmap[])
511 {
512   int hr;
513   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
514 }
515
516 signed char get_reg(signed char regmap[],int r)
517 {
518   int hr;
519   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
520   return -1;
521 }
522
523 // Find a register that is available for two consecutive cycles
524 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
525 {
526   int hr;
527   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
528   return -1;
529 }
530
531 int count_free_regs(signed char regmap[])
532 {
533   int count=0;
534   int hr;
535   for(hr=0;hr<HOST_REGS;hr++)
536   {
537     if(hr!=EXCLUDE_REG) {
538       if(regmap[hr]<0) count++;
539     }
540   }
541   return count;
542 }
543
544 void dirty_reg(struct regstat *cur,signed char reg)
545 {
546   int hr;
547   if(!reg) return;
548   for (hr=0;hr<HOST_REGS;hr++) {
549     if((cur->regmap[hr]&63)==reg) {
550       cur->dirty|=1<<hr;
551     }
552   }
553 }
554
555 // If we dirty the lower half of a 64 bit register which is now being
556 // sign-extended, we need to dump the upper half.
557 // Note: Do this only after completion of the instruction, because
558 // some instructions may need to read the full 64-bit value even if
559 // overwriting it (eg SLTI, DSRA32).
560 static void flush_dirty_uppers(struct regstat *cur)
561 {
562   int hr,reg;
563   for (hr=0;hr<HOST_REGS;hr++) {
564     if((cur->dirty>>hr)&1) {
565       reg=cur->regmap[hr];
566       if(reg>=64) 
567         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
568     }
569   }
570 }
571
572 void set_const(struct regstat *cur,signed char reg,uint64_t value)
573 {
574   int hr;
575   if(!reg) return;
576   for (hr=0;hr<HOST_REGS;hr++) {
577     if(cur->regmap[hr]==reg) {
578       cur->isconst|=1<<hr;
579       cur->constmap[hr]=value;
580     }
581     else if((cur->regmap[hr]^64)==reg) {
582       cur->isconst|=1<<hr;
583       cur->constmap[hr]=value>>32;
584     }
585   }
586 }
587
588 void clear_const(struct regstat *cur,signed char reg)
589 {
590   int hr;
591   if(!reg) return;
592   for (hr=0;hr<HOST_REGS;hr++) {
593     if((cur->regmap[hr]&63)==reg) {
594       cur->isconst&=~(1<<hr);
595     }
596   }
597 }
598
599 int is_const(struct regstat *cur,signed char reg)
600 {
601   int hr;
602   if(reg<0) return 0;
603   if(!reg) return 1;
604   for (hr=0;hr<HOST_REGS;hr++) {
605     if((cur->regmap[hr]&63)==reg) {
606       return (cur->isconst>>hr)&1;
607     }
608   }
609   return 0;
610 }
611 uint64_t get_const(struct regstat *cur,signed char reg)
612 {
613   int hr;
614   if(!reg) return 0;
615   for (hr=0;hr<HOST_REGS;hr++) {
616     if(cur->regmap[hr]==reg) {
617       return cur->constmap[hr];
618     }
619   }
620   printf("Unknown constant in r%d\n",reg);
621   exit(1);
622 }
623
624 // Least soon needed registers
625 // Look at the next ten instructions and see which registers
626 // will be used.  Try not to reallocate these.
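// On return, hsn[r] holds the number of instructions until guest register r
// is next used (smaller = needed sooner); registers with a large hsn value
// are the preferred candidates for eviction.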
627 void lsn(u_char hsn[], int i, int *preferred_reg)
628 {
629   int j;
630   int b=-1;
631   for(j=0;j<9;j++)
632   {
633     if(i+j>=slen) {
634       j=slen-i-1;
635       break;
636     }
637     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
638     {
639       // Don't go past an unconditional jump
640       j++;
641       break;
642     }
643   }
644   for(;j>=0;j--)
645   {
646     if(rs1[i+j]) hsn[rs1[i+j]]=j;
647     if(rs2[i+j]) hsn[rs2[i+j]]=j;
648     if(rt1[i+j]) hsn[rt1[i+j]]=j;
649     if(rt2[i+j]) hsn[rt2[i+j]]=j;
650     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
651       // Stores can allocate zero
652       hsn[rs1[i+j]]=j;
653       hsn[rs2[i+j]]=j;
654     }
655     // On some architectures stores need invc_ptr
656     #if defined(HOST_IMM8)
657     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
658       hsn[INVCP]=j;
659     }
660     #endif
661     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
662     {
663       hsn[CCREG]=j;
664       b=j;
665     }
666   }
667   if(b>=0)
668   {
669     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
670     {
671       // Follow first branch
672       int t=(ba[i+b]-start)>>2;
673       j=7-b;if(t+j>=slen) j=slen-t-1;
674       for(;j>=0;j--)
675       {
676         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
677         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
678         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
679         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
680       }
681     }
682     // TODO: preferred register based on backward branch
683   }
684   // Delay slot should preferably not overwrite branch conditions or cycle count
685   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
686     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
687     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
688     hsn[CCREG]=1;
689     // ...or hash tables
690     hsn[RHASH]=1;
691     hsn[RHTBL]=1;
692   }
693   // Coprocessor load/store needs FTEMP, even if not declared
694   if(itype[i]==C1LS||itype[i]==C2LS) {
695     hsn[FTEMP]=0;
696   }
697   // Load L/R also uses FTEMP as a temporary register
698   if(itype[i]==LOADLR) {
699     hsn[FTEMP]=0;
700   }
701   // Also SWL/SWR/SDL/SDR
702   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
703     hsn[FTEMP]=0;
704   }
705   // Don't remove the TLB registers either
706   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS || itype[i]==C2LS) {
707     hsn[TLREG]=0;
708   }
709   // Don't remove the miniht registers
710   if(itype[i]==UJUMP||itype[i]==RJUMP)
711   {
712     hsn[RHASH]=0;
713     hsn[RHTBL]=0;
714   }
715 }
716
717 // We only want to allocate registers if we're going to use them again soon
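// Returns 1 if guest register r is read again within the next few
// instructions (and is not marked unneeded there), 0 otherwise.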
718 int needed_again(int r, int i)
719 {
720   int j;
721   int b=-1;
722   int rn=10;
723   
724   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
725   {
726     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
727       return 0; // Don't need any registers if exiting the block
728   }
729   for(j=0;j<9;j++)
730   {
731     if(i+j>=slen) {
732       j=slen-i-1;
733       break;
734     }
735     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
736     {
737       // Don't go past an unconditional jump
738       j++;
739       break;
740     }
741     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
742     {
743       break;
744     }
745   }
746   for(;j>=1;j--)
747   {
748     if(rs1[i+j]==r) rn=j;
749     if(rs2[i+j]==r) rn=j;
750     if((unneeded_reg[i+j]>>r)&1) rn=10;
751     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
752     {
753       b=j;
754     }
755   }
756   /*
757   if(b>=0)
758   {
759     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
760     {
761       // Follow first branch
762       int o=rn;
763       int t=(ba[i+b]-start)>>2;
764       j=7-b;if(t+j>=slen) j=slen-t-1;
765       for(;j>=0;j--)
766       {
767         if(!((unneeded_reg[t+j]>>r)&1)) {
768           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
769           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
770         }
771         else rn=o;
772       }
773     }
774   }*/
775   if(rn<10) return 1;
776   return 0;
777 }
778
779 // Try to match register allocations at the end of a loop with those
780 // at the beginning
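// If a branch within the lookahead window jumps backwards into this block,
// return the host register that holds r at the branch target's entry point;
// otherwise return hr unchanged.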
781 int loop_reg(int i, int r, int hr)
782 {
783   int j,k;
784   for(j=0;j<9;j++)
785   {
786     if(i+j>=slen) {
787       j=slen-i-1;
788       break;
789     }
790     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
791     {
792       // Don't go past an unconditional jump
793       j++;
794       break;
795     }
796   }
797   k=0;
798   if(i>0){
799     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
800       k--;
801   }
802   for(;k<j;k++)
803   {
804     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
805     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
806     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
807     {
808       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
809       {
810         int t=(ba[i+k]-start)>>2;
811         int reg=get_reg(regs[t].regmap_entry,r);
812         if(reg>=0) return reg;
813         //reg=get_reg(regs[t+1].regmap_entry,r);
814         //if(reg>=0) return reg;
815       }
816     }
817   }
818   return hr;
819 }
820
821
822 // Allocate every register, preserving source/target regs
823 void alloc_all(struct regstat *cur,int i)
824 {
825   int hr;
826   
827   for(hr=0;hr<HOST_REGS;hr++) {
828     if(hr!=EXCLUDE_REG) {
829       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
830          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
831       {
832         cur->regmap[hr]=-1;
833         cur->dirty&=~(1<<hr);
834       }
835       // Don't need zeros
836       if((cur->regmap[hr]&63)==0)
837       {
838         cur->regmap[hr]=-1;
839         cur->dirty&=~(1<<hr);
840       }
841     }
842   }
843 }
844
845
846 void div64(int64_t dividend,int64_t divisor)
847 {
848   lo=dividend/divisor;
849   hi=dividend%divisor;
850   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
851   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
852 }
853 void divu64(uint64_t dividend,uint64_t divisor)
854 {
855   lo=dividend/divisor;
856   hi=dividend%divisor;
857   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
858   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
859 }
860
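// DMULT: signed 64x64 -> 128-bit multiply.  The product is assembled from
// four 32x32-bit partial products and written to hi:lo; the sign is applied
// at the end by negating the 128-bit result.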
861 void mult64(uint64_t m1,uint64_t m2)
862 {
863    unsigned long long int op1, op2, op3, op4;
864    unsigned long long int result1, result2, result3, result4;
865    unsigned long long int temp1, temp2, temp3, temp4;
866    int sign = 0;
867    
868    if ((int64_t)m1 < 0) // m1 arrives as uint64_t; the sign test must use the signed view
869      {
870     op2 = -m1;
871     sign = 1 - sign;
872      }
873    else op2 = m1;
874    if ((int64_t)m2 < 0)
875      {
876     op4 = -m2;
877     sign = 1 - sign;
878      }
879    else op4 = m2;
880    
881    op1 = op2 & 0xFFFFFFFF;
882    op2 = (op2 >> 32) & 0xFFFFFFFF;
883    op3 = op4 & 0xFFFFFFFF;
884    op4 = (op4 >> 32) & 0xFFFFFFFF;
885    
886    temp1 = op1 * op3;
887    temp2 = (temp1 >> 32) + op1 * op4;
888    temp3 = op2 * op3;
889    temp4 = (temp3 >> 32) + op2 * op4;
890    
891    result1 = temp1 & 0xFFFFFFFF;
892    result2 = temp2 + (temp3 & 0xFFFFFFFF);
893    result3 = (result2 >> 32) + temp4;
894    result4 = (result3 >> 32);
895    
896    lo = result1 | (result2 << 32);
897    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
898    if (sign)
899      {
900     hi = ~hi;
901     if (!lo) hi++;
902     else lo = ~lo + 1;
903      }
904 }
905
906 void multu64(uint64_t m1,uint64_t m2)
907 {
908    unsigned long long int op1, op2, op3, op4;
909    unsigned long long int result1, result2, result3, result4;
910    unsigned long long int temp1, temp2, temp3, temp4;
911    
912    op1 = m1 & 0xFFFFFFFF;
913    op2 = (m1 >> 32) & 0xFFFFFFFF;
914    op3 = m2 & 0xFFFFFFFF;
915    op4 = (m2 >> 32) & 0xFFFFFFFF;
916    
917    temp1 = op1 * op3;
918    temp2 = (temp1 >> 32) + op1 * op4;
919    temp3 = op2 * op3;
920    temp4 = (temp3 >> 32) + op2 * op4;
921    
922    result1 = temp1 & 0xFFFFFFFF;
923    result2 = temp2 + (temp3 & 0xFFFFFFFF);
924    result3 = (result2 >> 32) + temp4;
925    result4 = (result3 >> 32);
926    
927    lo = result1 | (result2 << 32);
928    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
929    
930   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
931   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
932 }
933
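// Helpers for the unaligned 64-bit loads (LDL/LDR): merge the newly loaded
// doubleword into the previous register value.  ldl_merge() keeps the low
// 'bits' bits of the original value and takes the remainder from 'loaded'
// shifted left by 'bits'; ldr_merge() is the mirror image for LDR.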
934 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
935 {
936   if(bits) {
937     original<<=64-bits;
938     original>>=64-bits;
939     loaded<<=bits;
940     original|=loaded;
941   }
942   else original=loaded;
943   return original;
944 }
945 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
946 {
947   if(bits^56) {
948     original>>=64-(bits^56);
949     original<<=64-(bits^56);
950     loaded>>=bits^56;
951     original|=loaded;
952   }
953   else original=loaded;
954   return original;
955 }
956
957 #ifdef __i386__
958 #include "assem_x86.c"
959 #endif
960 #ifdef __x86_64__
961 #include "assem_x64.c"
962 #endif
963 #ifdef __arm__
964 #include "assem_arm.c"
965 #endif
966
967 // Add virtual address mapping to linked list
968 void ll_add(struct ll_entry **head,int vaddr,void *addr)
969 {
970   struct ll_entry *new_entry;
971   new_entry=malloc(sizeof(struct ll_entry));
972   assert(new_entry!=NULL);
973   new_entry->vaddr=vaddr;
974   new_entry->reg32=0;
975   new_entry->addr=addr;
976   new_entry->next=*head;
977   *head=new_entry;
978 }
979
980 // Add virtual address mapping for 32-bit compiled block
981 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
982 {
983   ll_add(head,vaddr,addr);
984 #ifndef FORCE32
985   (*head)->reg32=reg32;
986 #endif
987 }
988
989 // Check if an address is already compiled
990 // but don't return addresses which are about to expire from the cache
991 void *check_addr(u_int vaddr)
992 {
993   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
994   if(ht_bin[0]==vaddr) {
995     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
996       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
997   }
998   if(ht_bin[2]==vaddr) {
999     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1000       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
1001   }
1002   u_int page=get_page(vaddr);
1003   struct ll_entry *head;
1004   head=jump_in[page];
1005   while(head!=NULL) {
1006     if(head->vaddr==vaddr&&head->reg32==0) {
1007       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1008         // Update existing entry with current address
1009         if(ht_bin[0]==vaddr) {
1010           ht_bin[1]=(int)head->addr;
1011           return head->addr;
1012         }
1013         if(ht_bin[2]==vaddr) {
1014           ht_bin[3]=(int)head->addr;
1015           return head->addr;
1016         }
1017         // Insert into hash table with low priority.
1018         // Don't evict existing entries, as they are probably
1019         // addresses that are being accessed frequently.
1020         if(ht_bin[0]==-1) {
1021           ht_bin[1]=(int)head->addr;
1022           ht_bin[0]=vaddr;
1023         }else if(ht_bin[2]==-1) {
1024           ht_bin[3]=(int)head->addr;
1025           ht_bin[2]=vaddr;
1026         }
1027         return head->addr;
1028       }
1029     }
1030     head=head->next;
1031   }
1032   return 0;
1033 }
1034
1035 void remove_hash(int vaddr)
1036 {
1037   //printf("remove hash: %x\n",vaddr);
1038   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1039   if(ht_bin[2]==vaddr) {
1040     ht_bin[2]=ht_bin[3]=-1;
1041   }
1042   if(ht_bin[0]==vaddr) {
1043     ht_bin[0]=ht_bin[2];
1044     ht_bin[1]=ht_bin[3];
1045     ht_bin[2]=ht_bin[3]=-1;
1046   }
1047 }
1048
1049 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1050 {
1051   struct ll_entry *next;
1052   while(*head) {
1053     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1054        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1055     {
1056       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1057       remove_hash((*head)->vaddr);
1058       next=(*head)->next;
1059       free(*head);
1060       *head=next;
1061     }
1062     else
1063     {
1064       head=&((*head)->next);
1065     }
1066   }
1067 }
1068
1069 // Remove all entries from linked list
1070 void ll_clear(struct ll_entry **head)
1071 {
1072   struct ll_entry *cur;
1073   struct ll_entry *next;
1074   if((cur=*head)!=NULL) {
1075     *head=0;
1076     while(cur) {
1077       next=cur->next;
1078       free(cur);
1079       cur=next;
1080     }
1081   }
1082 }
1083
1084 // Follow each recorded pointer and kill it if its target falls in the given address range
1085 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1086 {
1087   while(head) {
1088     int ptr=get_pointer(head->addr);
1089     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1090     if(((ptr>>shift)==(addr>>shift)) ||
1091        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1092     {
1093       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1094       u_int host_addr=(u_int)kill_pointer(head->addr);
1095       #ifdef __arm__
1096         needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1097       #endif
1098     }
1099     head=head->next;
1100   }
1101 }
1102
1103 // This is called when we write to a compiled block (see do_invstub)
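// All jump_in entries for the page are freed (so the next lookup has to
// recompile or revive the block), and every direct jump recorded in jump_out
// is killed so that it gets re-resolved through the dynamic linker.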
1104 void invalidate_page(u_int page)
1105 {
1106   struct ll_entry *head;
1107   struct ll_entry *next;
1108   head=jump_in[page];
1109   jump_in[page]=0;
1110   while(head!=NULL) {
1111     inv_debug("INVALIDATE: %x\n",head->vaddr);
1112     remove_hash(head->vaddr);
1113     next=head->next;
1114     free(head);
1115     head=next;
1116   }
1117   head=jump_out[page];
1118   jump_out[page]=0;
1119   while(head!=NULL) {
1120     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1121     u_int host_addr=(u_int)kill_pointer(head->addr);
1122     #ifdef __arm__
1123       needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1124     #endif
1125     next=head->next;
1126     free(head);
1127     head=next;
1128   }
1129 }
1130 void invalidate_block(u_int block)
1131 {
1132   u_int page=get_page(block<<12);
1133   u_int vpage=get_vpage(block<<12);
1134   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1135   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1136   u_int first,last;
1137   first=last=page;
1138   struct ll_entry *head;
1139   head=jump_dirty[vpage];
1140   //printf("page=%d vpage=%d\n",page,vpage);
1141   while(head!=NULL) {
1142     u_int start,end;
1143     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1144       get_bounds((int)head->addr,&start,&end);
1145       //printf("start: %x end: %x\n",start,end);
1146       if(page<2048&&start>=0x80000000&&end<0x80000000+RAM_SIZE) {
1147         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1148           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1149           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1150         }
1151       }
1152 #ifndef DISABLE_TLB
1153       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1154         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1155           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1156           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1157         }
1158       }
1159 #endif
1160     }
1161     head=head->next;
1162   }
1163   //printf("first=%d last=%d\n",first,last);
1164   invalidate_page(page);
1165   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1166   assert(last<page+5);
1167   // Invalidate the adjacent pages if a block crosses a 4K boundary
1168   while(first<page) {
1169     invalidate_page(first);
1170     first++;
1171   }
1172   for(first=page+1;first<last;first++) {
1173     invalidate_page(first);
1174   }
1175   #ifdef __arm__
1176     do_clear_cache();
1177   #endif
1178   
1179   // Don't trap writes
1180   invalid_code[block]=1;
1181 #ifdef PCSX
1182   invalid_code[((u_int)0x80000000>>12)|page]=1;
1183 #endif
1184 #ifndef DISABLE_TLB
1185   // If there is a valid TLB entry for this page, remove write protect
1186   if(tlb_LUT_w[block]) {
1187     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1188     // CHECK: Is this right?
1189     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1190     u_int real_block=tlb_LUT_w[block]>>12;
1191     invalid_code[real_block]=1;
1192     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1193   }
1194   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1195 #endif
1196
1197   #ifdef USE_MINI_HT
1198   memset(mini_ht,-1,sizeof(mini_ht));
1199   #endif
1200 }
1201 void invalidate_addr(u_int addr)
1202 {
1203   invalidate_block(addr>>12);
1204 }
1205 // This is called when loading a save state.
1206 // Anything could have changed, so invalidate everything.
1207 void invalidate_all_pages()
1208 {
1209   u_int page,n;
1210   for(page=0;page<4096;page++)
1211     invalidate_page(page);
1212   for(page=0;page<1048576;page++)
1213     if(!invalid_code[page]) {
1214       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1215       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1216     }
1217   #ifdef __arm__
1218   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1219   #endif
1220   #ifdef USE_MINI_HT
1221   memset(mini_ht,-1,sizeof(mini_ht));
1222   #endif
1223   #ifndef DISABLE_TLB
1224   // TLB
1225   for(page=0;page<0x100000;page++) {
1226     if(tlb_LUT_r[page]) {
1227       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1228       if(!tlb_LUT_w[page]||!invalid_code[page])
1229         memory_map[page]|=0x40000000; // Write protect
1230     }
1231     else memory_map[page]=-1;
1232     if(page==0x80000) page=0xC0000;
1233   }
1234   tlb_hacks();
1235   #endif
1236 }
1237
1238 // Add an entry to jump_out after making a link
1239 void add_link(u_int vaddr,void *src)
1240 {
1241   u_int page=get_page(vaddr);
1242   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1243   int *ptr=(int *)(src+4);
1244   assert((*ptr&0x0fff0000)==0x059f0000);
1245   ll_add(jump_out+page,vaddr,src);
1246   //int ptr=get_pointer(src);
1247   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1248 }
1249
1250 // If a code block was found to be unmodified (bit was set in
1251 // restore_candidate) and it remains unmodified (bit is clear
1252 // in invalid_code) then move the entries for that 4K page from
1253 // the dirty list to the clean list.
1254 void clean_blocks(u_int page)
1255 {
1256   struct ll_entry *head;
1257   inv_debug("INV: clean_blocks page=%d\n",page);
1258   head=jump_dirty[page];
1259   while(head!=NULL) {
1260     if(!invalid_code[head->vaddr>>12]) {
1261       // Don't restore blocks which are about to expire from the cache
1262       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1263         u_int start,end;
1264         if(verify_dirty((int)head->addr)) {
1265           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1266           u_int i;
1267           u_int inv=0;
1268           get_bounds((int)head->addr,&start,&end);
1269           if(start-(u_int)rdram<RAM_SIZE) {
1270             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1271               inv|=invalid_code[i];
1272             }
1273           }
1274           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1275             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1276             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1277             if(addr<start||addr>=end) inv=1;
1278           }
1279           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1280             inv=1;
1281           }
1282           if(!inv) {
1283             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1284             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1285               u_int ppage=page;
1286 #ifndef DISABLE_TLB
1287               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1288 #endif
1289               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1290               //printf("page=%x, addr=%x\n",page,head->vaddr);
1291               //assert(head->vaddr>>12==(page|0x80000));
1292               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1293               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1294               if(!head->reg32) {
1295                 if(ht_bin[0]==head->vaddr) {
1296                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1297                 }
1298                 if(ht_bin[2]==head->vaddr) {
1299                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1300                 }
1301               }
1302             }
1303           }
1304         }
1305       }
1306     }
1307     head=head->next;
1308   }
1309 }
1310
1311
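// Per-instruction-type register allocators.  Conventions used throughout:
//  - current->is32 bit set    : guest register is known 32-bit (sign-extended)
//  - current->u / current->uu : result (lower/upper half) is unneeded later
//  - alloc_reg()/alloc_reg64(): bind a guest register to one/two host registers
//  - dirty_reg()              : mark the result as needing write-back
//  - set_const()/clear_const(): maintain constant-propagation state
//  - minimum_free_regs[i]     : free host registers the assembler needs for temporaries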
1312 void mov_alloc(struct regstat *current,int i)
1313 {
1314   // Note: Don't need to actually alloc the source registers
1315   if((~current->is32>>rs1[i])&1) {
1316     //alloc_reg64(current,i,rs1[i]);
1317     alloc_reg64(current,i,rt1[i]);
1318     current->is32&=~(1LL<<rt1[i]);
1319   } else {
1320     //alloc_reg(current,i,rs1[i]);
1321     alloc_reg(current,i,rt1[i]);
1322     current->is32|=(1LL<<rt1[i]);
1323   }
1324   clear_const(current,rs1[i]);
1325   clear_const(current,rt1[i]);
1326   dirty_reg(current,rt1[i]);
1327 }
1328
1329 void shiftimm_alloc(struct regstat *current,int i)
1330 {
1331   clear_const(current,rs1[i]);
1332   clear_const(current,rt1[i]);
1333   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1334   {
1335     if(rt1[i]) {
1336       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1337       else lt1[i]=rs1[i];
1338       alloc_reg(current,i,rt1[i]);
1339       current->is32|=1LL<<rt1[i];
1340       dirty_reg(current,rt1[i]);
1341     }
1342   }
1343   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1344   {
1345     if(rt1[i]) {
1346       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1347       alloc_reg64(current,i,rt1[i]);
1348       current->is32&=~(1LL<<rt1[i]);
1349       dirty_reg(current,rt1[i]);
1350     }
1351   }
1352   if(opcode2[i]==0x3c) // DSLL32
1353   {
1354     if(rt1[i]) {
1355       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1356       alloc_reg64(current,i,rt1[i]);
1357       current->is32&=~(1LL<<rt1[i]);
1358       dirty_reg(current,rt1[i]);
1359     }
1360   }
1361   if(opcode2[i]==0x3e) // DSRL32
1362   {
1363     if(rt1[i]) {
1364       alloc_reg64(current,i,rs1[i]);
1365       if(imm[i]==32) {
1366         alloc_reg64(current,i,rt1[i]);
1367         current->is32&=~(1LL<<rt1[i]);
1368       } else {
1369         alloc_reg(current,i,rt1[i]);
1370         current->is32|=1LL<<rt1[i];
1371       }
1372       dirty_reg(current,rt1[i]);
1373     }
1374   }
1375   if(opcode2[i]==0x3f) // DSRA32
1376   {
1377     if(rt1[i]) {
1378       alloc_reg64(current,i,rs1[i]);
1379       alloc_reg(current,i,rt1[i]);
1380       current->is32|=1LL<<rt1[i];
1381       dirty_reg(current,rt1[i]);
1382     }
1383   }
1384 }
1385
1386 void shift_alloc(struct regstat *current,int i)
1387 {
1388   if(rt1[i]) {
1389     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1390     {
1391       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1392       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1393       alloc_reg(current,i,rt1[i]);
1394       if(rt1[i]==rs2[i]) {
1395         alloc_reg_temp(current,i,-1);
1396         minimum_free_regs[i]=1;
1397       }
1398       current->is32|=1LL<<rt1[i];
1399     } else { // DSLLV/DSRLV/DSRAV
1400       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1401       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1402       alloc_reg64(current,i,rt1[i]);
1403       current->is32&=~(1LL<<rt1[i]);
1404       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1405       {
1406         alloc_reg_temp(current,i,-1);
1407         minimum_free_regs[i]=1;
1408       }
1409     }
1410     clear_const(current,rs1[i]);
1411     clear_const(current,rs2[i]);
1412     clear_const(current,rt1[i]);
1413     dirty_reg(current,rt1[i]);
1414   }
1415 }
1416
1417 void alu_alloc(struct regstat *current,int i)
1418 {
1419   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1420     if(rt1[i]) {
1421       if(rs1[i]&&rs2[i]) {
1422         alloc_reg(current,i,rs1[i]);
1423         alloc_reg(current,i,rs2[i]);
1424       }
1425       else {
1426         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1427         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1428       }
1429       alloc_reg(current,i,rt1[i]);
1430     }
1431     current->is32|=1LL<<rt1[i];
1432   }
1433   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1434     if(rt1[i]) {
1435       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1436       {
1437         alloc_reg64(current,i,rs1[i]);
1438         alloc_reg64(current,i,rs2[i]);
1439         alloc_reg(current,i,rt1[i]);
1440       } else {
1441         alloc_reg(current,i,rs1[i]);
1442         alloc_reg(current,i,rs2[i]);
1443         alloc_reg(current,i,rt1[i]);
1444       }
1445     }
1446     current->is32|=1LL<<rt1[i];
1447   }
1448   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1449     if(rt1[i]) {
1450       if(rs1[i]&&rs2[i]) {
1451         alloc_reg(current,i,rs1[i]);
1452         alloc_reg(current,i,rs2[i]);
1453       }
1454       else
1455       {
1456         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1457         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1458       }
1459       alloc_reg(current,i,rt1[i]);
1460       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1461       {
1462         if(!((current->uu>>rt1[i])&1)) {
1463           alloc_reg64(current,i,rt1[i]);
1464         }
1465         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1466           if(rs1[i]&&rs2[i]) {
1467             alloc_reg64(current,i,rs1[i]);
1468             alloc_reg64(current,i,rs2[i]);
1469           }
1470           else
1471           {
1472             // Is it really worth it to keep 64-bit values in registers?
1473             #ifdef NATIVE_64BIT
1474             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1475             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1476             #endif
1477           }
1478         }
1479         current->is32&=~(1LL<<rt1[i]);
1480       } else {
1481         current->is32|=1LL<<rt1[i];
1482       }
1483     }
1484   }
1485   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1486     if(rt1[i]) {
1487       if(rs1[i]&&rs2[i]) {
1488         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1489           alloc_reg64(current,i,rs1[i]);
1490           alloc_reg64(current,i,rs2[i]);
1491           alloc_reg64(current,i,rt1[i]);
1492         } else {
1493           alloc_reg(current,i,rs1[i]);
1494           alloc_reg(current,i,rs2[i]);
1495           alloc_reg(current,i,rt1[i]);
1496         }
1497       }
1498       else {
1499         alloc_reg(current,i,rt1[i]);
1500         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1501           // DADD used as move, or zeroing
1502           // If we have a 64-bit source, then make the target 64 bits too
1503           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1504             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1505             alloc_reg64(current,i,rt1[i]);
1506           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1507             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1508             alloc_reg64(current,i,rt1[i]);
1509           }
1510           if(opcode2[i]>=0x2e&&rs2[i]) {
1511             // DSUB used as negation - 64-bit result
1512             // If we have a 32-bit register, extend it to 64 bits
1513             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1514             alloc_reg64(current,i,rt1[i]);
1515           }
1516         }
1517       }
1518       if(rs1[i]&&rs2[i]) {
1519         current->is32&=~(1LL<<rt1[i]);
1520       } else if(rs1[i]) {
1521         current->is32&=~(1LL<<rt1[i]);
1522         if((current->is32>>rs1[i])&1)
1523           current->is32|=1LL<<rt1[i];
1524       } else if(rs2[i]) {
1525         current->is32&=~(1LL<<rt1[i]);
1526         if((current->is32>>rs2[i])&1)
1527           current->is32|=1LL<<rt1[i];
1528       } else {
1529         current->is32|=1LL<<rt1[i];
1530       }
1531     }
1532   }
1533   clear_const(current,rs1[i]);
1534   clear_const(current,rs2[i]);
1535   clear_const(current,rt1[i]);
1536   dirty_reg(current,rt1[i]);
1537 }
1538
1539 void imm16_alloc(struct regstat *current,int i)
1540 {
1541   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1542   else lt1[i]=rs1[i];
1543   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1544   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1545     current->is32&=~(1LL<<rt1[i]);
1546     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1547       // TODO: Could preserve the 32-bit flag if the immediate is zero
1548       alloc_reg64(current,i,rt1[i]);
1549       alloc_reg64(current,i,rs1[i]);
1550     }
1551     clear_const(current,rs1[i]);
1552     clear_const(current,rt1[i]);
1553   }
1554   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1555     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1556     current->is32|=1LL<<rt1[i];
1557     clear_const(current,rs1[i]);
1558     clear_const(current,rt1[i]);
1559   }
1560   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1561     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1562       if(rs1[i]!=rt1[i]) {
1563         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1564         alloc_reg64(current,i,rt1[i]);
1565         current->is32&=~(1LL<<rt1[i]);
1566       }
1567     }
1568     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1569     if(is_const(current,rs1[i])) {
1570       int v=get_const(current,rs1[i]);
1571       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1572       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1573       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1574     }
1575     else clear_const(current,rt1[i]);
1576   }
1577   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1578     if(is_const(current,rs1[i])) {
1579       int v=get_const(current,rs1[i]);
1580       set_const(current,rt1[i],v+imm[i]);
1581     }
1582     else clear_const(current,rt1[i]);
1583     current->is32|=1LL<<rt1[i];
1584   }
1585   else {
1586     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1587     current->is32|=1LL<<rt1[i];
1588   }
1589   dirty_reg(current,rt1[i]);
1590 }
1591
1592 void load_alloc(struct regstat *current,int i)
1593 {
1594   clear_const(current,rt1[i]);
1595   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1596   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1597   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1598   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1599     alloc_reg(current,i,rt1[i]);
1600     assert(get_reg(current->regmap,rt1[i])>=0);
1601     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1602     {
1603       current->is32&=~(1LL<<rt1[i]);
1604       alloc_reg64(current,i,rt1[i]);
1605     }
1606     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1607     {
1608       current->is32&=~(1LL<<rt1[i]);
1609       alloc_reg64(current,i,rt1[i]);
1610       alloc_all(current,i);
1611       alloc_reg64(current,i,FTEMP);
1612       minimum_free_regs[i]=HOST_REGS;
1613     }
1614     else current->is32|=1LL<<rt1[i];
1615     dirty_reg(current,rt1[i]);
1616     // If using TLB, need a register for pointer to the mapping table
1617     if(using_tlb) alloc_reg(current,i,TLREG);
1618     // LWL/LWR need a temporary register for the old value
1619     if(opcode[i]==0x22||opcode[i]==0x26)
1620     {
1621       alloc_reg(current,i,FTEMP);
1622       alloc_reg_temp(current,i,-1);
1623       minimum_free_regs[i]=1;
1624     }
1625   }
1626   else
1627   {
1628     // Load to r0 or unneeded register (dummy load)
1629     // but we still need a register to calculate the address
1630     if(opcode[i]==0x22||opcode[i]==0x26)
1631     {
1632       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1633     }
1634     // If using TLB, need a register for pointer to the mapping table
1635     if(using_tlb) alloc_reg(current,i,TLREG);
1636     alloc_reg_temp(current,i,-1);
1637     minimum_free_regs[i]=1;
1638     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1639     {
1640       alloc_all(current,i);
1641       alloc_reg64(current,i,FTEMP);
1642       minimum_free_regs[i]=HOST_REGS;
1643     }
1644   }
1645 }
1646
1647 void store_alloc(struct regstat *current,int i)
1648 {
1649   clear_const(current,rs2[i]);
1650   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1651   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1652   alloc_reg(current,i,rs2[i]);
1653   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1654     alloc_reg64(current,i,rs2[i]);
1655     if(rs2[i]) alloc_reg(current,i,FTEMP);
1656   }
1657   // If using TLB, need a register for pointer to the mapping table
1658   if(using_tlb) alloc_reg(current,i,TLREG);
1659   #if defined(HOST_IMM8)
1660   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1661   else alloc_reg(current,i,INVCP);
1662   #endif
1663   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1664     alloc_reg(current,i,FTEMP);
1665   }
1666   // We need a temporary register for address generation
1667   alloc_reg_temp(current,i,-1);
1668   minimum_free_regs[i]=1;
1669 }
1670
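// Register allocation for COP1 loads/stores (LWC1/LDC1/SWC1/SDC1): needs the
// COP1 status register (CSREG) and FTEMP for the data.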
1671 void c1ls_alloc(struct regstat *current,int i)
1672 {
1673   //clear_const(current,rs1[i]); // FIXME
1674   clear_const(current,rt1[i]);
1675   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1676   alloc_reg(current,i,CSREG); // Status
1677   alloc_reg(current,i,FTEMP);
1678   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1679     alloc_reg64(current,i,FTEMP);
1680   }
1681   // If using TLB, need a register for pointer to the mapping table
1682   if(using_tlb) alloc_reg(current,i,TLREG);
1683   #if defined(HOST_IMM8)
1684   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1685   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1686     alloc_reg(current,i,INVCP);
1687   #endif
1688   // We need a temporary register for address generation
1689   alloc_reg_temp(current,i,-1);
1690 }
1691
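// Register allocation for COP2 loads/stores (LWC2/SWC2 etc.): like c1ls_alloc,
// but no COP1 status register is needed.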
1692 void c2ls_alloc(struct regstat *current,int i)
1693 {
1694   clear_const(current,rt1[i]);
1695   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1696   alloc_reg(current,i,FTEMP);
1697   // If using TLB, need a register for pointer to the mapping table
1698   if(using_tlb) alloc_reg(current,i,TLREG);
1699   #if defined(HOST_IMM8)
1700   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1701   else if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1702     alloc_reg(current,i,INVCP);
1703   #endif
1704   // We need a temporary register for address generation
1705   alloc_reg_temp(current,i,-1);
1706   minimum_free_regs[i]=1;
1707 }
1708
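// Generic multiply/divide allocation; an architecture-specific version can
// override this by defining multdiv_alloc in its assembler header.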
1709 #ifndef multdiv_alloc
1710 void multdiv_alloc(struct regstat *current,int i)
1711 {
1712   //  case 0x18: MULT
1713   //  case 0x19: MULTU
1714   //  case 0x1A: DIV
1715   //  case 0x1B: DIVU
1716   //  case 0x1C: DMULT
1717   //  case 0x1D: DMULTU
1718   //  case 0x1E: DDIV
1719   //  case 0x1F: DDIVU
1720   clear_const(current,rs1[i]);
1721   clear_const(current,rs2[i]);
1722   if(rs1[i]&&rs2[i])
1723   {
1724     if((opcode2[i]&4)==0) // 32-bit
1725     {
1726       current->u&=~(1LL<<HIREG);
1727       current->u&=~(1LL<<LOREG);
1728       alloc_reg(current,i,HIREG);
1729       alloc_reg(current,i,LOREG);
1730       alloc_reg(current,i,rs1[i]);
1731       alloc_reg(current,i,rs2[i]);
1732       current->is32|=1LL<<HIREG;
1733       current->is32|=1LL<<LOREG;
1734       dirty_reg(current,HIREG);
1735       dirty_reg(current,LOREG);
1736     }
1737     else // 64-bit
1738     {
1739       current->u&=~(1LL<<HIREG);
1740       current->u&=~(1LL<<LOREG);
1741       current->uu&=~(1LL<<HIREG);
1742       current->uu&=~(1LL<<LOREG);
1743       alloc_reg64(current,i,HIREG);
1744       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1745       alloc_reg64(current,i,rs1[i]);
1746       alloc_reg64(current,i,rs2[i]);
1747       alloc_all(current,i);
1748       current->is32&=~(1LL<<HIREG);
1749       current->is32&=~(1LL<<LOREG);
1750       dirty_reg(current,HIREG);
1751       dirty_reg(current,LOREG);
1752       minimum_free_regs[i]=HOST_REGS;
1753     }
1754   }
1755   else
1756   {
1757     // Multiply by zero is zero.
1758     // MIPS does not have a divide by zero exception.
1759     // The result is undefined, we return zero.
1760     alloc_reg(current,i,HIREG);
1761     alloc_reg(current,i,LOREG);
1762     current->is32|=1LL<<HIREG;
1763     current->is32|=1LL<<LOREG;
1764     dirty_reg(current,HIREG);
1765     dirty_reg(current,LOREG);
1766   }
1767 }
1768 #endif
1769
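// COP0 moves (MFC0/MTC0) and TLB/ERET instructions can clobber almost
// anything, so alloc_all is used and no registers are assumed to stay free.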
1770 void cop0_alloc(struct regstat *current,int i)
1771 {
1772   if(opcode2[i]==0) // MFC0
1773   {
1774     if(rt1[i]) {
1775       clear_const(current,rt1[i]);
1776       alloc_all(current,i);
1777       alloc_reg(current,i,rt1[i]);
1778       current->is32|=1LL<<rt1[i];
1779       dirty_reg(current,rt1[i]);
1780     }
1781   }
1782   else if(opcode2[i]==4) // MTC0
1783   {
1784     if(rs1[i]){
1785       clear_const(current,rs1[i]);
1786       alloc_reg(current,i,rs1[i]);
1787       alloc_all(current,i);
1788     }
1789     else {
1790       alloc_all(current,i); // FIXME: Keep r0
1791       current->u&=~1LL;
1792       alloc_reg(current,i,0);
1793     }
1794   }
1795   else
1796   {
1797     // TLBR/TLBWI/TLBWR/TLBP/ERET
1798     assert(opcode2[i]==0x10);
1799     alloc_all(current,i);
1800   }
1801   minimum_free_regs[i]=HOST_REGS;
1802 }
1803
1804 void cop1_alloc(struct regstat *current,int i)
1805 {
1806   alloc_reg(current,i,CSREG); // Load status
1807   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1808   {
1809     if(rt1[i]){
1810       clear_const(current,rt1[i]);
1811       if(opcode2[i]==1) {
1812         alloc_reg64(current,i,rt1[i]); // DMFC1
1813         current->is32&=~(1LL<<rt1[i]);
1814       }else{
1815         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1816         current->is32|=1LL<<rt1[i];
1817       }
1818       dirty_reg(current,rt1[i]);
1819     }
1820     alloc_reg_temp(current,i,-1);
1821   }
1822   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1823   {
1824     if(rs1[i]){
1825       clear_const(current,rs1[i]);
1826       if(opcode2[i]==5)
1827         alloc_reg64(current,i,rs1[i]); // DMTC1
1828       else
1829         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1830       alloc_reg_temp(current,i,-1);
1831     }
1832     else {
1833       current->u&=~1LL;
1834       alloc_reg(current,i,0);
1835       alloc_reg_temp(current,i,-1);
1836     }
1837   }
1838   minimum_free_regs[i]=1;
1839 }
1840 void fconv_alloc(struct regstat *current,int i)
1841 {
1842   alloc_reg(current,i,CSREG); // Load status
1843   alloc_reg_temp(current,i,-1);
1844   minimum_free_regs[i]=1;
1845 }
1846 void float_alloc(struct regstat *current,int i)
1847 {
1848   alloc_reg(current,i,CSREG); // Load status
1849   alloc_reg_temp(current,i,-1);
1850   minimum_free_regs[i]=1;
1851 }
1852 void c2op_alloc(struct regstat *current,int i)
1853 {
1854   alloc_reg_temp(current,i,-1);
1855 }
1856 void fcomp_alloc(struct regstat *current,int i)
1857 {
1858   alloc_reg(current,i,CSREG); // Load status
1859   alloc_reg(current,i,FSREG); // Load flags
1860   dirty_reg(current,FSREG); // Flag will be modified
1861   alloc_reg_temp(current,i,-1);
1862   minimum_free_regs[i]=1;
1863 }
1864
1865 void syscall_alloc(struct regstat *current,int i)
1866 {
1867   alloc_cc(current,i);
1868   dirty_reg(current,CCREG);
1869   alloc_all(current,i);
1870   minimum_free_regs[i]=HOST_REGS;
1871   current->isconst=0;
1872 }
1873
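// Dispatch register allocation for the instruction in a branch delay slot
// according to its decoded type.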
1874 void delayslot_alloc(struct regstat *current,int i)
1875 {
1876   switch(itype[i]) {
1877     case UJUMP:
1878     case CJUMP:
1879     case SJUMP:
1880     case RJUMP:
1881     case FJUMP:
1882     case SYSCALL:
1883     case HLECALL:
1884     case SPAN:
1885       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1886       printf("Disabled speculative precompilation\n");
1887       stop_after_jal=1;
1888       break;
1889     case IMM16:
1890       imm16_alloc(current,i);
1891       break;
1892     case LOAD:
1893     case LOADLR:
1894       load_alloc(current,i);
1895       break;
1896     case STORE:
1897     case STORELR:
1898       store_alloc(current,i);
1899       break;
1900     case ALU:
1901       alu_alloc(current,i);
1902       break;
1903     case SHIFT:
1904       shift_alloc(current,i);
1905       break;
1906     case MULTDIV:
1907       multdiv_alloc(current,i);
1908       break;
1909     case SHIFTIMM:
1910       shiftimm_alloc(current,i);
1911       break;
1912     case MOV:
1913       mov_alloc(current,i);
1914       break;
1915     case COP0:
1916       cop0_alloc(current,i);
1917       break;
1918     case COP1:
1919     case COP2:
1920       cop1_alloc(current,i);
1921       break;
1922     case C1LS:
1923       c1ls_alloc(current,i);
1924       break;
1925     case C2LS:
1926       c2ls_alloc(current,i);
1927       break;
1928     case FCONV:
1929       fconv_alloc(current,i);
1930       break;
1931     case FLOAT:
1932       float_alloc(current,i);
1933       break;
1934     case FCOMP:
1935       fcomp_alloc(current,i);
1936       break;
1937     case C2OP:
1938       c2op_alloc(current,i);
1939       break;
1940   }
1941 }
1942
1943 // Special case where a branch and delay slot span two pages in virtual memory
1944 static void pagespan_alloc(struct regstat *current,int i)
1945 {
1946   current->isconst=0;
1947   current->wasconst=0;
1948   regs[i].wasconst=0;
1949   minimum_free_regs[i]=HOST_REGS;
1950   alloc_all(current,i);
1951   alloc_cc(current,i);
1952   dirty_reg(current,CCREG);
1953   if(opcode[i]==3) // JAL
1954   {
1955     alloc_reg(current,i,31);
1956     dirty_reg(current,31);
1957   }
1958   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1959   {
1960     alloc_reg(current,i,rs1[i]);
1961     if (rt1[i]!=0) {
1962       alloc_reg(current,i,rt1[i]);
1963       dirty_reg(current,rt1[i]);
1964     }
1965   }
1966   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1967   {
1968     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1969     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1970     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1971     {
1972       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1973       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1974     }
1975   }
1976   else
1977   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1978   {
1979     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1980     if(!((current->is32>>rs1[i])&1))
1981     {
1982       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1983     }
1984   }
1985   else
1986   if(opcode[i]==0x11) // BC1
1987   {
1988     alloc_reg(current,i,FSREG);
1989     alloc_reg(current,i,CSREG);
1990   }
1991   //else ...
1992 }
1993
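// Queue an out-of-line stub to be generated after the main block: records the
// stub type, the address to patch, the return address and up to five arguments,
// e.g. add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);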
1994 void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1995 {
1996   stubs[stubcount][0]=type;
1997   stubs[stubcount][1]=addr;
1998   stubs[stubcount][2]=retaddr;
1999   stubs[stubcount][3]=a;
2000   stubs[stubcount][4]=b;
2001   stubs[stubcount][5]=c;
2002   stubs[stubcount][6]=d;
2003   stubs[stubcount][7]=e;
2004   stubcount++;
2005 }
2006
2007 // Write out a single register
2008 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
2009 {
2010   int hr;
2011   for(hr=0;hr<HOST_REGS;hr++) {
2012     if(hr!=EXCLUDE_REG) {
2013       if((regmap[hr]&63)==r) {
2014         if((dirty>>hr)&1) {
2015           if(regmap[hr]<64) {
2016             emit_storereg(r,hr);
2017 #ifndef FORCE32
2018             if((is32>>regmap[hr])&1) {
2019               emit_sarimm(hr,31,hr);
2020               emit_storereg(r|64,hr);
2021             }
2022 #endif
2023           }else{
2024             emit_storereg(r|64,hr);
2025           }
2026         }
2027       }
2028     }
2029   }
2030 }
2031
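// Debugging helpers: checksums over RDRAM and the register file, and register
// trace output.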
2032 int mchecksum()
2033 {
2034   //if(!tracedebug) return 0;
2035   int i;
2036   int sum=0;
2037   for(i=0;i<2097152;i++) {
2038     unsigned int temp=sum;
2039     sum<<=1;
2040     sum|=(~temp)>>31;
2041     sum^=((u_int *)rdram)[i];
2042   }
2043   return sum;
2044 }
2045 int rchecksum()
2046 {
2047   int i;
2048   int sum=0;
2049   for(i=0;i<64;i++)
2050     sum^=((u_int *)reg)[i];
2051   return sum;
2052 }
2053 void rlist()
2054 {
2055   int i;
2056   printf("TRACE: ");
2057   for(i=0;i<32;i++)
2058     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
2059   printf("\n");
2060 #ifndef DISABLE_COP1
2061   printf("TRACE: ");
2062   for(i=0;i<32;i++)
2063     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
2064   printf("\n");
2065 #endif
2066 }
2067
2068 void enabletrace()
2069 {
2070   tracedebug=1;
2071 }
2072
2073 void memdebug(int i)
2074 {
2075   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
2076   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
2077   //rlist();
2078   //if(tracedebug) {
2079   //if(Count>=-2084597794) {
2080   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
2081   //if(0) {
2082     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
2083     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2084     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2085     rlist();
2086     #ifdef __i386__
2087     printf("TRACE: %x\n",(&i)[-1]);
2088     #endif
2089     #ifdef __arm__
2090     int j;
2091     printf("TRACE: %x \n",(&j)[10]);
2092     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2093     #endif
2094     //fflush(stdout);
2095   }
2096   //printf("TRACE: %x\n",(&i)[-1]);
2097 }
2098
2099 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2100 {
2101   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2102 }
2103
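// Emit native code for register-register ALU instructions: ADD/ADDU/SUB/SUBU,
// DADD/DADDU/DSUB/DSUBU, SLT/SLTU and AND/OR/XOR/NOR, with separate 32-bit and
// 64-bit paths.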
2104 void alu_assemble(int i,struct regstat *i_regs)
2105 {
2106   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2107     if(rt1[i]) {
2108       signed char s1,s2,t;
2109       t=get_reg(i_regs->regmap,rt1[i]);
2110       if(t>=0) {
2111         s1=get_reg(i_regs->regmap,rs1[i]);
2112         s2=get_reg(i_regs->regmap,rs2[i]);
2113         if(rs1[i]&&rs2[i]) {
2114           assert(s1>=0);
2115           assert(s2>=0);
2116           if(opcode2[i]&2) emit_sub(s1,s2,t);
2117           else emit_add(s1,s2,t);
2118         }
2119         else if(rs1[i]) {
2120           if(s1>=0) emit_mov(s1,t);
2121           else emit_loadreg(rs1[i],t);
2122         }
2123         else if(rs2[i]) {
2124           if(s2>=0) {
2125             if(opcode2[i]&2) emit_neg(s2,t);
2126             else emit_mov(s2,t);
2127           }
2128           else {
2129             emit_loadreg(rs2[i],t);
2130             if(opcode2[i]&2) emit_neg(t,t);
2131           }
2132         }
2133         else emit_zeroreg(t);
2134       }
2135     }
2136   }
2137   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2138     if(rt1[i]) {
2139       signed char s1l,s2l,s1h,s2h,tl,th;
2140       tl=get_reg(i_regs->regmap,rt1[i]);
2141       th=get_reg(i_regs->regmap,rt1[i]|64);
2142       if(tl>=0) {
2143         s1l=get_reg(i_regs->regmap,rs1[i]);
2144         s2l=get_reg(i_regs->regmap,rs2[i]);
2145         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2146         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2147         if(rs1[i]&&rs2[i]) {
2148           assert(s1l>=0);
2149           assert(s2l>=0);
2150           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2151           else emit_adds(s1l,s2l,tl);
2152           if(th>=0) {
2153             #ifdef INVERTED_CARRY
2154             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2155             #else
2156             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2157             #endif
2158             else emit_add(s1h,s2h,th);
2159           }
2160         }
2161         else if(rs1[i]) {
2162           if(s1l>=0) emit_mov(s1l,tl);
2163           else emit_loadreg(rs1[i],tl);
2164           if(th>=0) {
2165             if(s1h>=0) emit_mov(s1h,th);
2166             else emit_loadreg(rs1[i]|64,th);
2167           }
2168         }
2169         else if(rs2[i]) {
2170           if(s2l>=0) {
2171             if(opcode2[i]&2) emit_negs(s2l,tl);
2172             else emit_mov(s2l,tl);
2173           }
2174           else {
2175             emit_loadreg(rs2[i],tl);
2176             if(opcode2[i]&2) emit_negs(tl,tl);
2177           }
2178           if(th>=0) {
2179             #ifdef INVERTED_CARRY
2180             if(s2h>=0) emit_mov(s2h,th);
2181             else emit_loadreg(rs2[i]|64,th);
2182             if(opcode2[i]&2) {
2183               emit_adcimm(-1,th); // x86 has inverted carry flag
2184               emit_not(th,th);
2185             }
2186             #else
2187             if(opcode2[i]&2) {
2188               if(s2h>=0) emit_rscimm(s2h,0,th);
2189               else {
2190                 emit_loadreg(rs2[i]|64,th);
2191                 emit_rscimm(th,0,th);
2192               }
2193             }else{
2194               if(s2h>=0) emit_mov(s2h,th);
2195               else emit_loadreg(rs2[i]|64,th);
2196             }
2197             #endif
2198           }
2199         }
2200         else {
2201           emit_zeroreg(tl);
2202           if(th>=0) emit_zeroreg(th);
2203         }
2204       }
2205     }
2206   }
2207   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2208     if(rt1[i]) {
2209       signed char s1l,s1h,s2l,s2h,t;
2210       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2211       {
2212         t=get_reg(i_regs->regmap,rt1[i]);
2213         //assert(t>=0);
2214         if(t>=0) {
2215           s1l=get_reg(i_regs->regmap,rs1[i]);
2216           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2217           s2l=get_reg(i_regs->regmap,rs2[i]);
2218           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2219           if(rs2[i]==0) // rx<r0
2220           {
2221             assert(s1h>=0);
2222             if(opcode2[i]==0x2a) // SLT
2223               emit_shrimm(s1h,31,t);
2224             else // SLTU (unsigned can not be less than zero)
2225               emit_zeroreg(t);
2226           }
2227           else if(rs1[i]==0) // r0<rx
2228           {
2229             assert(s2h>=0);
2230             if(opcode2[i]==0x2a) // SLT
2231               emit_set_gz64_32(s2h,s2l,t);
2232             else // SLTU (set if not zero)
2233               emit_set_nz64_32(s2h,s2l,t);
2234           }
2235           else {
2236             assert(s1l>=0);assert(s1h>=0);
2237             assert(s2l>=0);assert(s2h>=0);
2238             if(opcode2[i]==0x2a) // SLT
2239               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2240             else // SLTU
2241               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2242           }
2243         }
2244       } else {
2245         t=get_reg(i_regs->regmap,rt1[i]);
2246         //assert(t>=0);
2247         if(t>=0) {
2248           s1l=get_reg(i_regs->regmap,rs1[i]);
2249           s2l=get_reg(i_regs->regmap,rs2[i]);
2250           if(rs2[i]==0) // rx<r0
2251           {
2252             assert(s1l>=0);
2253             if(opcode2[i]==0x2a) // SLT
2254               emit_shrimm(s1l,31,t);
2255             else // SLTU (unsigned can not be less than zero)
2256               emit_zeroreg(t);
2257           }
2258           else if(rs1[i]==0) // r0<rx
2259           {
2260             assert(s2l>=0);
2261             if(opcode2[i]==0x2a) // SLT
2262               emit_set_gz32(s2l,t);
2263             else // SLTU (set if not zero)
2264               emit_set_nz32(s2l,t);
2265           }
2266           else{
2267             assert(s1l>=0);assert(s2l>=0);
2268             if(opcode2[i]==0x2a) // SLT
2269               emit_set_if_less32(s1l,s2l,t);
2270             else // SLTU
2271               emit_set_if_carry32(s1l,s2l,t);
2272           }
2273         }
2274       }
2275     }
2276   }
2277   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2278     if(rt1[i]) {
2279       signed char s1l,s1h,s2l,s2h,th,tl;
2280       tl=get_reg(i_regs->regmap,rt1[i]);
2281       th=get_reg(i_regs->regmap,rt1[i]|64);
2282       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2283       {
2284         assert(tl>=0);
2285         if(tl>=0) {
2286           s1l=get_reg(i_regs->regmap,rs1[i]);
2287           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2288           s2l=get_reg(i_regs->regmap,rs2[i]);
2289           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2290           if(rs1[i]&&rs2[i]) {
2291             assert(s1l>=0);assert(s1h>=0);
2292             assert(s2l>=0);assert(s2h>=0);
2293             if(opcode2[i]==0x24) { // AND
2294               emit_and(s1l,s2l,tl);
2295               emit_and(s1h,s2h,th);
2296             } else
2297             if(opcode2[i]==0x25) { // OR
2298               emit_or(s1l,s2l,tl);
2299               emit_or(s1h,s2h,th);
2300             } else
2301             if(opcode2[i]==0x26) { // XOR
2302               emit_xor(s1l,s2l,tl);
2303               emit_xor(s1h,s2h,th);
2304             } else
2305             if(opcode2[i]==0x27) { // NOR
2306               emit_or(s1l,s2l,tl);
2307               emit_or(s1h,s2h,th);
2308               emit_not(tl,tl);
2309               emit_not(th,th);
2310             }
2311           }
2312           else
2313           {
2314             if(opcode2[i]==0x24) { // AND
2315               emit_zeroreg(tl);
2316               emit_zeroreg(th);
2317             } else
2318             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2319               if(rs1[i]){
2320                 if(s1l>=0) emit_mov(s1l,tl);
2321                 else emit_loadreg(rs1[i],tl);
2322                 if(s1h>=0) emit_mov(s1h,th);
2323                 else emit_loadreg(rs1[i]|64,th);
2324               }
2325               else
2326               if(rs2[i]){
2327                 if(s2l>=0) emit_mov(s2l,tl);
2328                 else emit_loadreg(rs2[i],tl);
2329                 if(s2h>=0) emit_mov(s2h,th);
2330                 else emit_loadreg(rs2[i]|64,th);
2331               }
2332               else{
2333                 emit_zeroreg(tl);
2334                 emit_zeroreg(th);
2335               }
2336             } else
2337             if(opcode2[i]==0x27) { // NOR
2338               if(rs1[i]){
2339                 if(s1l>=0) emit_not(s1l,tl);
2340                 else{
2341                   emit_loadreg(rs1[i],tl);
2342                   emit_not(tl,tl);
2343                 }
2344                 if(s1h>=0) emit_not(s1h,th);
2345                 else{
2346                   emit_loadreg(rs1[i]|64,th);
2347                   emit_not(th,th);
2348                 }
2349               }
2350               else
2351               if(rs2[i]){
2352                 if(s2l>=0) emit_not(s2l,tl);
2353                 else{
2354                   emit_loadreg(rs2[i],tl);
2355                   emit_not(tl,tl);
2356                 }
2357                 if(s2h>=0) emit_not(s2h,th);
2358                 else{
2359                   emit_loadreg(rs2[i]|64,th);
2360                   emit_not(th,th);
2361                 }
2362               }
2363               else {
2364                 emit_movimm(-1,tl);
2365                 emit_movimm(-1,th);
2366               }
2367             }
2368           }
2369         }
2370       }
2371       else
2372       {
2373         // 32 bit
2374         if(tl>=0) {
2375           s1l=get_reg(i_regs->regmap,rs1[i]);
2376           s2l=get_reg(i_regs->regmap,rs2[i]);
2377           if(rs1[i]&&rs2[i]) {
2378             assert(s1l>=0);
2379             assert(s2l>=0);
2380             if(opcode2[i]==0x24) { // AND
2381               emit_and(s1l,s2l,tl);
2382             } else
2383             if(opcode2[i]==0x25) { // OR
2384               emit_or(s1l,s2l,tl);
2385             } else
2386             if(opcode2[i]==0x26) { // XOR
2387               emit_xor(s1l,s2l,tl);
2388             } else
2389             if(opcode2[i]==0x27) { // NOR
2390               emit_or(s1l,s2l,tl);
2391               emit_not(tl,tl);
2392             }
2393           }
2394           else
2395           {
2396             if(opcode2[i]==0x24) { // AND
2397               emit_zeroreg(tl);
2398             } else
2399             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2400               if(rs1[i]){
2401                 if(s1l>=0) emit_mov(s1l,tl);
2402                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2403               }
2404               else
2405               if(rs2[i]){
2406                 if(s2l>=0) emit_mov(s2l,tl);
2407                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2408               }
2409               else emit_zeroreg(tl);
2410             } else
2411             if(opcode2[i]==0x27) { // NOR
2412               if(rs1[i]){
2413                 if(s1l>=0) emit_not(s1l,tl);
2414                 else {
2415                   emit_loadreg(rs1[i],tl);
2416                   emit_not(tl,tl);
2417                 }
2418               }
2419               else
2420               if(rs2[i]){
2421                 if(s2l>=0) emit_not(s2l,tl);
2422                 else {
2423                   emit_loadreg(rs2[i],tl);
2424                   emit_not(tl,tl);
2425                 }
2426               }
2427               else emit_movimm(-1,tl);
2428             }
2429           }
2430         }
2431       }
2432     }
2433   }
2434 }
2435
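// Emit native code for immediate-format instructions (LUI, ADDI/ADDIU,
// DADDI/DADDIU, SLTI/SLTIU, ANDI/ORI/XORI), folding known constants where
// the source was constant-propagated.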
2436 void imm16_assemble(int i,struct regstat *i_regs)
2437 {
2438   if (opcode[i]==0x0f) { // LUI
2439     if(rt1[i]) {
2440       signed char t;
2441       t=get_reg(i_regs->regmap,rt1[i]);
2442       //assert(t>=0);
2443       if(t>=0) {
2444         if(!((i_regs->isconst>>t)&1))
2445           emit_movimm(imm[i]<<16,t);
2446       }
2447     }
2448   }
2449   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2450     if(rt1[i]) {
2451       signed char s,t;
2452       t=get_reg(i_regs->regmap,rt1[i]);
2453       s=get_reg(i_regs->regmap,rs1[i]);
2454       if(rs1[i]) {
2455         //assert(t>=0);
2456         //assert(s>=0);
2457         if(t>=0) {
2458           if(!((i_regs->isconst>>t)&1)) {
2459             if(s<0) {
2460               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2461               emit_addimm(t,imm[i],t);
2462             }else{
2463               if(!((i_regs->wasconst>>s)&1))
2464                 emit_addimm(s,imm[i],t);
2465               else
2466                 emit_movimm(constmap[i][s]+imm[i],t);
2467             }
2468           }
2469         }
2470       } else {
2471         if(t>=0) {
2472           if(!((i_regs->isconst>>t)&1))
2473             emit_movimm(imm[i],t);
2474         }
2475       }
2476     }
2477   }
2478   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2479     if(rt1[i]) {
2480       signed char sh,sl,th,tl;
2481       th=get_reg(i_regs->regmap,rt1[i]|64);
2482       tl=get_reg(i_regs->regmap,rt1[i]);
2483       sh=get_reg(i_regs->regmap,rs1[i]|64);
2484       sl=get_reg(i_regs->regmap,rs1[i]);
2485       if(tl>=0) {
2486         if(rs1[i]) {
2487           assert(sh>=0);
2488           assert(sl>=0);
2489           if(th>=0) {
2490             emit_addimm64_32(sh,sl,imm[i],th,tl);
2491           }
2492           else {
2493             emit_addimm(sl,imm[i],tl);
2494           }
2495         } else {
2496           emit_movimm(imm[i],tl);
2497           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2498         }
2499       }
2500     }
2501   }
2502   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2503     if(rt1[i]) {
2504       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2505       signed char sh,sl,t;
2506       t=get_reg(i_regs->regmap,rt1[i]);
2507       sh=get_reg(i_regs->regmap,rs1[i]|64);
2508       sl=get_reg(i_regs->regmap,rs1[i]);
2509       //assert(t>=0);
2510       if(t>=0) {
2511         if(rs1[i]>0) {
2512           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2513           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2514             if(opcode[i]==0x0a) { // SLTI
2515               if(sl<0) {
2516                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2517                 emit_slti32(t,imm[i],t);
2518               }else{
2519                 emit_slti32(sl,imm[i],t);
2520               }
2521             }
2522             else { // SLTIU
2523               if(sl<0) {
2524                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2525                 emit_sltiu32(t,imm[i],t);
2526               }else{
2527                 emit_sltiu32(sl,imm[i],t);
2528               }
2529             }
2530           }else{ // 64-bit
2531             assert(sl>=0);
2532             if(opcode[i]==0x0a) // SLTI
2533               emit_slti64_32(sh,sl,imm[i],t);
2534             else // SLTIU
2535               emit_sltiu64_32(sh,sl,imm[i],t);
2536           }
2537         }else{
2538           // SLTI(U) with r0 is just stupid,
2539           // nonetheless examples can be found
2540           if(opcode[i]==0x0a) { // SLTI
2541             if(0<imm[i]) emit_movimm(1,t);
2542             else emit_zeroreg(t);
2543           } else // SLTIU
2544           {
2545             if(imm[i]) emit_movimm(1,t);
2546             else emit_zeroreg(t);
2547           }
2548         }
2549       }
2550     }
2551   }
2552   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2553     if(rt1[i]) {
2554       signed char sh,sl,th,tl;
2555       th=get_reg(i_regs->regmap,rt1[i]|64);
2556       tl=get_reg(i_regs->regmap,rt1[i]);
2557       sh=get_reg(i_regs->regmap,rs1[i]|64);
2558       sl=get_reg(i_regs->regmap,rs1[i]);
2559       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2560         if(opcode[i]==0x0c) //ANDI
2561         {
2562           if(rs1[i]) {
2563             if(sl<0) {
2564               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2565               emit_andimm(tl,imm[i],tl);
2566             }else{
2567               if(!((i_regs->wasconst>>sl)&1))
2568                 emit_andimm(sl,imm[i],tl);
2569               else
2570                 emit_movimm(constmap[i][sl]&imm[i],tl);
2571             }
2572           }
2573           else
2574             emit_zeroreg(tl);
2575           if(th>=0) emit_zeroreg(th);
2576         }
2577         else
2578         {
2579           if(rs1[i]) {
2580             if(sl<0) {
2581               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2582             }
2583             if(th>=0) {
2584               if(sh<0) {
2585                 emit_loadreg(rs1[i]|64,th);
2586               }else{
2587                 emit_mov(sh,th);
2588               }
2589             }
2590             if(opcode[i]==0x0d) //ORI
2591             if(sl<0) {
2592               emit_orimm(tl,imm[i],tl);
2593             }else{
2594               if(!((i_regs->wasconst>>sl)&1))
2595                 emit_orimm(sl,imm[i],tl);
2596               else
2597                 emit_movimm(constmap[i][sl]|imm[i],tl);
2598             }
2599             if(opcode[i]==0x0e) //XORI
2600             if(sl<0) {
2601               emit_xorimm(tl,imm[i],tl);
2602             }else{
2603               if(!((i_regs->wasconst>>sl)&1))
2604                 emit_xorimm(sl,imm[i],tl);
2605               else
2606                 emit_movimm(constmap[i][sl]^imm[i],tl);
2607             }
2608           }
2609           else {
2610             emit_movimm(imm[i],tl);
2611             if(th>=0) emit_zeroreg(th);
2612           }
2613         }
2614       }
2615     }
2616   }
2617 }
2618
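// Emit native code for shift-by-immediate instructions: SLL/SRL/SRA and the
// 64-bit DSLL/DSRL/DSRA forms, including the +32 variants.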
2619 void shiftimm_assemble(int i,struct regstat *i_regs)
2620 {
2621   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2622   {
2623     if(rt1[i]) {
2624       signed char s,t;
2625       t=get_reg(i_regs->regmap,rt1[i]);
2626       s=get_reg(i_regs->regmap,rs1[i]);
2627       //assert(t>=0);
2628       if(t>=0){
2629         if(rs1[i]==0)
2630         {
2631           emit_zeroreg(t);
2632         }
2633         else
2634         {
2635           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2636           if(imm[i]) {
2637             if(opcode2[i]==0) // SLL
2638             {
2639               emit_shlimm(s<0?t:s,imm[i],t);
2640             }
2641             if(opcode2[i]==2) // SRL
2642             {
2643               emit_shrimm(s<0?t:s,imm[i],t);
2644             }
2645             if(opcode2[i]==3) // SRA
2646             {
2647               emit_sarimm(s<0?t:s,imm[i],t);
2648             }
2649           }else{
2650             // Shift by zero
2651             if(s>=0 && s!=t) emit_mov(s,t);
2652           }
2653         }
2654       }
2655       //emit_storereg(rt1[i],t); //DEBUG
2656     }
2657   }
2658   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2659   {
2660     if(rt1[i]) {
2661       signed char sh,sl,th,tl;
2662       th=get_reg(i_regs->regmap,rt1[i]|64);
2663       tl=get_reg(i_regs->regmap,rt1[i]);
2664       sh=get_reg(i_regs->regmap,rs1[i]|64);
2665       sl=get_reg(i_regs->regmap,rs1[i]);
2666       if(tl>=0) {
2667         if(rs1[i]==0)
2668         {
2669           emit_zeroreg(tl);
2670           if(th>=0) emit_zeroreg(th);
2671         }
2672         else
2673         {
2674           assert(sl>=0);
2675           assert(sh>=0);
2676           if(imm[i]) {
2677             if(opcode2[i]==0x38) // DSLL
2678             {
2679               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2680               emit_shlimm(sl,imm[i],tl);
2681             }
2682             if(opcode2[i]==0x3a) // DSRL
2683             {
2684               emit_shrdimm(sl,sh,imm[i],tl);
2685               if(th>=0) emit_shrimm(sh,imm[i],th);
2686             }
2687             if(opcode2[i]==0x3b) // DSRA
2688             {
2689               emit_shrdimm(sl,sh,imm[i],tl);
2690               if(th>=0) emit_sarimm(sh,imm[i],th);
2691             }
2692           }else{
2693             // Shift by zero
2694             if(sl!=tl) emit_mov(sl,tl);
2695             if(th>=0&&sh!=th) emit_mov(sh,th);
2696           }
2697         }
2698       }
2699     }
2700   }
2701   if(opcode2[i]==0x3c) // DSLL32
2702   {
2703     if(rt1[i]) {
2704       signed char sl,tl,th;
2705       tl=get_reg(i_regs->regmap,rt1[i]);
2706       th=get_reg(i_regs->regmap,rt1[i]|64);
2707       sl=get_reg(i_regs->regmap,rs1[i]);
2708       if(th>=0||tl>=0){
2709         assert(tl>=0);
2710         assert(th>=0);
2711         assert(sl>=0);
2712         emit_mov(sl,th);
2713         emit_zeroreg(tl);
2714         if(imm[i]>32)
2715         {
2716           emit_shlimm(th,imm[i]&31,th);
2717         }
2718       }
2719     }
2720   }
2721   if(opcode2[i]==0x3e) // DSRL32
2722   {
2723     if(rt1[i]) {
2724       signed char sh,tl,th;
2725       tl=get_reg(i_regs->regmap,rt1[i]);
2726       th=get_reg(i_regs->regmap,rt1[i]|64);
2727       sh=get_reg(i_regs->regmap,rs1[i]|64);
2728       if(tl>=0){
2729         assert(sh>=0);
2730         emit_mov(sh,tl);
2731         if(th>=0) emit_zeroreg(th);
2732         if(imm[i]>32)
2733         {
2734           emit_shrimm(tl,imm[i]&31,tl);
2735         }
2736       }
2737     }
2738   }
2739   if(opcode2[i]==0x3f) // DSRA32
2740   {
2741     if(rt1[i]) {
2742       signed char sh,tl;
2743       tl=get_reg(i_regs->regmap,rt1[i]);
2744       sh=get_reg(i_regs->regmap,rs1[i]|64);
2745       if(tl>=0){
2746         assert(sh>=0);
2747         emit_mov(sh,tl);
2748         if(imm[i]>32)
2749         {
2750           emit_sarimm(tl,imm[i]&31,tl);
2751         }
2752       }
2753     }
2754   }
2755 }
2756
2757 #ifndef shift_assemble
2758 void shift_assemble(int i,struct regstat *i_regs)
2759 {
2760   printf("Need shift_assemble for this architecture.\n");
2761   exit(1);
2762 }
2763 #endif
2764
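// Emit native code for loads. The fast path reads RDRAM directly after a range
// check; addresses outside RAM (I/O etc.) go through a stub or an inline read.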
2765 void load_assemble(int i,struct regstat *i_regs)
2766 {
2767   int s,th,tl,addr,map=-1;
2768   int offset;
2769   int jaddr=0;
2770   int memtarget=0,c=0;
2771   int fastload_reg_override=0;
2772   u_int hr,reglist=0;
2773   th=get_reg(i_regs->regmap,rt1[i]|64);
2774   tl=get_reg(i_regs->regmap,rt1[i]);
2775   s=get_reg(i_regs->regmap,rs1[i]);
2776   offset=imm[i];
2777   for(hr=0;hr<HOST_REGS;hr++) {
2778     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2779   }
2780   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2781   if(s>=0) {
2782     c=(i_regs->wasconst>>s)&1;
2783     if (c) {
2784       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2785       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2786     }
2787   }
2788   //printf("load_assemble: c=%d\n",c);
2789   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2790   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2791 #ifdef PCSX
2792   if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
2793     ||rt1[i]==0) {
2794       // could be FIFO, must perform the read
2795       // ||dummy read
2796       assem_debug("(forced read)\n");
2797       tl=get_reg(i_regs->regmap,-1);
2798       assert(tl>=0);
2799   }
2800 #endif
2801   if(offset||s<0||c) addr=tl;
2802   else addr=s;
2803   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2804  if(tl>=0) {
2805   //printf("load_assemble: c=%d\n",c);
2806   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2807   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2808   reglist&=~(1<<tl);
2809   if(th>=0) reglist&=~(1<<th);
2810   if(!using_tlb) {
2811     if(!c) {
2812       #ifdef RAM_OFFSET
2813       map=get_reg(i_regs->regmap,ROREG);
2814       if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2815       #endif
2816 //#define R29_HACK 1
2817       #ifdef R29_HACK
2818       // Strmnnrmn's speed hack
2819       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2820       #endif
2821       {
2822         #ifdef PCSX
2823         if(sp_in_mirror&&rs1[i]==29) {
2824           emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
2825           emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
2826           fastload_reg_override=HOST_TEMPREG;
2827         }
2828         else
2829         #endif
2830         emit_cmpimm(addr,RAM_SIZE);
2831         jaddr=(int)out;
2832         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2833         // Hint to branch predictor that the branch is unlikely to be taken
2834         if(rs1[i]>=28)
2835           emit_jno_unlikely(0);
2836         else
2837         #endif
2838         emit_jno(0);
2839       }
2840     }
2841   }else{ // using tlb
2842     int x=0;
2843     if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2844     if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2845     map=get_reg(i_regs->regmap,TLREG);
2846     assert(map>=0);
2847     reglist&=~(1<<map);
2848     map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2849     do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2850   }
2851   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2852   if (opcode[i]==0x20) { // LB
2853     if(!c||memtarget) {
2854       if(!dummy) {
2855         #ifdef HOST_IMM_ADDR32
2856         if(c)
2857           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2858         else
2859         #endif
2860         {
2861           //emit_xorimm(addr,3,tl);
2862           //gen_tlb_addr_r(tl,map);
2863           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2864           int x=0,a=tl;
2865 #ifdef BIG_ENDIAN_MIPS
2866           if(!c) emit_xorimm(addr,3,tl);
2867           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2868 #else
2869           if(!c) a=addr;
2870 #endif
2871           if(fastload_reg_override) a=fastload_reg_override;
2872
2873           emit_movsbl_indexed_tlb(x,a,map,tl);
2874         }
2875       }
2876       if(jaddr)
2877         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2878     }
2879     else
2880       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2881   }
2882   if (opcode[i]==0x21) { // LH
2883     if(!c||memtarget) {
2884       if(!dummy) {
2885         #ifdef HOST_IMM_ADDR32
2886         if(c)
2887           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2888         else
2889         #endif
2890         {
2891           int x=0,a=tl;
2892 #ifdef BIG_ENDIAN_MIPS
2893           if(!c) emit_xorimm(addr,2,tl);
2894           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2895 #else
2896           if(!c) a=addr;
2897 #endif
2898           if(fastload_reg_override) a=fastload_reg_override;
2899           //#ifdef
2900           //emit_movswl_indexed_tlb(x,tl,map,tl);
2901           //else
2902           if(map>=0) {
2903             gen_tlb_addr_r(a,map);
2904             emit_movswl_indexed(x,a,tl);
2905           }else{
2906             #ifdef RAM_OFFSET
2907             emit_movswl_indexed(x,a,tl);
2908             #else
2909             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2910             #endif
2911           }
2912         }
2913       }
2914       if(jaddr)
2915         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2916     }
2917     else
2918       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2919   }
2920   if (opcode[i]==0x23) { // LW
2921     if(!c||memtarget) {
2922       if(!dummy) {
2923         int a=addr;
2924         if(fastload_reg_override) a=fastload_reg_override;
2925         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2926         #ifdef HOST_IMM_ADDR32
2927         if(c)
2928           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2929         else
2930         #endif
2931         emit_readword_indexed_tlb(0,a,map,tl);
2932       }
2933       if(jaddr)
2934         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2935     }
2936     else
2937       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2938   }
2939   if (opcode[i]==0x24) { // LBU
2940     if(!c||memtarget) {
2941       if(!dummy) {
2942         #ifdef HOST_IMM_ADDR32
2943         if(c)
2944           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2945         else
2946         #endif
2947         {
2948           //emit_xorimm(addr,3,tl);
2949           //gen_tlb_addr_r(tl,map);
2950           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2951           int x=0,a=tl;
2952 #ifdef BIG_ENDIAN_MIPS
2953           if(!c) emit_xorimm(addr,3,tl);
2954           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2955 #else
2956           if(!c) a=addr;
2957 #endif
2958           if(fastload_reg_override) a=fastload_reg_override;
2959
2960           emit_movzbl_indexed_tlb(x,a,map,tl);
2961         }
2962       }
2963       if(jaddr)
2964         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2965     }
2966     else
2967       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2968   }
2969   if (opcode[i]==0x25) { // LHU
2970     if(!c||memtarget) {
2971       if(!dummy) {
2972         #ifdef HOST_IMM_ADDR32
2973         if(c)
2974           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2975         else
2976         #endif
2977         {
2978           int x=0,a=tl;
2979 #ifdef BIG_ENDIAN_MIPS
2980           if(!c) emit_xorimm(addr,2,tl);
2981           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2982 #else
2983           if(!c) a=addr;
2984 #endif
2985           if(fastload_reg_override) a=fastload_reg_override;
2986           //#ifdef
2987           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2988           //#else
2989           if(map>=0) {
2990             gen_tlb_addr_r(a,map);
2991             emit_movzwl_indexed(x,a,tl);
2992           }else{
2993             #ifdef RAM_OFFSET
2994             emit_movzwl_indexed(x,a,tl);
2995             #else
2996             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
2997             #endif
2998           }
2999         }
3000       }
3001       if(jaddr)
3002         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3003     }
3004     else
3005       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3006   }
3007   if (opcode[i]==0x27) { // LWU
3008     assert(th>=0);
3009     if(!c||memtarget) {
3010       if(!dummy) {
3011         int a=addr;
3012         if(fastload_reg_override) a=fastload_reg_override;
3013         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
3014         #ifdef HOST_IMM_ADDR32
3015         if(c)
3016           emit_readword_tlb(constmap[i][s]+offset,map,tl);
3017         else
3018         #endif
3019         emit_readword_indexed_tlb(0,a,map,tl);
3020       }
3021       if(jaddr)
3022         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3023     }
3024     else {
3025       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3026     }
3027     emit_zeroreg(th);
3028   }
3029   if (opcode[i]==0x37) { // LD
3030     if(!c||memtarget) {
3031       if(!dummy) {
3032         int a=addr;
3033         if(fastload_reg_override) a=fastload_reg_override;
3034         //gen_tlb_addr_r(tl,map);
3035         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
3036         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
3037         #ifdef HOST_IMM_ADDR32
3038         if(c)
3039           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3040         else
3041         #endif
3042         emit_readdword_indexed_tlb(0,a,map,th,tl);
3043       }
3044       if(jaddr)
3045         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3046     }
3047     else
3048       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3049   }
3050  }
3051   //emit_storereg(rt1[i],tl); // DEBUG
3052   //if(opcode[i]==0x23)
3053   //if(opcode[i]==0x24)
3054   //if(opcode[i]==0x23||opcode[i]==0x24)
3055   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
3056   {
3057     //emit_pusha();
3058     save_regs(0x100f);
3059         emit_readword((int)&last_count,ECX);
3060         #ifdef __i386__
3061         if(get_reg(i_regs->regmap,CCREG)<0)
3062           emit_loadreg(CCREG,HOST_CCREG);
3063         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3064         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3065         emit_writeword(HOST_CCREG,(int)&Count);
3066         #endif
3067         #ifdef __arm__
3068         if(get_reg(i_regs->regmap,CCREG)<0)
3069           emit_loadreg(CCREG,0);
3070         else
3071           emit_mov(HOST_CCREG,0);
3072         emit_add(0,ECX,0);
3073         emit_addimm(0,2*ccadj[i],0);
3074         emit_writeword(0,(int)&Count);
3075         #endif
3076     emit_call((int)memdebug);
3077     //emit_popa();
3078     restore_regs(0x100f);
3079   }/**/
3080 }
3081
3082 #ifndef loadlr_assemble
3083 void loadlr_assemble(int i,struct regstat *i_regs)
3084 {
3085   printf("Need loadlr_assemble for this architecture.\n");
3086   exit(1);
3087 }
3088 #endif
3089
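// Emit native code for stores (SB/SH/SW/SD). Besides the range check, stores
// also test invalid_code so that writes into translated code invalidate the
// affected blocks.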
3090 void store_assemble(int i,struct regstat *i_regs)
3091 {
3092   int s,th,tl,map=-1;
3093   int addr,temp;
3094   int offset;
3095   int jaddr=0,jaddr2,type;
3096   int memtarget=0,c=0;
3097   int agr=AGEN1+(i&1);
3098   int faststore_reg_override=0;
3099   u_int hr,reglist=0;
3100   th=get_reg(i_regs->regmap,rs2[i]|64);
3101   tl=get_reg(i_regs->regmap,rs2[i]);
3102   s=get_reg(i_regs->regmap,rs1[i]);
3103   temp=get_reg(i_regs->regmap,agr);
3104   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3105   offset=imm[i];
3106   if(s>=0) {
3107     c=(i_regs->wasconst>>s)&1;
3108     if(c) {
3109       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3110       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3111     }
3112   }
3113   assert(tl>=0);
3114   assert(temp>=0);
3115   for(hr=0;hr<HOST_REGS;hr++) {
3116     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3117   }
3118   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3119   if(offset||s<0||c) addr=temp;
3120   else addr=s;
3121   if(!using_tlb) {
3122     if(!c) {
3123       #ifdef PCSX
3124       if(sp_in_mirror&&rs1[i]==29) {
3125         emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
3126         emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
3127         faststore_reg_override=HOST_TEMPREG;
3128       }
3129       else
3130       #endif
3131       #ifdef R29_HACK
3132       // Strmnnrmn's speed hack
3133       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3134       #endif
3135       emit_cmpimm(addr,RAM_SIZE);
3136       #ifdef DESTRUCTIVE_SHIFT
3137       if(s==addr) emit_mov(s,temp);
3138       #endif
3139       #ifdef R29_HACK
3140       memtarget=1;
3141       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3142       #endif
3143       {
3144         jaddr=(int)out;
3145         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
3146         // Hint to branch predictor that the branch is unlikely to be taken
3147         if(rs1[i]>=28)
3148           emit_jno_unlikely(0);
3149         else
3150         #endif
3151         emit_jno(0);
3152       }
3153     }
3154   }else{ // using tlb
3155     int x=0;
3156     if (opcode[i]==0x28) x=3; // SB
3157     if (opcode[i]==0x29) x=2; // SH
3158     map=get_reg(i_regs->regmap,TLREG);
3159     assert(map>=0);
3160     reglist&=~(1<<map);
3161     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
3162     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3163   }
3164
3165   if (opcode[i]==0x28) { // SB
3166     if(!c||memtarget) {
3167       int x=0,a=temp;
3168 #ifdef BIG_ENDIAN_MIPS
3169       if(!c) emit_xorimm(addr,3,temp);
3170       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3171 #else
3172       if(!c) a=addr;
3173 #endif
3174       if(faststore_reg_override) a=faststore_reg_override;
3175       //gen_tlb_addr_w(temp,map);
3176       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3177       emit_writebyte_indexed_tlb(tl,x,a,map,a);
3178     }
3179     type=STOREB_STUB;
3180   }
3181   if (opcode[i]==0x29) { // SH
3182     if(!c||memtarget) {
3183       int x=0,a=temp;
3184 #ifdef BIG_ENDIAN_MIPS
3185       if(!c) emit_xorimm(addr,2,temp);
3186       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3187 #else
3188       if(!c) a=addr;
3189 #endif
3190       if(faststore_reg_override) a=faststore_reg_override;
3191       //#ifdef
3192       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3193       //#else
3194       if(map>=0) {
3195         gen_tlb_addr_w(a,map);
3196         emit_writehword_indexed(tl,x,a);
3197       }else
3198         emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
3199     }
3200     type=STOREH_STUB;
3201   }
3202   if (opcode[i]==0x2B) { // SW
3203     if(!c||memtarget) {
3204       int a=addr;
3205       if(faststore_reg_override) a=faststore_reg_override;
3206       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3207       emit_writeword_indexed_tlb(tl,0,a,map,temp);
3208     }
3209     type=STOREW_STUB;
3210   }
3211   if (opcode[i]==0x3F) { // SD
3212     if(!c||memtarget) {
3213       int a=addr;
3214       if(faststore_reg_override) a=faststore_reg_override;
3215       if(rs2[i]) {
3216         assert(th>=0);
3217         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3218         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3219         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
3220       }else{
3221         // Store zero
3222         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3223         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3224         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
3225       }
3226     }
3227     type=STORED_STUB;
3228   }
3229   if(!using_tlb) {
3230     if(!c||memtarget) {
3231       #ifdef DESTRUCTIVE_SHIFT
3232       // The x86 shift operation is 'destructive'; it overwrites the
3233       // source register, so we need to make a copy first and use that.
3234       addr=temp;
3235       #endif
3236       #if defined(HOST_IMM8)
3237       int ir=get_reg(i_regs->regmap,INVCP);
3238       assert(ir>=0);
3239       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3240       #else
3241       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3242       #endif
3243       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3244       emit_callne(invalidate_addr_reg[addr]);
3245       #else
3246       jaddr2=(int)out;
3247       emit_jne(0);
3248       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3249       #endif
3250     }
3251   }
3252   if(jaddr) {
3253     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3254   } else if(c&&!memtarget) {
3255     inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
3256   }
3257   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3258   //if(opcode[i]==0x2B || opcode[i]==0x28)
3259   //if(opcode[i]==0x2B || opcode[i]==0x29)
3260   //if(opcode[i]==0x2B)
3261   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3262   {
3263     #ifdef __i386__
3264     emit_pusha();
3265     #endif
3266     #ifdef __arm__
3267     save_regs(0x100f);
3268     #endif
3269         emit_readword((int)&last_count,ECX);
3270         #ifdef __i386__
3271         if(get_reg(i_regs->regmap,CCREG)<0)
3272           emit_loadreg(CCREG,HOST_CCREG);
3273         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3274         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3275         emit_writeword(HOST_CCREG,(int)&Count);
3276         #endif
3277         #ifdef __arm__
3278         if(get_reg(i_regs->regmap,CCREG)<0)
3279           emit_loadreg(CCREG,0);
3280         else
3281           emit_mov(HOST_CCREG,0);
3282         emit_add(0,ECX,0);
3283         emit_addimm(0,2*ccadj[i],0);
3284         emit_writeword(0,(int)&Count);
3285         #endif
3286     emit_call((int)memdebug);
3287     #ifdef __i386__
3288     emit_popa();
3289     #endif
3290     #ifdef __arm__
3291     restore_regs(0x100f);
3292     #endif
3293   }/**/
3294 }
3295
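// Emit native code for unaligned stores (SWL/SWR/SDL/SDR): branch on the low
// address bits and write the appropriate partial word/doubleword in each case.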
3296 void storelr_assemble(int i,struct regstat *i_regs)
3297 {
3298   int s,th,tl;
3299   int temp;
3300   int temp2;
3301   int offset;
3302   int jaddr=0,jaddr2;
3303   int case1,case2,case3;
3304   int done0,done1,done2;
3305   int memtarget=0,c=0;
3306   int agr=AGEN1+(i&1);
3307   u_int hr,reglist=0;
3308   th=get_reg(i_regs->regmap,rs2[i]|64);
3309   tl=get_reg(i_regs->regmap,rs2[i]);
3310   s=get_reg(i_regs->regmap,rs1[i]);
3311   temp=get_reg(i_regs->regmap,agr);
3312   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3313   offset=imm[i];
3314   if(s>=0) {
3315     c=(i_regs->isconst>>s)&1;
3316     if(c) {
3317       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3318       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3319     }
3320   }
3321   assert(tl>=0);
3322   for(hr=0;hr<HOST_REGS;hr++) {
3323     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3324   }
3325   assert(temp>=0);
3326   if(!using_tlb) {
3327     if(!c) {
3328       emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3329       if(!offset&&s!=temp) emit_mov(s,temp);
3330       jaddr=(int)out;
3331       emit_jno(0);
3332     }
3333     else
3334     {
3335       if(!memtarget||!rs1[i]) {
3336         jaddr=(int)out;
3337         emit_jmp(0);
3338       }
3339     }
3340     #ifdef RAM_OFFSET
3341     int map=get_reg(i_regs->regmap,ROREG);
3342     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3343     gen_tlb_addr_w(temp,map);
3344     #else
3345     if((u_int)rdram!=0x80000000) 
3346       emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3347     #endif
3348   }else{ // using tlb
3349     int map=get_reg(i_regs->regmap,TLREG);
3350     assert(map>=0);
3351     reglist&=~(1<<map);
3352     map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3353     if(!c&&!offset&&s>=0) emit_mov(s,temp);
3354     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3355     if(!jaddr&&!memtarget) {
3356       jaddr=(int)out;
3357       emit_jmp(0);
3358     }
3359     gen_tlb_addr_w(temp,map);
3360   }
3361
3362   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3363     temp2=get_reg(i_regs->regmap,FTEMP);
3364     if(!rs2[i]) temp2=th=tl;
3365   }
3366
3367 #ifndef BIG_ENDIAN_MIPS
3368     emit_xorimm(temp,3,temp);
3369 #endif
3370   emit_testimm(temp,2);
3371   case2=(int)out;
3372   emit_jne(0);
3373   emit_testimm(temp,1);
3374   case1=(int)out;
3375   emit_jne(0);
3376   // 0
3377   if (opcode[i]==0x2A) { // SWL
3378     emit_writeword_indexed(tl,0,temp);
3379   }
3380   if (opcode[i]==0x2E) { // SWR
3381     emit_writebyte_indexed(tl,3,temp);
3382   }
3383   if (opcode[i]==0x2C) { // SDL
3384     emit_writeword_indexed(th,0,temp);
3385     if(rs2[i]) emit_mov(tl,temp2);
3386   }
3387   if (opcode[i]==0x2D) { // SDR
3388     emit_writebyte_indexed(tl,3,temp);
3389     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3390   }
3391   done0=(int)out;
3392   emit_jmp(0);
3393   // 1
3394   set_jump_target(case1,(int)out);
3395   if (opcode[i]==0x2A) { // SWL
3396     // Write 3 msb into three least significant bytes
3397     if(rs2[i]) emit_rorimm(tl,8,tl);
3398     emit_writehword_indexed(tl,-1,temp);
3399     if(rs2[i]) emit_rorimm(tl,16,tl);
3400     emit_writebyte_indexed(tl,1,temp);
3401     if(rs2[i]) emit_rorimm(tl,8,tl);
3402   }
3403   if (opcode[i]==0x2E) { // SWR
3404     // Write two lsb into two most significant bytes
3405     emit_writehword_indexed(tl,1,temp);
3406   }
3407   if (opcode[i]==0x2C) { // SDL
3408     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3409     // Write 3 msb into three least significant bytes
3410     if(rs2[i]) emit_rorimm(th,8,th);
3411     emit_writehword_indexed(th,-1,temp);
3412     if(rs2[i]) emit_rorimm(th,16,th);
3413     emit_writebyte_indexed(th,1,temp);
3414     if(rs2[i]) emit_rorimm(th,8,th);
3415   }
3416   if (opcode[i]==0x2D) { // SDR
3417     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3418     // Write two lsb into two most significant bytes
3419     emit_writehword_indexed(tl,1,temp);
3420   }
3421   done1=(int)out;
3422   emit_jmp(0);
3423   // 2
3424   set_jump_target(case2,(int)out);
3425   emit_testimm(temp,1);
3426   case3=(int)out;
3427   emit_jne(0);
3428   if (opcode[i]==0x2A) { // SWL
3429     // Write two msb into two least significant bytes
3430     if(rs2[i]) emit_rorimm(tl,16,tl);
3431     emit_writehword_indexed(tl,-2,temp);
3432     if(rs2[i]) emit_rorimm(tl,16,tl);
3433   }
3434   if (opcode[i]==0x2E) { // SWR
3435     // Write 3 lsb into three most significant bytes
3436     emit_writebyte_indexed(tl,-1,temp);
3437     if(rs2[i]) emit_rorimm(tl,8,tl);
3438     emit_writehword_indexed(tl,0,temp);
3439     if(rs2[i]) emit_rorimm(tl,24,tl);
3440   }
3441   if (opcode[i]==0x2C) { // SDL
3442     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3443     // Write two msb into two least significant bytes
3444     if(rs2[i]) emit_rorimm(th,16,th);
3445     emit_writehword_indexed(th,-2,temp);
3446     if(rs2[i]) emit_rorimm(th,16,th);
3447   }
3448   if (opcode[i]==0x2D) { // SDR
3449     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3450     // Write 3 lsb into three most significant bytes
3451     emit_writebyte_indexed(tl,-1,temp);
3452     if(rs2[i]) emit_rorimm(tl,8,tl);
3453     emit_writehword_indexed(tl,0,temp);
3454     if(rs2[i]) emit_rorimm(tl,24,tl);
3455   }
3456   done2=(int)out;
3457   emit_jmp(0);
3458   // 3
3459   set_jump_target(case3,(int)out);
3460   if (opcode[i]==0x2A) { // SWL
3461     // Write msb into least significant byte
3462     if(rs2[i]) emit_rorimm(tl,24,tl);
3463     emit_writebyte_indexed(tl,-3,temp);
3464     if(rs2[i]) emit_rorimm(tl,8,tl);
3465   }
3466   if (opcode[i]==0x2E) { // SWR
3467     // Write entire word
3468     emit_writeword_indexed(tl,-3,temp);
3469   }
3470   if (opcode[i]==0x2C) { // SDL
3471     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3472     // Write msb into least significant byte
3473     if(rs2[i]) emit_rorimm(th,24,th);
3474     emit_writebyte_indexed(th,-3,temp);
3475     if(rs2[i]) emit_rorimm(th,8,th);
3476   }
3477   if (opcode[i]==0x2D) { // SDR
3478     if(rs2[i]) emit_mov(th,temp2);
3479     // Write entire word
3480     emit_writeword_indexed(tl,-3,temp);
3481   }
3482   set_jump_target(done0,(int)out);
3483   set_jump_target(done1,(int)out);
3484   set_jump_target(done2,(int)out);
3485   if (opcode[i]==0x2C) { // SDL
3486     emit_testimm(temp,4);
3487     done0=(int)out;
3488     emit_jne(0);
3489     emit_andimm(temp,~3,temp);
3490     emit_writeword_indexed(temp2,4,temp);
3491     set_jump_target(done0,(int)out);
3492   }
3493   if (opcode[i]==0x2D) { // SDR
3494     emit_testimm(temp,4);
3495     done0=(int)out;
3496     emit_jeq(0);
3497     emit_andimm(temp,~3,temp);
3498     emit_writeword_indexed(temp2,-4,temp);
3499     set_jump_target(done0,(int)out);
3500   }
3501   if(!c||!memtarget)
3502     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3503   if(!using_tlb) {
3504     #ifdef RAM_OFFSET
3505     int map=get_reg(i_regs->regmap,ROREG);
3506     if(map<0) map=HOST_TEMPREG;
3507     gen_orig_addr_w(temp,map);
3508     #else
3509     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3510     #endif
3511     #if defined(HOST_IMM8)
3512     int ir=get_reg(i_regs->regmap,INVCP);
3513     assert(ir>=0);
3514     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3515     #else
3516     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3517     #endif
3518     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3519     emit_callne(invalidate_addr_reg[temp]);
3520     #else
3521     jaddr2=(int)out;
3522     emit_jne(0);
3523     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3524     #endif
3525   }
3526   /*
3527     emit_pusha();
3528     //save_regs(0x100f);
3529         emit_readword((int)&last_count,ECX);
3530         if(get_reg(i_regs->regmap,CCREG)<0)
3531           emit_loadreg(CCREG,HOST_CCREG);
3532         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3533         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3534         emit_writeword(HOST_CCREG,(int)&Count);
3535     emit_call((int)memdebug);
3536     emit_popa();
3537     //restore_regs(0x100f);
3538   /**/
3539 }
3540
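// Assemble COP1 loads/stores (LWC1/LDC1/SWC1/SDC1): check the COP1-usable bit
// in the status register first (FP_STUB if clear), then transfer between
// memory and the reg_cop1_simple/reg_cop1_double pointers.  With DISABLE_COP1
// this just falls through to cop1_unusable().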
3541 void c1ls_assemble(int i,struct regstat *i_regs)
3542 {
3543 #ifndef DISABLE_COP1
3544   int s,th,tl;
3545   int temp,ar;
3546   int map=-1;
3547   int offset;
3548   int c=0;
3549   int jaddr,jaddr2=0,jaddr3,type;
3550   int agr=AGEN1+(i&1);
3551   u_int hr,reglist=0;
3552   th=get_reg(i_regs->regmap,FTEMP|64);
3553   tl=get_reg(i_regs->regmap,FTEMP);
3554   s=get_reg(i_regs->regmap,rs1[i]);
3555   temp=get_reg(i_regs->regmap,agr);
3556   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3557   offset=imm[i];
3558   assert(tl>=0);
3559   assert(rs1[i]>0);
3560   assert(temp>=0);
3561   for(hr=0;hr<HOST_REGS;hr++) {
3562     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3563   }
3564   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3565   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3566   {
3567     // Loads use a temporary register which we need to save
3568     reglist|=1<<temp;
3569   }
3570   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3571     ar=temp;
3572   else // LWC1/LDC1
3573     ar=tl;
3574   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3575   //else c=(i_regs->wasconst>>s)&1;
3576   if(s>=0) c=(i_regs->wasconst>>s)&1;
3577   // Check cop1 unusable
3578   if(!cop1_usable) {
3579     signed char rs=get_reg(i_regs->regmap,CSREG);
3580     assert(rs>=0);
3581     emit_testimm(rs,0x20000000);
3582     jaddr=(int)out;
3583     emit_jeq(0);
3584     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3585     cop1_usable=1;
3586   }
3587   if (opcode[i]==0x39) { // SWC1 (get float address)
3588     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3589   }
3590   if (opcode[i]==0x3D) { // SDC1 (get double address)
3591     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3592   }
3593   // Generate address + offset
3594   if(!using_tlb) {
3595     if(!c)
3596       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3597   }
3598   else
3599   {
3600     map=get_reg(i_regs->regmap,TLREG);
3601     assert(map>=0);
3602     reglist&=~(1<<map);
3603     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3604       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3605     }
3606     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3607       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3608     }
3609   }
3610   if (opcode[i]==0x39) { // SWC1 (read float)
3611     emit_readword_indexed(0,tl,tl);
3612   }
3613   if (opcode[i]==0x3D) { // SDC1 (read double)
3614     emit_readword_indexed(4,tl,th);
3615     emit_readword_indexed(0,tl,tl);
3616   }
3617   if (opcode[i]==0x31) { // LWC1 (get target address)
3618     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3619   }
3620   if (opcode[i]==0x35) { // LDC1 (get target address)
3621     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3622   }
3623   if(!using_tlb) {
3624     if(!c) {
3625       jaddr2=(int)out;
3626       emit_jno(0);
3627     }
3628     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80000000+RAM_SIZE) {
3629       jaddr2=(int)out;
3630       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3631     }
3632     #ifdef DESTRUCTIVE_SHIFT
3633     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3634       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3635     }
3636     #endif
3637   }else{
3638     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3639       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3640     }
3641     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3642       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3643     }
3644   }
3645   if (opcode[i]==0x31) { // LWC1
3646     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3647     //gen_tlb_addr_r(ar,map);
3648     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3649     #ifdef HOST_IMM_ADDR32
3650     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3651     else
3652     #endif
3653     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3654     type=LOADW_STUB;
3655   }
3656   if (opcode[i]==0x35) { // LDC1
3657     assert(th>=0);
3658     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3659     //gen_tlb_addr_r(ar,map);
3660     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3661     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3662     #ifdef HOST_IMM_ADDR32
3663     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3664     else
3665     #endif
3666     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3667     type=LOADD_STUB;
3668   }
3669   if (opcode[i]==0x39) { // SWC1
3670     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3671     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3672     type=STOREW_STUB;
3673   }
3674   if (opcode[i]==0x3D) { // SDC1
3675     assert(th>=0);
3676     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3677     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3678     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3679     type=STORED_STUB;
3680   }
3681   if(!using_tlb) {
3682     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3683       #ifndef DESTRUCTIVE_SHIFT
3684       temp=offset||c||s<0?ar:s;
3685       #endif
3686       #if defined(HOST_IMM8)
3687       int ir=get_reg(i_regs->regmap,INVCP);
3688       assert(ir>=0);
3689       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3690       #else
3691       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3692       #endif
3693       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3694       emit_callne(invalidate_addr_reg[temp]);
3695       #else
3696       jaddr3=(int)out;
3697       emit_jne(0);
3698       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3699       #endif
3700     }
3701   }
3702   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3703   if (opcode[i]==0x31) { // LWC1 (write float)
3704     emit_writeword_indexed(tl,0,temp);
3705   }
3706   if (opcode[i]==0x35) { // LDC1 (write double)
3707     emit_writeword_indexed(th,4,temp);
3708     emit_writeword_indexed(tl,0,temp);
3709   }
3710   //if(opcode[i]==0x39)
3711   /*if(opcode[i]==0x39||opcode[i]==0x31)
3712   {
3713     emit_pusha();
3714         emit_readword((int)&last_count,ECX);
3715         if(get_reg(i_regs->regmap,CCREG)<0)
3716           emit_loadreg(CCREG,HOST_CCREG);
3717         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3718         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3719         emit_writeword(HOST_CCREG,(int)&Count);
3720     emit_call((int)memdebug);
3721     emit_popa();
3722   }/**/
3723 #else
3724   cop1_unusable(i, i_regs);
3725 #endif
3726 }
3727
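// Assemble GTE (COP2) loads/stores, LWC2/SWC2.  The data register is moved via
// cop2_get_dreg/cop2_put_dreg; addresses outside RAM are handled by
// LOADW/STOREW stubs, and SWC2 also checks invalid_code for self-modifying code.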
3728 void c2ls_assemble(int i,struct regstat *i_regs)
3729 {
3730   int s,tl;
3731   int ar;
3732   int offset;
3733   int memtarget=0,c=0;
3734   int jaddr2=0,jaddr3,type;
3735   int agr=AGEN1+(i&1);
3736   u_int hr,reglist=0;
3737   u_int copr=(source[i]>>16)&0x1f;
3738   s=get_reg(i_regs->regmap,rs1[i]);
3739   tl=get_reg(i_regs->regmap,FTEMP);
3740   offset=imm[i];
3741   assert(rs1[i]>0);
3742   assert(tl>=0);
3743   assert(!using_tlb);
3744
3745   for(hr=0;hr<HOST_REGS;hr++) {
3746     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3747   }
3748   if(i_regs->regmap[HOST_CCREG]==CCREG)
3749     reglist&=~(1<<HOST_CCREG);
3750
3751   // get the address
3752   if (opcode[i]==0x3a) { // SWC2
3753     ar=get_reg(i_regs->regmap,agr);
3754     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3755     reglist|=1<<ar;
3756   } else { // LWC2
3757     ar=tl;
3758   }
3759   if(s>=0) c=(i_regs->wasconst>>s)&1;
3760   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3761   if (!offset&&!c&&s>=0) ar=s;
3762   assert(ar>=0);
3763
3764   if (opcode[i]==0x3a) { // SWC2
3765     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3766     type=STOREW_STUB;
3767   }
3768   else
3769     type=LOADW_STUB;
3770
3771   if(c&&!memtarget) {
3772     jaddr2=(int)out;
3773     emit_jmp(0); // inline_readstub/inline_writestub?
3774   }
3775   else {
3776     if(!c) {
3777       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3778       jaddr2=(int)out;
3779       emit_jno(0);
3780     }
3781     if (opcode[i]==0x32) { // LWC2
3782       #ifdef HOST_IMM_ADDR32
3783       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3784       else
3785       #endif
3786       emit_readword_indexed(0,ar,tl);
3787     }
3788     if (opcode[i]==0x3a) { // SWC2
3789       #ifdef DESTRUCTIVE_SHIFT
3790       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3791       #endif
3792       emit_writeword_indexed(tl,0,ar);
3793     }
3794   }
3795   if(jaddr2)
3796     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3797   if (opcode[i]==0x3a) { // SWC2
3798 #if defined(HOST_IMM8)
3799     int ir=get_reg(i_regs->regmap,INVCP);
3800     assert(ir>=0);
3801     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3802 #else
3803     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3804 #endif
3805     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3806     emit_callne(invalidate_addr_reg[ar]);
3807     #else
3808     jaddr3=(int)out;
3809     emit_jne(0);
3810     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3811     #endif
3812   }
3813   if (opcode[i]==0x32) { // LWC2
3814     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3815   }
3816 }
3817
3818 #ifndef multdiv_assemble
3819 void multdiv_assemble(int i,struct regstat *i_regs)
3820 {
3821   printf("Need multdiv_assemble for this architecture.\n");
3822   exit(1);
3823 }
3824 #endif
3825
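// MOV-type ops (MFHI/MFLO/MTHI/MTLO): copy rs1 to rt1, including the mapped
// upper half for 64-bit values.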
3826 void mov_assemble(int i,struct regstat *i_regs)
3827 {
3828   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3829   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3830   if(rt1[i]) {
3831     signed char sh,sl,th,tl;
3832     th=get_reg(i_regs->regmap,rt1[i]|64);
3833     tl=get_reg(i_regs->regmap,rt1[i]);
3834     //assert(tl>=0);
3835     if(tl>=0) {
3836       sh=get_reg(i_regs->regmap,rs1[i]|64);
3837       sl=get_reg(i_regs->regmap,rs1[i]);
3838       if(sl>=0) emit_mov(sl,tl);
3839       else emit_loadreg(rs1[i],tl);
3840       if(th>=0) {
3841         if(sh>=0) emit_mov(sh,th);
3842         else emit_loadreg(rs1[i]|64,th);
3843       }
3844     }
3845   }
3846 }
3847
3848 #ifndef fconv_assemble
3849 void fconv_assemble(int i,struct regstat *i_regs)
3850 {
3851   printf("Need fconv_assemble for this architecture.\n");
3852   exit(1);
3853 }
3854 #endif
3855
3856 #if 0
3857 void float_assemble(int i,struct regstat *i_regs)
3858 {
3859   printf("Need float_assemble for this architecture.\n");
3860   exit(1);
3861 }
3862 #endif
3863
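// SYSCALL: save the PC and the updated cycle count, then leave the translated
// block through jump_syscall_hle.  hlecall/intcall below follow the same pattern.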
3864 void syscall_assemble(int i,struct regstat *i_regs)
3865 {
3866   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3867   assert(ccreg==HOST_CCREG);
3868   assert(!is_delayslot);
3869   emit_movimm(start+i*4,EAX); // Get PC
3870   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3871   emit_jmp((int)jump_syscall_hle); // XXX
3872 }
3873
3874 void hlecall_assemble(int i,struct regstat *i_regs)
3875 {
3876   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3877   assert(ccreg==HOST_CCREG);
3878   assert(!is_delayslot);
3879   emit_movimm(start+i*4+4,0); // Get PC
3880   emit_movimm((int)psxHLEt[source[i]&7],1);
3881   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // XXX
3882   emit_jmp((int)jump_hlecall);
3883 }
3884
3885 void intcall_assemble(int i,struct regstat *i_regs)
3886 {
3887   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3888   assert(ccreg==HOST_CCREG);
3889   assert(!is_delayslot);
3890   emit_movimm(start+i*4,0); // Get PC
3891   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG);
3892   emit_jmp((int)jump_intcall);
3893 }
3894
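// Assemble the instruction sitting in a branch delay slot, dispatching to the
// normal per-type assemblers with is_delayslot set.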
3895 void ds_assemble(int i,struct regstat *i_regs)
3896 {
3897   is_delayslot=1;
3898   switch(itype[i]) {
3899     case ALU:
3900       alu_assemble(i,i_regs);break;
3901     case IMM16:
3902       imm16_assemble(i,i_regs);break;
3903     case SHIFT:
3904       shift_assemble(i,i_regs);break;
3905     case SHIFTIMM:
3906       shiftimm_assemble(i,i_regs);break;
3907     case LOAD:
3908       load_assemble(i,i_regs);break;
3909     case LOADLR:
3910       loadlr_assemble(i,i_regs);break;
3911     case STORE:
3912       store_assemble(i,i_regs);break;
3913     case STORELR:
3914       storelr_assemble(i,i_regs);break;
3915     case COP0:
3916       cop0_assemble(i,i_regs);break;
3917     case COP1:
3918       cop1_assemble(i,i_regs);break;
3919     case C1LS:
3920       c1ls_assemble(i,i_regs);break;
3921     case COP2:
3922       cop2_assemble(i,i_regs);break;
3923     case C2LS:
3924       c2ls_assemble(i,i_regs);break;
3925     case C2OP:
3926       c2op_assemble(i,i_regs);break;
3927     case FCONV:
3928       fconv_assemble(i,i_regs);break;
3929     case FLOAT:
3930       float_assemble(i,i_regs);break;
3931     case FCOMP:
3932       fcomp_assemble(i,i_regs);break;
3933     case MULTDIV:
3934       multdiv_assemble(i,i_regs);break;
3935     case MOV:
3936       mov_assemble(i,i_regs);break;
3937     case SYSCALL:
3938     case HLECALL:
3939     case INTCALL:
3940     case SPAN:
3941     case UJUMP:
3942     case RJUMP:
3943     case CJUMP:
3944     case SJUMP:
3945     case FJUMP:
3946       printf("Jump in the delay slot.  This is probably a bug.\n");
3947   }
3948   is_delayslot=0;
3949 }
3950
3951 // Is the branch target a valid internal jump?
3952 int internal_branch(uint64_t i_is32,int addr)
3953 {
3954   if(addr&1) return 0; // Indirect (register) jump
3955   if(addr>=start && addr<start+slen*4-4)
3956   {
3957     int t=(addr-start)>>2;
3958     // Delay slots are not valid branch targets
3959     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3960     // 64 -> 32 bit transition requires a recompile
3961     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3962     {
3963       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3964       else printf("optimizable: yes\n");
3965     }*/
3966     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3967 #ifndef FORCE32
3968     if(requires_32bit[t]&~i_is32) return 0;
3969     else
3970 #endif
3971       return 1;
3972   }
3973   return 0;
3974 }
3975
3976 #ifndef wb_invalidate
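// Write back dirty values in host registers whose guest mapping changes
// between 'pre' and 'entry' and is not marked unneeded (u/uu), then move
// values that merely migrated to a different host register.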
3977 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3978   uint64_t u,uint64_t uu)
3979 {
3980   int hr;
3981   for(hr=0;hr<HOST_REGS;hr++) {
3982     if(hr!=EXCLUDE_REG) {
3983       if(pre[hr]!=entry[hr]) {
3984         if(pre[hr]>=0) {
3985           if((dirty>>hr)&1) {
3986             if(get_reg(entry,pre[hr])<0) {
3987               if(pre[hr]<64) {
3988                 if(!((u>>pre[hr])&1)) {
3989                   emit_storereg(pre[hr],hr);
3990                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3991                     emit_sarimm(hr,31,hr);
3992                     emit_storereg(pre[hr]|64,hr);
3993                   }
3994                 }
3995               }else{
3996                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3997                   emit_storereg(pre[hr],hr);
3998                 }
3999               }
4000             }
4001           }
4002         }
4003       }
4004     }
4005   }
4006   // Move from one register to another (no writeback)
4007   for(hr=0;hr<HOST_REGS;hr++) {
4008     if(hr!=EXCLUDE_REG) {
4009       if(pre[hr]!=entry[hr]) {
4010         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
4011           int nr;
4012           if((nr=get_reg(entry,pre[hr]))>=0) {
4013             emit_mov(hr,nr);
4014           }
4015         }
4016       }
4017     }
4018   }
4019 }
4020 #endif
4021
4022 // Load the specified registers
4023 // This only loads the registers given as arguments because
4024 // we don't want to load things that will be overwritten
4025 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
4026 {
4027   int hr;
4028   // Load 32-bit regs
4029   for(hr=0;hr<HOST_REGS;hr++) {
4030     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4031       if(entry[hr]!=regmap[hr]) {
4032         if(regmap[hr]==rs1||regmap[hr]==rs2)
4033         {
4034           if(regmap[hr]==0) {
4035             emit_zeroreg(hr);
4036           }
4037           else
4038           {
4039             emit_loadreg(regmap[hr],hr);
4040           }
4041         }
4042       }
4043     }
4044   }
4045   // Load 64-bit regs
4046   for(hr=0;hr<HOST_REGS;hr++) {
4047     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4048       if(entry[hr]!=regmap[hr]) {
4049         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
4050         {
4051           assert(regmap[hr]!=64);
4052           if((is32>>(regmap[hr]&63))&1) {
4053             int lr=get_reg(regmap,regmap[hr]-64);
4054             if(lr>=0)
4055               emit_sarimm(lr,31,hr);
4056             else
4057               emit_loadreg(regmap[hr],hr);
4058           }
4059           else
4060           {
4061             emit_loadreg(regmap[hr],hr);
4062           }
4063         }
4064       }
4065     }
4066   }
4067 }
4068
4069 // Load registers prior to the start of a loop
4070 // so that they are not loaded within the loop
4071 static void loop_preload(signed char pre[],signed char entry[])
4072 {
4073   int hr;
4074   for(hr=0;hr<HOST_REGS;hr++) {
4075     if(hr!=EXCLUDE_REG) {
4076       if(pre[hr]!=entry[hr]) {
4077         if(entry[hr]>=0) {
4078           if(get_reg(pre,entry[hr])<0) {
4079             assem_debug("loop preload:\n");
4080             //printf("loop preload: %d\n",hr);
4081             if(entry[hr]==0) {
4082               emit_zeroreg(hr);
4083             }
4084             else if(entry[hr]<TEMPREG)
4085             {
4086               emit_loadreg(entry[hr],hr);
4087             }
4088             else if(entry[hr]-64<TEMPREG)
4089             {
4090               emit_loadreg(entry[hr],hr);
4091             }
4092           }
4093         }
4094       }
4095     }
4096   }
4097 }
4098
4099 // Generate address for load/store instruction
4100 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
4101 void address_generation(int i,struct regstat *i_regs,signed char entry[])
4102 {
4103   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
4104     int ra=-1;
4105     int agr=AGEN1+(i&1);
4106     int mgr=MGEN1+(i&1);
4107     if(itype[i]==LOAD) {
4108       ra=get_reg(i_regs->regmap,rt1[i]);
4109       if(ra<0) ra=get_reg(i_regs->regmap,-1); 
4110       assert(ra>=0);
4111     }
4112     if(itype[i]==LOADLR) {
4113       ra=get_reg(i_regs->regmap,FTEMP);
4114     }
4115     if(itype[i]==STORE||itype[i]==STORELR) {
4116       ra=get_reg(i_regs->regmap,agr);
4117       if(ra<0) ra=get_reg(i_regs->regmap,-1);
4118     }
4119     if(itype[i]==C1LS||itype[i]==C2LS) {
4120       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
4121         ra=get_reg(i_regs->regmap,FTEMP);
4122       else { // SWC1/SDC1/SWC2/SDC2
4123         ra=get_reg(i_regs->regmap,agr);
4124         if(ra<0) ra=get_reg(i_regs->regmap,-1);
4125       }
4126     }
4127     int rs=get_reg(i_regs->regmap,rs1[i]);
4128     int rm=get_reg(i_regs->regmap,TLREG);
4129     if(ra>=0) {
4130       int offset=imm[i];
4131       int c=(i_regs->wasconst>>rs)&1;
4132       if(rs1[i]==0) {
4133         // Using r0 as a base address
4134         /*if(rm>=0) {
4135           if(!entry||entry[rm]!=mgr) {
4136             generate_map_const(offset,rm);
4137           } // else did it in the previous cycle
4138         }*/
4139         if(!entry||entry[ra]!=agr) {
4140           if (opcode[i]==0x22||opcode[i]==0x26) {
4141             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4142           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4143             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4144           }else{
4145             emit_movimm(offset,ra);
4146           }
4147         } // else did it in the previous cycle
4148       }
4149       else if(rs<0) {
4150         if(!entry||entry[ra]!=rs1[i])
4151           emit_loadreg(rs1[i],ra);
4152         //if(!entry||entry[ra]!=rs1[i])
4153         //  printf("poor load scheduling!\n");
4154       }
4155       else if(c) {
4156         if(rm>=0) {
4157           if(!entry||entry[rm]!=mgr) {
4158             if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a) {
4159               // Stores to memory go through the mapper to detect self-modifying
4160               // code, loads don't.
4161               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
4162                  (unsigned int)(constmap[i][rs]+offset)<0x80000000+RAM_SIZE )
4163                 generate_map_const(constmap[i][rs]+offset,rm);
4164             }else{
4165               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
4166                 generate_map_const(constmap[i][rs]+offset,rm);
4167             }
4168           }
4169         }
4170         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
4171           if(!entry||entry[ra]!=agr) {
4172             if (opcode[i]==0x22||opcode[i]==0x26) {
4173               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4174             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4175               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4176             }else{
4177               #ifdef HOST_IMM_ADDR32
4178               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4179                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
4180               #endif
4181               emit_movimm(constmap[i][rs]+offset,ra);
4182             }
4183           } // else did it in the previous cycle
4184         } // else load_consts already did it
4185       }
4186       if(offset&&!c&&rs1[i]) {
4187         if(rs>=0) {
4188           emit_addimm(rs,offset,ra);
4189         }else{
4190           emit_addimm(ra,offset,ra);
4191         }
4192       }
4193     }
4194   }
4195   // Preload constants for next instruction
4196   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
4197     int agr,ra;
4198     #ifndef HOST_IMM_ADDR32
4199     // Mapper entry
4200     agr=MGEN1+((i+1)&1);
4201     ra=get_reg(i_regs->regmap,agr);
4202     if(ra>=0) {
4203       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4204       int offset=imm[i+1];
4205       int c=(regs[i+1].wasconst>>rs)&1;
4206       if(c) {
4207         if(itype[i+1]==STORE||itype[i+1]==STORELR
4208            ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1, SWC2/SDC2
4209           // Stores to memory go through the mapper to detect self-modifying
4210           // code, loads don't.
4211           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
4212              (unsigned int)(constmap[i+1][rs]+offset)<0x80000000+RAM_SIZE )
4213             generate_map_const(constmap[i+1][rs]+offset,ra);
4214         }else{
4215           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
4216             generate_map_const(constmap[i+1][rs]+offset,ra);
4217         }
4218       }
4219       /*else if(rs1[i]==0) {
4220         generate_map_const(offset,ra);
4221       }*/
4222     }
4223     #endif
4224     // Actual address
4225     agr=AGEN1+((i+1)&1);
4226     ra=get_reg(i_regs->regmap,agr);
4227     if(ra>=0) {
4228       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4229       int offset=imm[i+1];
4230       int c=(regs[i+1].wasconst>>rs)&1;
4231       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
4232         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4233           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4234         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4235           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4236         }else{
4237           #ifdef HOST_IMM_ADDR32
4238           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4239              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
4240           #endif
4241           emit_movimm(constmap[i+1][rs]+offset,ra);
4242         }
4243       }
4244       else if(rs1[i+1]==0) {
4245         // Using r0 as a base address
4246         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4247           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4248         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4249           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4250         }else{
4251           emit_movimm(offset,ra);
4252         }
4253       }
4254     }
4255   }
4256 }
4257
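// For a host register holding a constant, scan forward while the mapping and
// constant survive (stopping at branch targets) to find the value that really
// needs to be materialized; also folds the immediate of a following load so
// the address can be precomputed.  Returns 0 if the constant is unneeded.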
4258 int get_final_value(int hr, int i, int *value)
4259 {
4260   int reg=regs[i].regmap[hr];
4261   while(i<slen-1) {
4262     if(regs[i+1].regmap[hr]!=reg) break;
4263     if(!((regs[i+1].isconst>>hr)&1)) break;
4264     if(bt[i+1]) break;
4265     i++;
4266   }
4267   if(i<slen-1) {
4268     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
4269       *value=constmap[i][hr];
4270       return 1;
4271     }
4272     if(!bt[i+1]) {
4273       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
4274         // Load in delay slot, out-of-order execution
4275         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
4276         {
4277           #ifdef HOST_IMM_ADDR32
4278           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
4279           #endif
4280           // Precompute load address
4281           *value=constmap[i][hr]+imm[i+2];
4282           return 1;
4283         }
4284       }
4285       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
4286       {
4287         #ifdef HOST_IMM_ADDR32
4288         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
4289         #endif
4290         // Precompute load address
4291         *value=constmap[i][hr]+imm[i+1];
4292         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
4293         return 1;
4294       }
4295     }
4296   }
4297   *value=constmap[i][hr];
4298   //printf("c=%x\n",(int)constmap[i][hr]);
4299   if(i==slen-1) return 1;
4300   if(reg<64) {
4301     return !((unneeded_reg[i+1]>>reg)&1);
4302   }else{
4303     return !((unneeded_reg_upper[i+1]>>reg)&1);
4304   }
4305 }
4306
4307 // Load registers with known constants
4308 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
4309 {
4310   int hr;
4311   // Load 32-bit regs
4312   for(hr=0;hr<HOST_REGS;hr++) {
4313     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4314       //if(entry[hr]!=regmap[hr]) {
4315       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4316         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4317           int value;
4318           if(get_final_value(hr,i,&value)) {
4319             if(value==0) {
4320               emit_zeroreg(hr);
4321             }
4322             else {
4323               emit_movimm(value,hr);
4324             }
4325           }
4326         }
4327       }
4328     }
4329   }
4330   // Load 64-bit regs
4331   for(hr=0;hr<HOST_REGS;hr++) {
4332     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4333       //if(entry[hr]!=regmap[hr]) {
4334       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4335         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4336           if((is32>>(regmap[hr]&63))&1) {
4337             int lr=get_reg(regmap,regmap[hr]-64);
4338             assert(lr>=0);
4339             emit_sarimm(lr,31,hr);
4340           }
4341           else
4342           {
4343             int value;
4344             if(get_final_value(hr,i,&value)) {
4345               if(value==0) {
4346                 emit_zeroreg(hr);
4347               }
4348               else {
4349                 emit_movimm(value,hr);
4350               }
4351             }
4352           }
4353         }
4354       }
4355     }
4356   }
4357 }
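// Like load_consts, but materializes the constant for every dirty mapped
// register, regardless of what the previous instruction already loaded.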
4358 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4359 {
4360   int hr;
4361   // Load 32-bit regs
4362   for(hr=0;hr<HOST_REGS;hr++) {
4363     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4364       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4365         int value=constmap[i][hr];
4366         if(value==0) {
4367           emit_zeroreg(hr);
4368         }
4369         else {
4370           emit_movimm(value,hr);
4371         }
4372       }
4373     }
4374   }
4375   // Load 64-bit regs
4376   for(hr=0;hr<HOST_REGS;hr++) {
4377     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4378       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4379         if((is32>>(regmap[hr]&63))&1) {
4380           int lr=get_reg(regmap,regmap[hr]-64);
4381           assert(lr>=0);
4382           emit_sarimm(lr,31,hr);
4383         }
4384         else
4385         {
4386           int value=constmap[i][hr];
4387           if(value==0) {
4388             emit_zeroreg(hr);
4389           }
4390           else {
4391             emit_movimm(value,hr);
4392           }
4393         }
4394       }
4395     }
4396   }
4397 }
4398
4399 // Write out all dirty registers (except cycle count)
4400 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4401 {
4402   int hr;
4403   for(hr=0;hr<HOST_REGS;hr++) {
4404     if(hr!=EXCLUDE_REG) {
4405       if(i_regmap[hr]>0) {
4406         if(i_regmap[hr]!=CCREG) {
4407           if((i_dirty>>hr)&1) {
4408             if(i_regmap[hr]<64) {
4409               emit_storereg(i_regmap[hr],hr);
4410 #ifndef FORCE32
4411               if( ((i_is32>>i_regmap[hr])&1) ) {
4412                 #ifdef DESTRUCTIVE_WRITEBACK
4413                 emit_sarimm(hr,31,hr);
4414                 emit_storereg(i_regmap[hr]|64,hr);
4415                 #else
4416                 emit_sarimm(hr,31,HOST_TEMPREG);
4417                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4418                 #endif
4419               }
4420 #endif
4421             }else{
4422               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4423                 emit_storereg(i_regmap[hr],hr);
4424               }
4425             }
4426           }
4427         }
4428       }
4429     }
4430   }
4431 }
4432 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4433 // This writes the registers not written by store_regs_bt
4434 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4435 {
4436   int hr;
4437   int t=(addr-start)>>2;
4438   for(hr=0;hr<HOST_REGS;hr++) {
4439     if(hr!=EXCLUDE_REG) {
4440       if(i_regmap[hr]>0) {
4441         if(i_regmap[hr]!=CCREG) {
4442           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4443             if((i_dirty>>hr)&1) {
4444               if(i_regmap[hr]<64) {
4445                 emit_storereg(i_regmap[hr],hr);
4446 #ifndef FORCE32
4447                 if( ((i_is32>>i_regmap[hr])&1) ) {
4448                   #ifdef DESTRUCTIVE_WRITEBACK
4449                   emit_sarimm(hr,31,hr);
4450                   emit_storereg(i_regmap[hr]|64,hr);
4451                   #else
4452                   emit_sarimm(hr,31,HOST_TEMPREG);
4453                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4454                   #endif
4455                 }
4456 #endif
4457               }else{
4458                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4459                   emit_storereg(i_regmap[hr],hr);
4460                 }
4461               }
4462             }
4463           }
4464         }
4465       }
4466     }
4467   }
4468 }
4469
4470 // Load all registers (except cycle count)
4471 void load_all_regs(signed char i_regmap[])
4472 {
4473   int hr;
4474   for(hr=0;hr<HOST_REGS;hr++) {
4475     if(hr!=EXCLUDE_REG) {
4476       if(i_regmap[hr]==0) {
4477         emit_zeroreg(hr);
4478       }
4479       else
4480       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4481       {
4482         emit_loadreg(i_regmap[hr],hr);
4483       }
4484     }
4485   }
4486 }
4487
4488 // Load all currently mapped registers that are also needed by the next instruction
4489 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4490 {
4491   int hr;
4492   for(hr=0;hr<HOST_REGS;hr++) {
4493     if(hr!=EXCLUDE_REG) {
4494       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4495         if(i_regmap[hr]==0) {
4496           emit_zeroreg(hr);
4497         }
4498         else
4499         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4500         {
4501           emit_loadreg(i_regmap[hr],hr);
4502         }
4503       }
4504     }
4505   }
4506 }
4507
4508 // Load all regs, storing cycle count if necessary
4509 void load_regs_entry(int t)
4510 {
4511   int hr;
4512   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG);
4513   else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG);
4514   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4515     emit_storereg(CCREG,HOST_CCREG);
4516   }
4517   // Load 32-bit regs
4518   for(hr=0;hr<HOST_REGS;hr++) {
4519     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4520       if(regs[t].regmap_entry[hr]==0) {
4521         emit_zeroreg(hr);
4522       }
4523       else if(regs[t].regmap_entry[hr]!=CCREG)
4524       {
4525         emit_loadreg(regs[t].regmap_entry[hr],hr);
4526       }
4527     }
4528   }
4529   // Load 64-bit regs
4530   for(hr=0;hr<HOST_REGS;hr++) {
4531     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4532       assert(regs[t].regmap_entry[hr]!=64);
4533       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4534         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4535         if(lr<0) {
4536           emit_loadreg(regs[t].regmap_entry[hr],hr);
4537         }
4538         else
4539         {
4540           emit_sarimm(lr,31,hr);
4541         }
4542       }
4543       else
4544       {
4545         emit_loadreg(regs[t].regmap_entry[hr],hr);
4546       }
4547     }
4548   }
4549 }
4550
4551 // Store dirty registers prior to branch
4552 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4553 {
4554   if(internal_branch(i_is32,addr))
4555   {
4556     int t=(addr-start)>>2;
4557     int hr;
4558     for(hr=0;hr<HOST_REGS;hr++) {
4559       if(hr!=EXCLUDE_REG) {
4560         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4561           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4562             if((i_dirty>>hr)&1) {
4563               if(i_regmap[hr]<64) {
4564                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4565                   emit_storereg(i_regmap[hr],hr);
4566                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4567                     #ifdef DESTRUCTIVE_WRITEBACK
4568                     emit_sarimm(hr,31,hr);
4569                     emit_storereg(i_regmap[hr]|64,hr);
4570                     #else
4571                     emit_sarimm(hr,31,HOST_TEMPREG);
4572                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4573                     #endif
4574                   }
4575                 }
4576               }else{
4577                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4578                   emit_storereg(i_regmap[hr],hr);
4579                 }
4580               }
4581             }
4582           }
4583         }
4584       }
4585     }
4586   }
4587   else
4588   {
4589     // Branch out of this block, write out all dirty regs
4590     wb_dirtys(i_regmap,i_is32,i_dirty);
4591   }
4592 }
4593
4594 // Load all needed registers for branch target
4595 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4596 {
4597   //if(addr>=start && addr<(start+slen*4))
4598   if(internal_branch(i_is32,addr))
4599   {
4600     int t=(addr-start)>>2;
4601     int hr;
4602     // Store the cycle count before loading something else
4603     if(i_regmap[HOST_CCREG]!=CCREG) {
4604       assert(i_regmap[HOST_CCREG]==-1);
4605     }
4606     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4607       emit_storereg(CCREG,HOST_CCREG);
4608     }
4609     // Load 32-bit regs
4610     for(hr=0;hr<HOST_REGS;hr++) {
4611       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4612         #ifdef DESTRUCTIVE_WRITEBACK
4613         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4614         #else
4615         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4616         #endif
4617           if(regs[t].regmap_entry[hr]==0) {
4618             emit_zeroreg(hr);
4619           }
4620           else if(regs[t].regmap_entry[hr]!=CCREG)
4621           {
4622             emit_loadreg(regs[t].regmap_entry[hr],hr);
4623           }
4624         }
4625       }
4626     }
4627     // Load 64-bit regs
4628     for(hr=0;hr<HOST_REGS;hr++) {
4629       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4630         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4631           assert(regs[t].regmap_entry[hr]!=64);
4632           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4633             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4634             if(lr<0) {
4635               emit_loadreg(regs[t].regmap_entry[hr],hr);
4636             }
4637             else
4638             {
4639               emit_sarimm(lr,31,hr);
4640             }
4641           }
4642           else
4643           {
4644             emit_loadreg(regs[t].regmap_entry[hr],hr);
4645           }
4646         }
4647         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4648           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4649           assert(lr>=0);
4650           emit_sarimm(lr,31,hr);
4651         }
4652       }
4653     }
4654   }
4655 }
4656
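// Return 1 if the current register state (mapping, dirtiness, 32/64-bit
// width) is compatible with what the branch target expects, so the branch can
// be linked directly without extra writeback/reload code.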
4657 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4658 {
4659   if(addr>=start && addr<start+slen*4-4)
4660   {
4661     int t=(addr-start)>>2;
4662     int hr;
4663     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4664     for(hr=0;hr<HOST_REGS;hr++)
4665     {
4666       if(hr!=EXCLUDE_REG)
4667       {
4668         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4669         {
4670           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4671           {
4672             return 0;
4673           }
4674           else 
4675           if((i_dirty>>hr)&1)
4676           {
4677             if(i_regmap[hr]<TEMPREG)
4678             {
4679               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4680                 return 0;
4681             }
4682             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4683             {
4684               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4685                 return 0;
4686             }
4687           }
4688         }
4689         else // Same register but is it 32-bit or dirty?
4690         if(i_regmap[hr]>=0)
4691         {
4692           if(!((regs[t].dirty>>hr)&1))
4693           {
4694             if((i_dirty>>hr)&1)
4695             {
4696               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4697               {
4698                 //printf("%x: dirty no match\n",addr);
4699                 return 0;
4700               }
4701             }
4702           }
4703           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4704           {
4705             //printf("%x: is32 no match\n",addr);
4706             return 0;
4707           }
4708         }
4709       }
4710     }
4711     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4712 #ifndef FORCE32
4713     if(requires_32bit[t]&~i_is32) return 0;
4714 #endif
4715     // Delay slots are not valid branch targets
4716     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4717     // Delay slots require additional processing, so do not match
4718     if(is_ds[t]) return 0;
4719   }
4720   else
4721   {
4722     int hr;
4723     for(hr=0;hr<HOST_REGS;hr++)
4724     {
4725       if(hr!=EXCLUDE_REG)
4726       {
4727         if(i_regmap[hr]>=0)
4728         {
4729           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4730           {
4731             if((i_dirty>>hr)&1)
4732             {
4733               return 0;
4734             }
4735           }
4736         }
4737       }
4738     }
4739   }
4740   return 1;
4741 }
4742
4743 // Used when a branch jumps into the delay slot of another branch
4744 void ds_assemble_entry(int i)
4745 {
4746   int t=(ba[i]-start)>>2;
4747   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4748   assem_debug("Assemble delay slot at %x\n",ba[i]);
4749   assem_debug("<->\n");
4750   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4751     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4752   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4753   address_generation(t,&regs[t],regs[t].regmap_entry);
4754   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4755     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4756   cop1_usable=0;
4757   is_delayslot=0;
4758   switch(itype[t]) {
4759     case ALU:
4760       alu_assemble(t,&regs[t]);break;
4761     case IMM16:
4762       imm16_assemble(t,&regs[t]);break;
4763     case SHIFT:
4764       shift_assemble(t,&regs[t]);break;
4765     case SHIFTIMM:
4766       shiftimm_assemble(t,&regs[t]);break;
4767     case LOAD:
4768       load_assemble(t,&regs[t]);break;
4769     case LOADLR:
4770       loadlr_assemble(t,&regs[t]);break;
4771     case STORE:
4772       store_assemble(t,&regs[t]);break;
4773     case STORELR:
4774       storelr_assemble(t,&regs[t]);break;
4775     case COP0:
4776       cop0_assemble(t,&regs[t]);break;
4777     case COP1:
4778       cop1_assemble(t,&regs[t]);break;
4779     case C1LS:
4780       c1ls_assemble(t,&regs[t]);break;
4781     case COP2:
4782       cop2_assemble(t,&regs[t]);break;
4783     case C2LS:
4784       c2ls_assemble(t,&regs[t]);break;
4785     case C2OP:
4786       c2op_assemble(t,&regs[t]);break;
4787     case FCONV:
4788       fconv_assemble(t,&regs[t]);break;
4789     case FLOAT:
4790       float_assemble(t,&regs[t]);break;
4791     case FCOMP:
4792       fcomp_assemble(t,&regs[t]);break;
4793     case MULTDIV:
4794       multdiv_assemble(t,&regs[t]);break;
4795     case MOV:
4796       mov_assemble(t,&regs[t]);break;
4797     case SYSCALL:
4798     case HLECALL:
4799     case INTCALL:
4800     case SPAN:
4801     case UJUMP:
4802     case RJUMP:
4803     case CJUMP:
4804     case SJUMP:
4805     case FJUMP:
4806       printf("Jump in the delay slot.  This is probably a bug.\n");
4807   }
4808   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4809   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4810   if(internal_branch(regs[t].is32,ba[i]+4))
4811     assem_debug("branch: internal\n");
4812   else
4813     assem_debug("branch: external\n");
4814   assert(internal_branch(regs[t].is32,ba[i]+4));
4815   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4816   emit_jmp(0);
4817 }
4818
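// Emit the cycle-count check before a branch and register a CC_STUB to take
// when the count expires.  A branch to itself with a NOP in the delay slot is
// treated as an idle loop; the counter appears to be masked down so the
// pending interrupt is reached without spinning.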
4819 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4820 {
4821   int count;
4822   int jaddr;
4823   int idle=0;
4824   if(itype[i]==RJUMP)
4825   {
4826     *adj=0;
4827   }
4828   //if(ba[i]>=start && ba[i]<(start+slen*4))
4829   if(internal_branch(branch_regs[i].is32,ba[i]))
4830   {
4831     int t=(ba[i]-start)>>2;
4832     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4833     else *adj=ccadj[t];
4834   }
4835   else
4836   {
4837     *adj=0;
4838   }
4839   count=ccadj[i];
4840   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4841     // Idle loop
4842     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4843     idle=(int)out;
4844     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4845     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4846     jaddr=(int)out;
4847     emit_jmp(0);
4848   }
4849   else if(*adj==0||invert) {
4850     emit_addimm_and_set_flags(CLOCK_DIVIDER*(count+2),HOST_CCREG);
4851     jaddr=(int)out;
4852     emit_jns(0);
4853   }
4854   else
4855   {
4856     emit_cmpimm(HOST_CCREG,-CLOCK_DIVIDER*(count+2));
4857     jaddr=(int)out;
4858     emit_jns(0);
4859   }
4860   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4861 }
4862
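// Out-of-line handler for an expired cycle count: write back dirty registers,
// store the return PC (selected with conditional moves when it depends on the
// branch outcome), call cc_interrupt, then reload whatever the resumption
// path needs and jump back to the compiled code.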
4863 void do_ccstub(int n)
4864 {
4865   literal_pool(256);
4866   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4867   set_jump_target(stubs[n][1],(int)out);
4868   int i=stubs[n][4];
4869   if(stubs[n][6]==NULLDS) {
4870     // Delay slot instruction is nullified ("likely" branch)
4871     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4872   }
4873   else if(stubs[n][6]!=TAKEN) {
4874     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4875   }
4876   else {
4877     if(internal_branch(branch_regs[i].is32,ba[i]))
4878       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4879   }
4880   if(stubs[n][5]!=-1)
4881   {
4882     // Save PC as return address
4883     emit_movimm(stubs[n][5],EAX);
4884     emit_writeword(EAX,(int)&pcaddr);
4885   }
4886   else
4887   {
4888     // Return address depends on which way the branch goes
4889     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4890     {
4891       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4892       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4893       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4894       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4895       if(rs1[i]==0)
4896       {
4897         s1l=s2l;s1h=s2h;
4898         s2l=s2h=-1;
4899       }
4900       else if(rs2[i]==0)
4901       {
4902         s2l=s2h=-1;
4903       }
4904       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4905         s1h=s2h=-1;
4906       }
4907       assert(s1l>=0);
4908       #ifdef DESTRUCTIVE_WRITEBACK
4909       if(rs1[i]) {
4910         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4911           emit_loadreg(rs1[i],s1l);
4912       } 
4913       else {
4914         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4915           emit_loadreg(rs2[i],s1l);
4916       }
4917       if(s2l>=0)
4918         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4919           emit_loadreg(rs2[i],s2l);
4920       #endif
4921       int hr=0;
4922       int addr=-1,alt=-1,ntaddr=-1;
4923       while(hr<HOST_REGS)
4924       {
4925         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4926            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4927            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4928         {
4929           addr=hr++;break;
4930         }
4931         hr++;
4932       }
4933       while(hr<HOST_REGS)
4934       {
4935         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4936            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4937            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4938         {
4939           alt=hr++;break;
4940         }
4941         hr++;
4942       }
4943       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4944       {
4945         while(hr<HOST_REGS)
4946         {
4947           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4948              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4949              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4950           {
4951             ntaddr=hr;break;
4952           }
4953           hr++;
4954         }
4955         assert(hr<HOST_REGS);
4956       }
4957       if((opcode[i]&0x2f)==4) // BEQ
4958       {
4959         #ifdef HAVE_CMOV_IMM
4960         if(s1h<0) {
4961           if(s2l>=0) emit_cmp(s1l,s2l);
4962           else emit_test(s1l,s1l);
4963           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4964         }
4965         else
4966         #endif
4967         {
4968           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4969           if(s1h>=0) {
4970             if(s2h>=0) emit_cmp(s1h,s2h);
4971             else emit_test(s1h,s1h);
4972             emit_cmovne_reg(alt,addr);
4973           }
4974           if(s2l>=0) emit_cmp(s1l,s2l);
4975           else emit_test(s1l,s1l);
4976           emit_cmovne_reg(alt,addr);
4977         }
4978       }
4979       if((opcode[i]&0x2f)==5) // BNE
4980       {
4981         #ifdef HAVE_CMOV_IMM
4982         if(s1h<0) {
4983           if(s2l>=0) emit_cmp(s1l,s2l);
4984           else emit_test(s1l,s1l);
4985           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4986         }
4987         else
4988         #endif
4989         {
4990           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4991           if(s1h>=0) {
4992             if(s2h>=0) emit_cmp(s1h,s2h);
4993             else emit_test(s1h,s1h);
4994             emit_cmovne_reg(alt,addr);
4995           }
4996           if(s2l>=0) emit_cmp(s1l,s2l);
4997           else emit_test(s1l,s1l);
4998           emit_cmovne_reg(alt,addr);
4999         }
5000       }
5001       if((opcode[i]&0x2f)==6) // BLEZ
5002       {
5003         //emit_movimm(ba[i],alt);
5004         //emit_movimm(start+i*4+8,addr);
5005         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5006         emit_cmpimm(s1l,1);
5007         if(s1h>=0) emit_mov(addr,ntaddr);
5008         emit_cmovl_reg(alt,addr);
5009         if(s1h>=0) {
5010           emit_test(s1h,s1h);
5011           emit_cmovne_reg(ntaddr,addr);
5012           emit_cmovs_reg(alt,addr);
5013         }
5014       }
5015       if((opcode[i]&0x2f)==7) // BGTZ
5016       {
5017         //emit_movimm(ba[i],addr);
5018         //emit_movimm(start+i*4+8,ntaddr);
5019         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5020         emit_cmpimm(s1l,1);
5021         if(s1h>=0) emit_mov(addr,alt);
5022         emit_cmovl_reg(ntaddr,addr);
5023         if(s1h>=0) {
5024           emit_test(s1h,s1h);
5025           emit_cmovne_reg(alt,addr);
5026           emit_cmovs_reg(ntaddr,addr);
5027         }
5028       }
5029       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
5030       {
5031         //emit_movimm(ba[i],alt);
5032         //emit_movimm(start+i*4+8,addr);
5033         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5034         if(s1h>=0) emit_test(s1h,s1h);
5035         else emit_test(s1l,s1l);
5036         emit_cmovs_reg(alt,addr);
5037       }
5038       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
5039       {
5040         //emit_movimm(ba[i],addr);
5041         //emit_movimm(start+i*4+8,alt);
5042         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5043         if(s1h>=0) emit_test(s1h,s1h);
5044         else emit_test(s1l,s1l);
5045         emit_cmovs_reg(alt,addr);
5046       }
5047       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
5048         if(source[i]&0x10000) // BC1T
5049         {
5050           //emit_movimm(ba[i],alt);
5051           //emit_movimm(start+i*4+8,addr);
5052           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5053           emit_testimm(s1l,0x800000);
5054           emit_cmovne_reg(alt,addr);
5055         }
5056         else // BC1F
5057         {
5058           //emit_movimm(ba[i],addr);
5059           //emit_movimm(start+i*4+8,alt);
5060           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5061           emit_testimm(s1l,0x800000);
5062           emit_cmovne_reg(alt,addr);
5063         }
5064       }
5065       emit_writeword(addr,(int)&pcaddr);
5066     }
5067     else
5068     if(itype[i]==RJUMP)
5069     {
5070       int r=get_reg(branch_regs[i].regmap,rs1[i]);
5071       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5072         r=get_reg(branch_regs[i].regmap,RTEMP);
5073       }
5074       emit_writeword(r,(int)&pcaddr);
5075     }
5076     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
5077   }
5078   // Update cycle count
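  // stubs[n][3] holds the pending cycle adjustment for this branch: it is
  // added so cc_interrupt sees an up-to-date count, then removed again so it
  // is not counted twice when the compiled code applies it on the way out.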
5079   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
5080   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
5081   emit_call((int)cc_interrupt);
5082   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
5083   if(stubs[n][6]==TAKEN) {
5084     if(internal_branch(branch_regs[i].is32,ba[i]))
5085       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
5086     else if(itype[i]==RJUMP) {
5087       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
5088         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
5089       else
5090         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
5091     }
5092   }else if(stubs[n][6]==NOTTAKEN) {
5093     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
5094     else load_all_regs(branch_regs[i].regmap);
5095   }else if(stubs[n][6]==NULLDS) {
5096     // Delay slot instruction is nullified ("likely" branch)
5097     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
5098     else load_all_regs(regs[i].regmap);
5099   }else{
5100     load_all_regs(branch_regs[i].regmap);
5101   }
5102   emit_jmp(stubs[n][2]); // return address
5103   
5104   /* This works but uses a lot of memory...
5105   emit_readword((int)&last_count,ECX);
5106   emit_add(HOST_CCREG,ECX,EAX);
5107   emit_writeword(EAX,(int)&Count);
5108   emit_call((int)gen_interupt);
5109   emit_readword((int)&Count,HOST_CCREG);
5110   emit_readword((int)&next_interupt,EAX);
5111   emit_readword((int)&pending_exception,EBX);
5112   emit_writeword(EAX,(int)&last_count);
5113   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
5114   emit_test(EBX,EBX);
5115   int jne_instr=(int)out;
5116   emit_jne(0);
5117   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
5118   load_all_regs(branch_regs[i].regmap);
5119   emit_jmp(stubs[n][2]); // return address
5120   set_jump_target(jne_instr,(int)out);
5121   emit_readword((int)&pcaddr,EAX);
5122   // Call get_addr_ht instead of doing the hash table here.
5123   // This code is executed infrequently and takes up a lot of space
5124   // so smaller is better.
5125   emit_storereg(CCREG,HOST_CCREG);
5126   emit_pushreg(EAX);
5127   emit_call((int)get_addr_ht);
5128   emit_loadreg(CCREG,HOST_CCREG);
5129   emit_addimm(ESP,4,ESP);
5130   emit_jmpreg(EAX);*/
5131 }
5132
5133 static void add_to_linker(int addr,int target,int ext)
5134 {
5135   link_addr[linkcount][0]=addr;
5136   link_addr[linkcount][1]=target;
5137   link_addr[linkcount][2]=ext;  
5138   linkcount++;
5139 }
5140
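// Write the JAL return address (the instruction after the delay slot,
// start+i*4+8) into the link register. Split out from ujump_assemble so it
// can be emitted either before or after the delay slot, depending on whether
// the delay slot reads $ra (see the ra_done handling in ujump_assemble).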
5141 static void ujump_assemble_write_ra(int i)
5142 {
5143   int rt;
5144   unsigned int return_address;
5145   rt=get_reg(branch_regs[i].regmap,31);
5146   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5147   //assert(rt>=0);
5148   return_address=start+i*4+8;
5149   if(rt>=0) {
5150     #ifdef USE_MINI_HT
5151     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
5152       int temp=-1; // note: must be ds-safe
5153       #ifdef HOST_TEMPREG
5154       temp=HOST_TEMPREG;
5155       #endif
5156       if(temp>=0) do_miniht_insert(return_address,rt,temp);
5157       else emit_movimm(return_address,rt);
5158     }
5159     else
5160     #endif
5161     {
5162       #ifdef REG_PREFETCH
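      // note: temp and i_regmap are not declared in this helper, so this
      // prefetch path appears to predate the split from ujump_assemble and
      // would need them passed in for REG_PREFETCH builds.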
5163       if(temp>=0) 
5164       {
5165         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5166       }
5167       #endif
5168       emit_movimm(return_address,rt); // PC into link register
5169       #ifdef IMM_PREFETCH
5170       emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5171       #endif
5172     }
5173   }
5174 }
5175
5176 void ujump_assemble(int i,struct regstat *i_regs)
5177 {
5178   signed char *i_regmap=i_regs->regmap;
5179   int ra_done=0;
5180   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5181   address_generation(i+1,i_regs,regs[i].regmap_entry);
5182   #ifdef REG_PREFETCH
5183   int temp=get_reg(branch_regs[i].regmap,PTEMP);
5184   if(rt1[i]==31&&temp>=0) 
5185   {
5186     int return_address=start+i*4+8;
5187     if(get_reg(branch_regs[i].regmap,31)>0) 
5188     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5189   }
5190   #endif
5191   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5192     ujump_assemble_write_ra(i); // writeback ra for DS
5193     ra_done=1;
5194   }
5195   ds_assemble(i+1,i_regs);
5196   uint64_t bc_unneeded=branch_regs[i].u;
5197   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5198   bc_unneeded|=1|(1LL<<rt1[i]);
5199   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5200   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5201                 bc_unneeded,bc_unneeded_upper);
5202   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5203   if(!ra_done&&rt1[i]==31)
5204     ujump_assemble_write_ra(i);
5205   int cc,adj;
5206   cc=get_reg(branch_regs[i].regmap,CCREG);
5207   assert(cc==HOST_CCREG);
5208   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5209   #ifdef REG_PREFETCH
5210   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5211   #endif
5212   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5213   if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5214   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5215   if(internal_branch(branch_regs[i].is32,ba[i]))
5216     assem_debug("branch: internal\n");
5217   else
5218     assem_debug("branch: external\n");
5219   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
5220     ds_assemble_entry(i);
5221   }
5222   else {
5223     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
5224     emit_jmp(0);
5225   }
5226 }
5227
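// Write the JALR return address into its destination register. The asserts
// verify that the delay slot instruction does not also write that register.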
5228 static void rjump_assemble_write_ra(int i)
5229 {
5230   int rt,return_address;
5231   assert(rt1[i+1]!=rt1[i]);
5232   assert(rt2[i+1]!=rt1[i]);
5233   rt=get_reg(branch_regs[i].regmap,rt1[i]);
5234   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5235   assert(rt>=0);
5236   return_address=start+i*4+8;
5237   #ifdef REG_PREFETCH
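  // note: as in ujump_assemble_write_ra, temp and i_regmap are not declared
  // here; REG_PREFETCH builds would need them provided.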
5238   if(temp>=0) 
5239   {
5240     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5241   }
5242   #endif
5243   emit_movimm(return_address,rt); // PC into link register
5244   #ifdef IMM_PREFETCH
5245   emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5246   #endif
5247 }
5248
5249 void rjump_assemble(int i,struct regstat *i_regs)
5250 {
5251   signed char *i_regmap=i_regs->regmap;
5252   int temp;
5253   int rs,cc,adj;
5254   int ra_done=0;
5255   rs=get_reg(branch_regs[i].regmap,rs1[i]);
5256   assert(rs>=0);
5257   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5258     // Delay slot abuse, make a copy of the branch address register
5259     temp=get_reg(branch_regs[i].regmap,RTEMP);
5260     assert(temp>=0);
5261     assert(regs[i].regmap[temp]==RTEMP);
5262     emit_mov(rs,temp);
5263     rs=temp;
5264   }
5265   address_generation(i+1,i_regs,regs[i].regmap_entry);
5266   #ifdef REG_PREFETCH
5267   if(rt1[i]==31) 
5268   {
5269     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
5270       int return_address=start+i*4+8;
5271       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5272     }
5273   }
5274   #endif
5275   #ifdef USE_MINI_HT
5276   if(rs1[i]==31) {
5277     int rh=get_reg(regs[i].regmap,RHASH);
5278     if(rh>=0) do_preload_rhash(rh);
5279   }
5280   #endif
5281   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5282     rjump_assemble_write_ra(i);
5283     ra_done=1;
5284   }
5285   ds_assemble(i+1,i_regs);
5286   uint64_t bc_unneeded=branch_regs[i].u;
5287   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5288   bc_unneeded|=1|(1LL<<rt1[i]);
5289   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5290   bc_unneeded&=~(1LL<<rs1[i]);
5291   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5292                 bc_unneeded,bc_unneeded_upper);
5293   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
5294   if(!ra_done&&rt1[i]!=0)
5295     rjump_assemble_write_ra(i);
5296   cc=get_reg(branch_regs[i].regmap,CCREG);
5297   assert(cc==HOST_CCREG);
5298   #ifdef USE_MINI_HT
5299   int rh=get_reg(branch_regs[i].regmap,RHASH);
5300   int ht=get_reg(branch_regs[i].regmap,RHTBL);
5301   if(rs1[i]==31) {
5302     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
5303     do_preload_rhtbl(ht);
5304     do_rhash(rs,rh);
5305   }
5306   #endif
5307   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5308   #ifdef DESTRUCTIVE_WRITEBACK
5309   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
5310     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
5311       emit_loadreg(rs1[i],rs);
5312     }
5313   }
5314   #endif
5315   #ifdef REG_PREFETCH
5316   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5317   #endif
5318   #ifdef USE_MINI_HT
5319   if(rs1[i]==31) {
5320     do_miniht_load(ht,rh);
5321   }
5322   #endif
5323   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
5324   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
5325   //assert(adj==0);
5326   emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5327   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
5328   emit_jns(0);
5329   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5330   #ifdef USE_MINI_HT
5331   if(rs1[i]==31) {
5332     do_miniht_jump(rs,rh,ht);
5333   }
5334   else
5335   #endif
5336   {
5337     //if(rs!=EAX) emit_mov(rs,EAX);
5338     //emit_jmp((int)jump_vaddr_eax);
5339     emit_jmp(jump_vaddr_reg[rs]);
5340   }
5341   /* Check hash table
5342   temp=!rs;
5343   emit_mov(rs,temp);
5344   emit_shrimm(rs,16,rs);
5345   emit_xor(temp,rs,rs);
5346   emit_movzwl_reg(rs,rs);
5347   emit_shlimm(rs,4,rs);
5348   emit_cmpmem_indexed((int)hash_table,rs,temp);
5349   emit_jne((int)out+14);
5350   emit_readword_indexed((int)hash_table+4,rs,rs);
5351   emit_jmpreg(rs);
5352   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
5353   emit_addimm_no_flags(8,rs);
5354   emit_jeq((int)out-17);
5355   // No hit on hash table, call compiler
5356   emit_pushreg(temp);
5357 //DEBUG >
5358 #ifdef DEBUG_CYCLE_COUNT
5359   emit_readword((int)&last_count,ECX);
5360   emit_add(HOST_CCREG,ECX,HOST_CCREG);
5361   emit_readword((int)&next_interupt,ECX);
5362   emit_writeword(HOST_CCREG,(int)&Count);
5363   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
5364   emit_writeword(ECX,(int)&last_count);
5365 #endif
5366 //DEBUG <
5367   emit_storereg(CCREG,HOST_CCREG);
5368   emit_call((int)get_addr);
5369   emit_loadreg(CCREG,HOST_CCREG);
5370   emit_addimm(ESP,4,ESP);
5371   emit_jmpreg(EAX);*/
5372   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5373   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5374   #endif
5375 }
5376
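// Assemble BEQ/BNE/BLEZ/BGTZ (and their "likely" forms). Two schedules are
// used: out-of-order (ooo) assembles the delay slot first and compares using
// the post-delay-slot register mapping, while in-order does the compare
// first and assembles the delay slot separately on each path.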
5377 void cjump_assemble(int i,struct regstat *i_regs)
5378 {
5379   signed char *i_regmap=i_regs->regmap;
5380   int cc;
5381   int match;
5382   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5383   assem_debug("match=%d\n",match);
5384   int s1h,s1l,s2h,s2l;
5385   int prev_cop1_usable=cop1_usable;
5386   int unconditional=0,nop=0;
5387   int only32=0;
5388   int invert=0;
5389   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5390   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5391   if(!match) invert=1;
5392   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5393   if(i>(ba[i]-start)>>2) invert=1;
5394   #endif
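  // invert=1 means the branch-taken code is emitted inline (so registers can
  // be rewritten to match the target's expected mapping) and the condition is
  // inverted to jump around it on the not-taken path.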
5395   
5396   if(ooo[i]) {
5397     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5398     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5399     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5400     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5401   }
5402   else {
5403     s1l=get_reg(i_regmap,rs1[i]);
5404     s1h=get_reg(i_regmap,rs1[i]|64);
5405     s2l=get_reg(i_regmap,rs2[i]);
5406     s2h=get_reg(i_regmap,rs2[i]|64);
5407   }
5408   if(rs1[i]==0&&rs2[i]==0)
5409   {
5410     if(opcode[i]&1) nop=1;
5411     else unconditional=1;
5412     //assert(opcode[i]!=5);
5413     //assert(opcode[i]!=7);
5414     //assert(opcode[i]!=0x15);
5415     //assert(opcode[i]!=0x17);
5416   }
5417   else if(rs1[i]==0)
5418   {
5419     s1l=s2l;s1h=s2h;
5420     s2l=s2h=-1;
5421     only32=(regs[i].was32>>rs2[i])&1;
5422   }
5423   else if(rs2[i]==0)
5424   {
5425     s2l=s2h=-1;
5426     only32=(regs[i].was32>>rs1[i])&1;
5427   }
5428   else {
5429     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5430   }
5431
5432   if(ooo[i]) {
5433     // Out of order execution (delay slot first)
5434     //printf("OOOE\n");
5435     address_generation(i+1,i_regs,regs[i].regmap_entry);
5436     ds_assemble(i+1,i_regs);
5437     int adj;
5438     uint64_t bc_unneeded=branch_regs[i].u;
5439     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5440     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5441     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5442     bc_unneeded|=1;
5443     bc_unneeded_upper|=1;
5444     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5445                   bc_unneeded,bc_unneeded_upper);
5446     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5447     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5448     cc=get_reg(branch_regs[i].regmap,CCREG);
5449     assert(cc==HOST_CCREG);
5450     if(unconditional) 
5451       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5452     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5453     //assem_debug("cycle count (adj)\n");
5454     if(unconditional) {
5455       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5456       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5457         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5458         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5459         if(internal)
5460           assem_debug("branch: internal\n");
5461         else
5462           assem_debug("branch: external\n");
5463         if(internal&&is_ds[(ba[i]-start)>>2]) {
5464           ds_assemble_entry(i);
5465         }
5466         else {
5467           add_to_linker((int)out,ba[i],internal);
5468           emit_jmp(0);
5469         }
5470         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5471         if(((u_int)out)&7) emit_addnop(0);
5472         #endif
5473       }
5474     }
5475     else if(nop) {
5476       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5477       int jaddr=(int)out;
5478       emit_jns(0);
5479       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5480     }
5481     else {
5482       int taken=0,nottaken=0,nottaken1=0;
5483       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5484       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5485       if(!only32)
5486       {
5487         assert(s1h>=0);
5488         if(opcode[i]==4) // BEQ
5489         {
5490           if(s2h>=0) emit_cmp(s1h,s2h);
5491           else emit_test(s1h,s1h);
5492           nottaken1=(int)out;
5493           emit_jne(1);
5494         }
5495         if(opcode[i]==5) // BNE
5496         {
5497           if(s2h>=0) emit_cmp(s1h,s2h);
5498           else emit_test(s1h,s1h);
5499           if(invert) taken=(int)out;
5500           else add_to_linker((int)out,ba[i],internal);
5501           emit_jne(0);
5502         }
5503         if(opcode[i]==6) // BLEZ
5504         {
5505           emit_test(s1h,s1h);
5506           if(invert) taken=(int)out;
5507           else add_to_linker((int)out,ba[i],internal);
5508           emit_js(0);
5509           nottaken1=(int)out;
5510           emit_jne(1);
5511         }
5512         if(opcode[i]==7) // BGTZ
5513         {
5514           emit_test(s1h,s1h);
5515           nottaken1=(int)out;
5516           emit_js(1);
5517           if(invert) taken=(int)out;
5518           else add_to_linker((int)out,ba[i],internal);
5519           emit_jne(0);
5520         }
5521       } // if(!only32)
5522           
5523       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5524       assert(s1l>=0);
5525       if(opcode[i]==4) // BEQ
5526       {
5527         if(s2l>=0) emit_cmp(s1l,s2l);
5528         else emit_test(s1l,s1l);
5529         if(invert){
5530           nottaken=(int)out;
5531           emit_jne(1);
5532         }else{
5533           add_to_linker((int)out,ba[i],internal);
5534           emit_jeq(0);
5535         }
5536       }
5537       if(opcode[i]==5) // BNE
5538       {
5539         if(s2l>=0) emit_cmp(s1l,s2l);
5540         else emit_test(s1l,s1l);
5541         if(invert){
5542           nottaken=(int)out;
5543           emit_jeq(1);
5544         }else{
5545           add_to_linker((int)out,ba[i],internal);
5546           emit_jne(0);
5547         }
5548       }
5549       if(opcode[i]==6) // BLEZ
5550       {
5551         emit_cmpimm(s1l,1);
5552         if(invert){
5553           nottaken=(int)out;
5554           emit_jge(1);
5555         }else{
5556           add_to_linker((int)out,ba[i],internal);
5557           emit_jl(0);
5558         }
5559       }
5560       if(opcode[i]==7) // BGTZ
5561       {
5562         emit_cmpimm(s1l,1);
5563         if(invert){
5564           nottaken=(int)out;
5565           emit_jl(1);
5566         }else{
5567           add_to_linker((int)out,ba[i],internal);
5568           emit_jge(0);
5569         }
5570       }
5571       if(invert) {
5572         if(taken) set_jump_target(taken,(int)out);
5573         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5574         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5575           if(adj) {
5576             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5577             add_to_linker((int)out,ba[i],internal);
5578           }else{
5579             emit_addnop(13);
5580             add_to_linker((int)out,ba[i],internal*2);
5581           }
5582           emit_jmp(0);
5583         }else
5584         #endif
5585         {
5586           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5587           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5588           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5589           if(internal)
5590             assem_debug("branch: internal\n");
5591           else
5592             assem_debug("branch: external\n");
5593           if(internal&&is_ds[(ba[i]-start)>>2]) {
5594             ds_assemble_entry(i);
5595           }
5596           else {
5597             add_to_linker((int)out,ba[i],internal);
5598             emit_jmp(0);
5599           }
5600         }
5601         set_jump_target(nottaken,(int)out);
5602       }
5603
5604       if(nottaken1) set_jump_target(nottaken1,(int)out);
5605       if(adj) {
5606         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5607       }
5608     } // (!unconditional)
5609   } // if(ooo)
5610   else
5611   {
5612     // In-order execution (branch first)
5613     //if(likely[i]) printf("IOL\n");
5614     //else
5615     //printf("IOE\n");
5616     int taken=0,nottaken=0,nottaken1=0;
5617     if(!unconditional&&!nop) {
5618       if(!only32)
5619       {
5620         assert(s1h>=0);
5621         if((opcode[i]&0x2f)==4) // BEQ
5622         {
5623           if(s2h>=0) emit_cmp(s1h,s2h);
5624           else emit_test(s1h,s1h);
5625           nottaken1=(int)out;
5626           emit_jne(2);
5627         }
5628         if((opcode[i]&0x2f)==5) // BNE
5629         {
5630           if(s2h>=0) emit_cmp(s1h,s2h);
5631           else emit_test(s1h,s1h);
5632           taken=(int)out;
5633           emit_jne(1);
5634         }
5635         if((opcode[i]&0x2f)==6) // BLEZ
5636         {
5637           emit_test(s1h,s1h);
5638           taken=(int)out;
5639           emit_js(1);
5640           nottaken1=(int)out;
5641           emit_jne(2);
5642         }
5643         if((opcode[i]&0x2f)==7) // BGTZ
5644         {
5645           emit_test(s1h,s1h);
5646           nottaken1=(int)out;
5647           emit_js(2);
5648           taken=(int)out;
5649           emit_jne(1);
5650         }
5651       } // if(!only32)
5652           
5653       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5654       assert(s1l>=0);
5655       if((opcode[i]&0x2f)==4) // BEQ
5656       {
5657         if(s2l>=0) emit_cmp(s1l,s2l);
5658         else emit_test(s1l,s1l);
5659         nottaken=(int)out;
5660         emit_jne(2);
5661       }
5662       if((opcode[i]&0x2f)==5) // BNE
5663       {
5664         if(s2l>=0) emit_cmp(s1l,s2l);
5665         else emit_test(s1l,s1l);
5666         nottaken=(int)out;
5667         emit_jeq(2);
5668       }
5669       if((opcode[i]&0x2f)==6) // BLEZ
5670       {
5671         emit_cmpimm(s1l,1);
5672         nottaken=(int)out;
5673         emit_jge(2);
5674       }
5675       if((opcode[i]&0x2f)==7) // BGTZ
5676       {
5677         emit_cmpimm(s1l,1);
5678         nottaken=(int)out;
5679         emit_jl(2);
5680       }
5681     } // if(!unconditional)
5682     int adj;
5683     uint64_t ds_unneeded=branch_regs[i].u;
5684     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5685     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5686     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5687     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5688     ds_unneeded|=1;
5689     ds_unneeded_upper|=1;
5690     // branch taken
5691     if(!nop) {
5692       if(taken) set_jump_target(taken,(int)out);
5693       assem_debug("1:\n");
5694       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5695                     ds_unneeded,ds_unneeded_upper);
5696       // load regs
5697       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5698       address_generation(i+1,&branch_regs[i],0);
5699       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5700       ds_assemble(i+1,&branch_regs[i]);
5701       cc=get_reg(branch_regs[i].regmap,CCREG);
5702       if(cc==-1) {
5703         emit_loadreg(CCREG,cc=HOST_CCREG);
5704         // CHECK: Is the following instruction (fall thru) allocated ok?
5705       }
5706       assert(cc==HOST_CCREG);
5707       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5708       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5709       assem_debug("cycle count (adj)\n");
5710       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5711       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5712       if(internal)
5713         assem_debug("branch: internal\n");
5714       else
5715         assem_debug("branch: external\n");
5716       if(internal&&is_ds[(ba[i]-start)>>2]) {
5717         ds_assemble_entry(i);
5718       }
5719       else {
5720         add_to_linker((int)out,ba[i],internal);
5721         emit_jmp(0);
5722       }
5723     }
5724     // branch not taken
5725     cop1_usable=prev_cop1_usable;
5726     if(!unconditional) {
5727       if(nottaken1) set_jump_target(nottaken1,(int)out);
5728       set_jump_target(nottaken,(int)out);
5729       assem_debug("2:\n");
5730       if(!likely[i]) {
5731         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5732                       ds_unneeded,ds_unneeded_upper);
5733         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5734         address_generation(i+1,&branch_regs[i],0);
5735         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5736         ds_assemble(i+1,&branch_regs[i]);
5737       }
5738       cc=get_reg(branch_regs[i].regmap,CCREG);
5739       if(cc==-1&&!likely[i]) {
5740         // Cycle count isn't in a register, temporarily load it then write it out
5741         emit_loadreg(CCREG,HOST_CCREG);
5742         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5743         int jaddr=(int)out;
5744         emit_jns(0);
5745         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5746         emit_storereg(CCREG,HOST_CCREG);
5747       }
5748       else{
5749         cc=get_reg(i_regmap,CCREG);
5750         assert(cc==HOST_CCREG);
5751         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5752         int jaddr=(int)out;
5753         emit_jns(0);
5754         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5755       }
5756     }
5757   }
5758 }
5759
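// Assemble the REGIMM branches (BLTZ/BGEZ and their AL/likely variants).
// With rs1==$zero, BGEZ-type branches are unconditional and BLTZ-type ones
// are never taken; the linking forms still write the return address.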
5760 void sjump_assemble(int i,struct regstat *i_regs)
5761 {
5762   signed char *i_regmap=i_regs->regmap;
5763   int cc;
5764   int match;
5765   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5766   assem_debug("smatch=%d\n",match);
5767   int s1h,s1l;
5768   int prev_cop1_usable=cop1_usable;
5769   int unconditional=0,nevertaken=0;
5770   int only32=0;
5771   int invert=0;
5772   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5773   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5774   if(!match) invert=1;
5775   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5776   if(i>(ba[i]-start)>>2) invert=1;
5777   #endif
5778
5779   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5780   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5781
5782   if(ooo[i]) {
5783     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5784     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5785   }
5786   else {
5787     s1l=get_reg(i_regmap,rs1[i]);
5788     s1h=get_reg(i_regmap,rs1[i]|64);
5789   }
5790   if(rs1[i]==0)
5791   {
5792     if(opcode2[i]&1) unconditional=1;
5793     else nevertaken=1;
5794     // These are never taken (r0 is never less than zero)
5795     //assert(opcode2[i]!=0);
5796     //assert(opcode2[i]!=2);
5797     //assert(opcode2[i]!=0x10);
5798     //assert(opcode2[i]!=0x12);
5799   }
5800   else {
5801     only32=(regs[i].was32>>rs1[i])&1;
5802   }
5803
5804   if(ooo[i]) {
5805     // Out of order execution (delay slot first)
5806     //printf("OOOE\n");
5807     address_generation(i+1,i_regs,regs[i].regmap_entry);
5808     ds_assemble(i+1,i_regs);
5809     int adj;
5810     uint64_t bc_unneeded=branch_regs[i].u;
5811     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5812     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5813     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5814     bc_unneeded|=1;
5815     bc_unneeded_upper|=1;
5816     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5817                   bc_unneeded,bc_unneeded_upper);
5818     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5819     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5820     if(rt1[i]==31) {
5821       int rt,return_address;
5822       rt=get_reg(branch_regs[i].regmap,31);
5823       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5824       if(rt>=0) {
5825         // Save the PC even if the branch is not taken
5826         return_address=start+i*4+8;
5827         emit_movimm(return_address,rt); // PC into link register
5828         #ifdef IMM_PREFETCH
5829         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5830         #endif
5831       }
5832     }
5833     cc=get_reg(branch_regs[i].regmap,CCREG);
5834     assert(cc==HOST_CCREG);
5835     if(unconditional) 
5836       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5837     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5838     assem_debug("cycle count (adj)\n");
5839     if(unconditional) {
5840       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5841       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5842         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5843         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5844         if(internal)
5845           assem_debug("branch: internal\n");
5846         else
5847           assem_debug("branch: external\n");
5848         if(internal&&is_ds[(ba[i]-start)>>2]) {
5849           ds_assemble_entry(i);
5850         }
5851         else {
5852           add_to_linker((int)out,ba[i],internal);
5853           emit_jmp(0);
5854         }
5855         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5856         if(((u_int)out)&7) emit_addnop(0);
5857         #endif
5858       }
5859     }
5860     else if(nevertaken) {
5861       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5862       int jaddr=(int)out;
5863       emit_jns(0);
5864       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5865     }
5866     else {
5867       int nottaken=0;
5868       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5869       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5870       if(!only32)
5871       {
5872         assert(s1h>=0);
5873         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5874         {
5875           emit_test(s1h,s1h);
5876           if(invert){
5877             nottaken=(int)out;
5878             emit_jns(1);
5879           }else{
5880             add_to_linker((int)out,ba[i],internal);
5881             emit_js(0);
5882           }
5883         }
5884         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5885         {
5886           emit_test(s1h,s1h);
5887           if(invert){
5888             nottaken=(int)out;
5889             emit_js(1);
5890           }else{
5891             add_to_linker((int)out,ba[i],internal);
5892             emit_jns(0);
5893           }
5894         }
5895       } // if(!only32)
5896       else
5897       {
5898         assert(s1l>=0);
5899         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5900         {
5901           emit_test(s1l,s1l);
5902           if(invert){
5903             nottaken=(int)out;
5904             emit_jns(1);
5905           }else{
5906             add_to_linker((int)out,ba[i],internal);
5907             emit_js(0);
5908           }
5909         }
5910         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5911         {
5912           emit_test(s1l,s1l);
5913           if(invert){
5914             nottaken=(int)out;
5915             emit_js(1);
5916           }else{
5917             add_to_linker((int)out,ba[i],internal);
5918             emit_jns(0);
5919           }
5920         }
5921       } // if(!only32)
5922           
5923       if(invert) {
5924         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5925         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5926           if(adj) {
5927             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5928             add_to_linker((int)out,ba[i],internal);
5929           }else{
5930             emit_addnop(13);
5931             add_to_linker((int)out,ba[i],internal*2);
5932           }
5933           emit_jmp(0);
5934         }else
5935         #endif
5936         {
5937           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5938           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5939           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5940           if(internal)
5941             assem_debug("branch: internal\n");
5942           else
5943             assem_debug("branch: external\n");
5944           if(internal&&is_ds[(ba[i]-start)>>2]) {
5945             ds_assemble_entry(i);
5946           }
5947           else {
5948             add_to_linker((int)out,ba[i],internal);
5949             emit_jmp(0);
5950           }
5951         }
5952         set_jump_target(nottaken,(int)out);
5953       }
5954
5955       if(adj) {
5956         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5957       }
5958     } // (!unconditional)
5959   } // if(ooo)
5960   else
5961   {
5962     // In-order execution (branch first)
5963     //printf("IOE\n");
5964     int nottaken=0;
5965     if(rt1[i]==31) {
5966       int rt,return_address;
5967       rt=get_reg(branch_regs[i].regmap,31);
5968       if(rt>=0) {
5969         // Save the PC even if the branch is not taken
5970         return_address=start+i*4+8;
5971         emit_movimm(return_address,rt); // PC into link register
5972         #ifdef IMM_PREFETCH
5973         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5974         #endif
5975       }
5976     }
5977     if(!unconditional) {
5978       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5979       if(!only32)
5980       {
5981         assert(s1h>=0);
5982         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5983         {
5984           emit_test(s1h,s1h);
5985           nottaken=(int)out;
5986           emit_jns(1);
5987         }
5988         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5989         {
5990           emit_test(s1h,s1h);
5991           nottaken=(int)out;
5992           emit_js(1);
5993         }
5994       } // if(!only32)
5995       else
5996       {
5997         assert(s1l>=0);
5998         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5999         {
6000           emit_test(s1l,s1l);
6001           nottaken=(int)out;
6002           emit_jns(1);
6003         }
6004         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6005         {
6006           emit_test(s1l,s1l);
6007           nottaken=(int)out;
6008           emit_js(1);
6009         }
6010       }
6011     } // if(!unconditional)
6012     int adj;
6013     uint64_t ds_unneeded=branch_regs[i].u;
6014     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6015     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6016     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6017     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6018     ds_unneeded|=1;
6019     ds_unneeded_upper|=1;
6020     // branch taken
6021     if(!nevertaken) {
6022       //assem_debug("1:\n");
6023       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6024                     ds_unneeded,ds_unneeded_upper);
6025       // load regs
6026       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6027       address_generation(i+1,&branch_regs[i],0);
6028       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6029       ds_assemble(i+1,&branch_regs[i]);
6030       cc=get_reg(branch_regs[i].regmap,CCREG);
6031       if(cc==-1) {
6032         emit_loadreg(CCREG,cc=HOST_CCREG);
6033         // CHECK: Is the following instruction (fall thru) allocated ok?
6034       }
6035       assert(cc==HOST_CCREG);
6036       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6037       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6038       assem_debug("cycle count (adj)\n");
6039       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6040       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6041       if(internal)
6042         assem_debug("branch: internal\n");
6043       else
6044         assem_debug("branch: external\n");
6045       if(internal&&is_ds[(ba[i]-start)>>2]) {
6046         ds_assemble_entry(i);
6047       }
6048       else {
6049         add_to_linker((int)out,ba[i],internal);
6050         emit_jmp(0);
6051       }
6052     }
6053     // branch not taken
6054     cop1_usable=prev_cop1_usable;
6055     if(!unconditional) {
6056       set_jump_target(nottaken,(int)out);
6057       assem_debug("1:\n");
6058       if(!likely[i]) {
6059         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6060                       ds_unneeded,ds_unneeded_upper);
6061         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6062         address_generation(i+1,&branch_regs[i],0);
6063         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6064         ds_assemble(i+1,&branch_regs[i]);
6065       }
6066       cc=get_reg(branch_regs[i].regmap,CCREG);
6067       if(cc==-1&&!likely[i]) {
6068         // Cycle count isn't in a register, temporarily load it then write it out
6069         emit_loadreg(CCREG,HOST_CCREG);
6070         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6071         int jaddr=(int)out;
6072         emit_jns(0);
6073         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6074         emit_storereg(CCREG,HOST_CCREG);
6075       }
6076       else{
6077         cc=get_reg(i_regmap,CCREG);
6078         assert(cc==HOST_CCREG);
6079         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
6080         int jaddr=(int)out;
6081         emit_jns(0);
6082         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6083       }
6084     }
6085   }
6086 }
6087
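// Assemble the COP1 condition branches (BC1F/BC1T and their likely forms).
// The condition is bit 23 of FCR31, tested through the FSREG mapping; a
// cop1-unusable check stub is emitted first if the block has not done one.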
6088 void fjump_assemble(int i,struct regstat *i_regs)
6089 {
6090   signed char *i_regmap=i_regs->regmap;
6091   int cc;
6092   int match;
6093   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6094   assem_debug("fmatch=%d\n",match);
6095   int fs,cs;
6096   int eaddr;
6097   int invert=0;
6098   int internal=internal_branch(branch_regs[i].is32,ba[i]);
6099   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
6100   if(!match) invert=1;
6101   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6102   if(i>(ba[i]-start)>>2) invert=1;
6103   #endif
6104
6105   if(ooo[i]) {
6106     fs=get_reg(branch_regs[i].regmap,FSREG);
6107     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
6108   }
6109   else {
6110     fs=get_reg(i_regmap,FSREG);
6111   }
6112
6113   // Check cop1 unusable
6114   if(!cop1_usable) {
6115     cs=get_reg(i_regmap,CSREG);
6116     assert(cs>=0);
6117     emit_testimm(cs,0x20000000);
6118     eaddr=(int)out;
6119     emit_jeq(0);
6120     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
6121     cop1_usable=1;
6122   }
6123
6124   if(ooo[i]) {
6125     // Out of order execution (delay slot first)
6126     //printf("OOOE\n");
6127     ds_assemble(i+1,i_regs);
6128     int adj;
6129     uint64_t bc_unneeded=branch_regs[i].u;
6130     uint64_t bc_unneeded_upper=branch_regs[i].uu;
6131     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6132     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
6133     bc_unneeded|=1;
6134     bc_unneeded_upper|=1;
6135     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6136                   bc_unneeded,bc_unneeded_upper);
6137     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
6138     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6139     cc=get_reg(branch_regs[i].regmap,CCREG);
6140     assert(cc==HOST_CCREG);
6141     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
6142     assem_debug("cycle count (adj)\n");
6143     if(1) {
6144       int nottaken=0;
6145       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6146       if(1) {
6147         assert(fs>=0);
6148         emit_testimm(fs,0x800000);
6149         if(source[i]&0x10000) // BC1T
6150         {
6151           if(invert){
6152             nottaken=(int)out;
6153             emit_jeq(1);
6154           }else{
6155             add_to_linker((int)out,ba[i],internal);
6156             emit_jne(0);
6157           }
6158         }
6159         else // BC1F
6160         {
6161           if(invert){
6162             nottaken=(int)out;
6163             emit_jne(1);
6164           }else{
6165             add_to_linker((int)out,ba[i],internal);
6166             emit_jeq(0);
6167           }
6168         }
6169       }
6170           
6171       if(invert) {
6172         if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
6173         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6174         else if(match) emit_addnop(13);
6175         #endif
6176         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6177         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6178         if(internal)
6179           assem_debug("branch: internal\n");
6180         else
6181           assem_debug("branch: external\n");
6182         if(internal&&is_ds[(ba[i]-start)>>2]) {
6183           ds_assemble_entry(i);
6184         }
6185         else {
6186           add_to_linker((int)out,ba[i],internal);
6187           emit_jmp(0);
6188         }
6189         set_jump_target(nottaken,(int)out);
6190       }
6191
6192       if(adj) {
6193         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
6194       }
6195     } // (!unconditional)
6196   } // if(ooo)
6197   else
6198   {
6199     // In-order execution (branch first)
6200     //printf("IOE\n");
6201     int nottaken=0;
6202     if(1) {
6203       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6204       if(1) {
6205         assert(fs>=0);
6206         emit_testimm(fs,0x800000);
6207         if(source[i]&0x10000) // BC1T
6208         {
6209           nottaken=(int)out;
6210           emit_jeq(1);
6211         }
6212         else // BC1F
6213         {
6214           nottaken=(int)out;
6215           emit_jne(1);
6216         }
6217       }
6218     } // if(!unconditional)
6219     int adj;
6220     uint64_t ds_unneeded=branch_regs[i].u;
6221     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6222     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6223     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6224     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6225     ds_unneeded|=1;
6226     ds_unneeded_upper|=1;
6227     // branch taken
6228     //assem_debug("1:\n");
6229     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6230                   ds_unneeded,ds_unneeded_upper);
6231     // load regs
6232     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6233     address_generation(i+1,&branch_regs[i],0);
6234     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6235     ds_assemble(i+1,&branch_regs[i]);
6236     cc=get_reg(branch_regs[i].regmap,CCREG);
6237     if(cc==-1) {
6238       emit_loadreg(CCREG,cc=HOST_CCREG);
6239       // CHECK: Is the following instruction (fall thru) allocated ok?
6240     }
6241     assert(cc==HOST_CCREG);
6242     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6243     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6244     assem_debug("cycle count (adj)\n");
6245     if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6246     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6247     if(internal)
6248       assem_debug("branch: internal\n");
6249     else
6250       assem_debug("branch: external\n");
6251     if(internal&&is_ds[(ba[i]-start)>>2]) {
6252       ds_assemble_entry(i);
6253     }
6254     else {
6255       add_to_linker((int)out,ba[i],internal);
6256       emit_jmp(0);
6257     }
6258
6259     // branch not taken
6260     if(1) { // <- FIXME (don't need this)
6261       set_jump_target(nottaken,(int)out);
6262       assem_debug("1:\n");
6263       if(!likely[i]) {
6264         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6265                       ds_unneeded,ds_unneeded_upper);
6266         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6267         address_generation(i+1,&branch_regs[i],0);
6268         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6269         ds_assemble(i+1,&branch_regs[i]);
6270       }
6271       cc=get_reg(branch_regs[i].regmap,CCREG);
6272       if(cc==-1&&!likely[i]) {
6273         // Cycle count isn't in a register, temporarily load it then write it out
6274         emit_loadreg(CCREG,HOST_CCREG);
6275         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6276         int jaddr=(int)out;
6277         emit_jns(0);
6278         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6279         emit_storereg(CCREG,HOST_CCREG);
6280       }
6281       else{
6282         cc=get_reg(i_regmap,CCREG);
6283         assert(cc==HOST_CCREG);
6284         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
6285         int jaddr=(int)out;
6286         emit_jns(0);
6287         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6288       }
6289     }
6290   }
6291 }
6292
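// Assemble a branch whose delay slot falls on the next page. Instead of
// compiling the delay slot here, the chosen target is left in HOST_BTREG and
// control transfers to the page-spanning delay-slot entry of the following
// block (see pagespan_ds), which runs the slot and then dispatches on the
// stored branch target.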
6293 static void pagespan_assemble(int i,struct regstat *i_regs)
6294 {
6295   int s1l=get_reg(i_regs->regmap,rs1[i]);
6296   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
6297   int s2l=get_reg(i_regs->regmap,rs2[i]);
6298   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
6299   void *nt_branch=NULL;
6300   int taken=0;
6301   int nottaken=0;
6302   int unconditional=0;
6303   if(rs1[i]==0)
6304   {
6305     s1l=s2l;s1h=s2h;
6306     s2l=s2h=-1;
6307   }
6308   else if(rs2[i]==0)
6309   {
6310     s2l=s2h=-1;
6311   }
6312   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
6313     s1h=s2h=-1;
6314   }
6315   int hr=0;
6316   int addr,alt,ntaddr;
6317   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
6318   else {
6319     while(hr<HOST_REGS)
6320     {
6321       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
6322          (i_regs->regmap[hr]&63)!=rs1[i] &&
6323          (i_regs->regmap[hr]&63)!=rs2[i] )
6324       {
6325         addr=hr++;break;
6326       }
6327       hr++;
6328     }
6329   }
6330   while(hr<HOST_REGS)
6331   {
6332     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6333        (i_regs->regmap[hr]&63)!=rs1[i] &&
6334        (i_regs->regmap[hr]&63)!=rs2[i] )
6335     {
6336       alt=hr++;break;
6337     }
6338     hr++;
6339   }
6340   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
6341   {
6342     while(hr<HOST_REGS)
6343     {
6344       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6345          (i_regs->regmap[hr]&63)!=rs1[i] &&
6346          (i_regs->regmap[hr]&63)!=rs2[i] )
6347       {
6348         ntaddr=hr;break;
6349       }
6350       hr++;
6351     }
6352   }
6353   assert(hr<HOST_REGS);
6354   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6355     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6356   }
6357   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6358   if(opcode[i]==2) // J
6359   {
6360     unconditional=1;
6361   }
6362   if(opcode[i]==3) // JAL
6363   {
6364     // TODO: mini_ht
6365     int rt=get_reg(i_regs->regmap,31);
6366     emit_movimm(start+i*4+8,rt);
6367     unconditional=1;
6368   }
6369   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6370   {
6371     emit_mov(s1l,addr);
6372     if(opcode2[i]==9) // JALR
6373     {
6374       int rt=get_reg(i_regs->regmap,rt1[i]);
6375       emit_movimm(start+i*4+8,rt);
6376     }
6377   }
6378   if((opcode[i]&0x3f)==4) // BEQ
6379   {
6380     if(rs1[i]==rs2[i])
6381     {
6382       unconditional=1;
6383     }
6384     else
6385     #ifdef HAVE_CMOV_IMM
6386     if(s1h<0) {
6387       if(s2l>=0) emit_cmp(s1l,s2l);
6388       else emit_test(s1l,s1l);
6389       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6390     }
6391     else
6392     #endif
6393     {
6394       assert(s1l>=0);
6395       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6396       if(s1h>=0) {
6397         if(s2h>=0) emit_cmp(s1h,s2h);
6398         else emit_test(s1h,s1h);
6399         emit_cmovne_reg(alt,addr);
6400       }
6401       if(s2l>=0) emit_cmp(s1l,s2l);
6402       else emit_test(s1l,s1l);
6403       emit_cmovne_reg(alt,addr);
6404     }
6405   }
6406   if((opcode[i]&0x3f)==5) // BNE
6407   {
6408     #ifdef HAVE_CMOV_IMM
6409     if(s1h<0) {
6410       if(s2l>=0) emit_cmp(s1l,s2l);
6411       else emit_test(s1l,s1l);
6412       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6413     }
6414     else
6415     #endif
6416     {
6417       assert(s1l>=0);
6418       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6419       if(s1h>=0) {
6420         if(s2h>=0) emit_cmp(s1h,s2h);
6421         else emit_test(s1h,s1h);
6422         emit_cmovne_reg(alt,addr);
6423       }
6424       if(s2l>=0) emit_cmp(s1l,s2l);
6425       else emit_test(s1l,s1l);
6426       emit_cmovne_reg(alt,addr);
6427     }
6428   }
6429   if((opcode[i]&0x3f)==0x14) // BEQL
6430   {
6431     if(s1h>=0) {
6432       if(s2h>=0) emit_cmp(s1h,s2h);
6433       else emit_test(s1h,s1h);
6434       nottaken=(int)out;
6435       emit_jne(0);
6436     }
6437     if(s2l>=0) emit_cmp(s1l,s2l);
6438     else emit_test(s1l,s1l);
6439     if(nottaken) set_jump_target(nottaken,(int)out);
6440     nottaken=(int)out;
6441     emit_jne(0);
6442   }
6443   if((opcode[i]&0x3f)==0x15) // BNEL
6444   {
6445     if(s1h>=0) {
6446       if(s2h>=0) emit_cmp(s1h,s2h);
6447       else emit_test(s1h,s1h);
6448       taken=(int)out;
6449       emit_jne(0);
6450     }
6451     if(s2l>=0) emit_cmp(s1l,s2l);
6452     else emit_test(s1l,s1l);
6453     nottaken=(int)out;
6454     emit_jeq(0);
6455     if(taken) set_jump_target(taken,(int)out);
6456   }
6457   if((opcode[i]&0x3f)==6) // BLEZ
6458   {
6459     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6460     emit_cmpimm(s1l,1);
6461     if(s1h>=0) emit_mov(addr,ntaddr);
6462     emit_cmovl_reg(alt,addr);
6463     if(s1h>=0) {
6464       emit_test(s1h,s1h);
6465       emit_cmovne_reg(ntaddr,addr);
6466       emit_cmovs_reg(alt,addr);
6467     }
6468   }
6469   if((opcode[i]&0x3f)==7) // BGTZ
6470   {
6471     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6472     emit_cmpimm(s1l,1);
6473     if(s1h>=0) emit_mov(addr,alt);
6474     emit_cmovl_reg(ntaddr,addr);
6475     if(s1h>=0) {
6476       emit_test(s1h,s1h);
6477       emit_cmovne_reg(alt,addr);
6478       emit_cmovs_reg(ntaddr,addr);
6479     }
6480   }
6481   if((opcode[i]&0x3f)==0x16) // BLEZL
6482   {
6483     assert((opcode[i]&0x3f)!=0x16);
6484   }
6485   if((opcode[i]&0x3f)==0x17) // BGTZL
6486   {
6487     assert((opcode[i]&0x3f)!=0x17);
6488   }
6489   assert(opcode[i]!=1); // BLTZ/BGEZ
6490
6491   //FIXME: Check CSREG
6492   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6493     if((source[i]&0x30000)==0) // BC1F
6494     {
6495       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6496       emit_testimm(s1l,0x800000);
6497       emit_cmovne_reg(alt,addr);
6498     }
6499     if((source[i]&0x30000)==0x10000) // BC1T
6500     {
6501       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6502       emit_testimm(s1l,0x800000);
6503       emit_cmovne_reg(alt,addr);
6504     }
6505     if((source[i]&0x30000)==0x20000) // BC1FL
6506     {
6507       emit_testimm(s1l,0x800000);
6508       nottaken=(int)out;
6509       emit_jne(0);
6510     }
6511     if((source[i]&0x30000)==0x30000) // BC1TL
6512     {
6513       emit_testimm(s1l,0x800000);
6514       nottaken=(int)out;
6515       emit_jeq(0);
6516     }
6517   }
6518
6519   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6520   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6521   if(likely[i]||unconditional)
6522   {
6523     emit_movimm(ba[i],HOST_BTREG);
6524   }
6525   else if(addr!=HOST_BTREG)
6526   {
6527     emit_mov(addr,HOST_BTREG);
6528   }
6529   void *branch_addr=out;
6530   emit_jmp(0);
6531   int target_addr=start+i*4+5;
6532   void *stub=out;
6533   void *compiled_target_addr=check_addr(target_addr);
6534   emit_extjump_ds((int)branch_addr,target_addr);
6535   if(compiled_target_addr) {
6536     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6537     add_link(target_addr,stub);
6538   }
6539   else set_jump_target((int)branch_addr,(int)stub);
6540   if(likely[i]) {
6541     // Not-taken path
6542     set_jump_target((int)nottaken,(int)out);
6543     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6544     void *branch_addr=out;
6545     emit_jmp(0);
6546     int target_addr=start+i*4+8;
6547     void *stub=out;
6548     void *compiled_target_addr=check_addr(target_addr);
6549     emit_extjump_ds((int)branch_addr,target_addr);
6550     if(compiled_target_addr) {
6551       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6552       add_link(target_addr,stub);
6553     }
6554     else set_jump_target((int)branch_addr,(int)stub);
6555   }
6556 }
6557
6558 // Assemble the delay slot for the above
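// This becomes instruction 0 of the block that starts at the delay-slot
// address: it executes the slot, then compares the saved branch target
// (BTREG) against start+4 to choose between falling through and dispatching
// through jump_vaddr to an arbitrary target.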
6559 static void pagespan_ds()
6560 {
6561   assem_debug("initial delay slot:\n");
6562   u_int vaddr=start+1;
6563   u_int page=get_page(vaddr);
6564   u_int vpage=get_vpage(vaddr);
6565   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6566   do_dirty_stub_ds();
6567   ll_add(jump_in+page,vaddr,(void *)out);
6568   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6569   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6570     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6571   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6572     emit_writeword(HOST_BTREG,(int)&branch_target);
6573   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6574   address_generation(0,&regs[0],regs[0].regmap_entry);
6575   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6576     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6577   cop1_usable=0;
6578   is_delayslot=0;
6579   switch(itype[0]) {
6580     case ALU:
6581       alu_assemble(0,&regs[0]);break;
6582     case IMM16:
6583       imm16_assemble(0,&regs[0]);break;
6584     case SHIFT:
6585       shift_assemble(0,&regs[0]);break;
6586     case SHIFTIMM:
6587       shiftimm_assemble(0,&regs[0]);break;
6588     case LOAD:
6589       load_assemble(0,&regs[0]);break;
6590     case LOADLR:
6591       loadlr_assemble(0,&regs[0]);break;
6592     case STORE:
6593       store_assemble(0,&regs[0]);break;
6594     case STORELR:
6595       storelr_assemble(0,&regs[0]);break;
6596     case COP0:
6597       cop0_assemble(0,&regs[0]);break;
6598     case COP1:
6599       cop1_assemble(0,&regs[0]);break;
6600     case C1LS:
6601       c1ls_assemble(0,&regs[0]);break;
6602     case COP2:
6603       cop2_assemble(0,&regs[0]);break;
6604     case C2LS:
6605       c2ls_assemble(0,&regs[0]);break;
6606     case C2OP:
6607       c2op_assemble(0,&regs[0]);break;
6608     case FCONV:
6609       fconv_assemble(0,&regs[0]);break;
6610     case FLOAT:
6611       float_assemble(0,&regs[0]);break;
6612     case FCOMP:
6613       fcomp_assemble(0,&regs[0]);break;
6614     case MULTDIV:
6615       multdiv_assemble(0,&regs[0]);break;
6616     case MOV:
6617       mov_assemble(0,&regs[0]);break;
6618     case SYSCALL:
6619     case HLECALL:
6620     case INTCALL:
6621     case SPAN:
6622     case UJUMP:
6623     case RJUMP:
6624     case CJUMP:
6625     case SJUMP:
6626     case FJUMP:
6627       printf("Jump in the delay slot.  This is probably a bug.\n");
6628   }
6629   int btaddr=get_reg(regs[0].regmap,BTREG);
6630   if(btaddr<0) {
6631     btaddr=get_reg(regs[0].regmap,-1);
6632     emit_readword((int)&branch_target,btaddr);
6633   }
6634   assert(btaddr!=HOST_CCREG);
6635   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
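  // If the stashed branch target is start+4 (the instruction right after this
  // delay slot), execution continues into the rest of this block; otherwise
  // jump to the target indirectly via jump_vaddr.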
6636 #ifdef HOST_IMM8
6637   emit_movimm(start+4,HOST_TEMPREG);
6638   emit_cmp(btaddr,HOST_TEMPREG);
6639 #else
6640   emit_cmpimm(btaddr,start+4);
6641 #endif
6642   int branch=(int)out;
6643   emit_jeq(0);
6644   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6645   emit_jmp(jump_vaddr_reg[btaddr]);
6646   set_jump_target(branch,(int)out);
6647   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6648   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6649 }
6650
6651 // Basic liveness analysis for MIPS registers
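// Walks backwards from iend to istart maintaining a bitmask of MIPS registers
// whose current value is dead: bit r set in u means register r is unneeded,
// and uu tracks the upper 32 bits separately.  A write makes a register
// unneeded above it, a read makes it needed again; e.g. for ADDU rd,rs,rt the
// pass sets bit rd and clears bits rs and rt.  Results are stored in
// unneeded_reg[]/unneeded_reg_upper[] (and the branch_* variants for the state
// at a branch); r limits how deep we recurse into branch targets.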
6652 void unneeded_registers(int istart,int iend,int r)
6653 {
6654   int i;
6655   uint64_t u,uu,b,bu;
6656   uint64_t temp_u,temp_uu;
6657   uint64_t tdep;
6658   if(iend==slen-1) {
6659     u=1;uu=1;
6660   }else{
6661     u=unneeded_reg[iend+1];
6662     uu=unneeded_reg_upper[iend+1];
6663     u=1;uu=1;
6664   }
6665   for (i=iend;i>=istart;i--)
6666   {
6667     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6668     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6669     {
6670       // If subroutine call, flag return address as a possible branch target
6671       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6672       
6673       if(ba[i]<start || ba[i]>=(start+slen*4))
6674       {
6675         // Branch out of this block, flush all regs
6676         u=1;
6677         uu=1;
6678         /* Hexagon hack 
6679         if(itype[i]==UJUMP&&rt1[i]==31)
6680         {
6681           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6682         }
6683         if(itype[i]==RJUMP&&rs1[i]==31)
6684         {
6685           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6686         }
6687         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6688           if(itype[i]==UJUMP&&rt1[i]==31)
6689           {
6690             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6691             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6692           }
6693           if(itype[i]==RJUMP&&rs1[i]==31)
6694           {
6695             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6696             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6697           }
6698         }*/
6699         branch_unneeded_reg[i]=u;
6700         branch_unneeded_reg_upper[i]=uu;
6701         // Merge in delay slot
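        // The delay slot runs before control leaves the block, so fold it in
        // like the branch itself: registers it writes become unneeded and
        // registers it reads become needed; tdep marks whether the written
        // register's upper half is needed, in which case the upper halves of
        // its sign-extension sources (dep1/dep2) are needed too.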
6702         tdep=(~uu>>rt1[i+1])&1;
6703         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6704         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6705         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6706         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6707         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6708         u|=1;uu|=1;
6709         // If branch is "likely" (and conditional)
6710         // then we skip the delay slot on the fall-thru path
6711         if(likely[i]) {
6712           if(i<slen-1) {
6713             u&=unneeded_reg[i+2];
6714             uu&=unneeded_reg_upper[i+2];
6715           }
6716           else
6717           {
6718             u=1;
6719             uu=1;
6720           }
6721         }
6722       }
6723       else
6724       {
6725         // Internal branch, flag target
6726         bt[(ba[i]-start)>>2]=1;
6727         if(ba[i]<=start+i*4) {
6728           // Backward branch
6729           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6730           {
6731             // Unconditional branch
6732             temp_u=1;temp_uu=1;
6733           } else {
6734             // Conditional branch (not taken case)
6735             temp_u=unneeded_reg[i+2];
6736             temp_uu=unneeded_reg_upper[i+2];
6737           }
6738           // Merge in delay slot
6739           tdep=(~temp_uu>>rt1[i+1])&1;
6740           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6741           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6742           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6743           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6744           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6745           temp_u|=1;temp_uu|=1;
6746           // If branch is "likely" (and conditional)
6747           // then we skip the delay slot on the fall-thru path
6748           if(likely[i]) {
6749             if(i<slen-1) {
6750               temp_u&=unneeded_reg[i+2];
6751               temp_uu&=unneeded_reg_upper[i+2];
6752             }
6753             else
6754             {
6755               temp_u=1;
6756               temp_uu=1;
6757             }
6758           }
6759           tdep=(~temp_uu>>rt1[i])&1;
6760           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6761           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6762           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6763           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6764           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6765           temp_u|=1;temp_uu|=1;
6766           unneeded_reg[i]=temp_u;
6767           unneeded_reg_upper[i]=temp_uu;
6768           // Only go three levels deep.  This recursion can take an
6769           // excessive amount of time if there are a lot of nested loops.
6770           if(r<2) {
6771             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6772           }else{
6773             unneeded_reg[(ba[i]-start)>>2]=1;
6774             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6775           }
6776         } /*else*/ if(1) {
6777           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6778           {
6779             // Unconditional branch
6780             u=unneeded_reg[(ba[i]-start)>>2];
6781             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6782             branch_unneeded_reg[i]=u;
6783             branch_unneeded_reg_upper[i]=uu;
6784         //u=1;
6785         //uu=1;
6786         //branch_unneeded_reg[i]=u;
6787         //branch_unneeded_reg_upper[i]=uu;
6788             // Merge in delay slot
6789             tdep=(~uu>>rt1[i+1])&1;
6790             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6791             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6792             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6793             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6794             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6795             u|=1;uu|=1;
6796           } else {
6797             // Conditional branch
6798             b=unneeded_reg[(ba[i]-start)>>2];
6799             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6800             branch_unneeded_reg[i]=b;
6801             branch_unneeded_reg_upper[i]=bu;
6802         //b=1;
6803         //bu=1;
6804         //branch_unneeded_reg[i]=b;
6805         //branch_unneeded_reg_upper[i]=bu;
6806             // Branch delay slot
6807             tdep=(~uu>>rt1[i+1])&1;
6808             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6809             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6810             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6811             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6812             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6813             b|=1;bu|=1;
6814             // If branch is "likely" then we skip the
6815             // delay slot on the fall-thru path
6816             if(likely[i]) {
6817               u=b;
6818               uu=bu;
6819               if(i<slen-1) {
6820                 u&=unneeded_reg[i+2];
6821                 uu&=unneeded_reg_upper[i+2];
6822         //u=1;
6823         //uu=1;
6824               }
6825             } else {
6826               u&=b;
6827               uu&=bu;
6828         //u=1;
6829         //uu=1;
6830             }
6831             if(i<slen-1) {
6832               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6833               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6834         //branch_unneeded_reg[i]=1;
6835         //branch_unneeded_reg_upper[i]=1;
6836             } else {
6837               branch_unneeded_reg[i]=1;
6838               branch_unneeded_reg_upper[i]=1;
6839             }
6840           }
6841         }
6842       }
6843     }
6844     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6845     {
6846       // SYSCALL instruction (software interrupt)
6847       u=1;
6848       uu=1;
6849     }
6850     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6851     {
6852       // ERET instruction (return from interrupt)
6853       u=1;
6854       uu=1;
6855     }
6856     //u=uu=1; // DEBUG
6857     tdep=(~uu>>rt1[i])&1;
6858     // Written registers are unneeded
6859     u|=1LL<<rt1[i];
6860     u|=1LL<<rt2[i];
6861     uu|=1LL<<rt1[i];
6862     uu|=1LL<<rt2[i];
6863     // Accessed registers are needed
6864     u&=~(1LL<<rs1[i]);
6865     u&=~(1LL<<rs2[i]);
6866     uu&=~(1LL<<us1[i]);
6867     uu&=~(1LL<<us2[i]);
6868     // Source-target dependencies
6869     uu&=~(tdep<<dep1[i]);
6870     uu&=~(tdep<<dep2[i]);
6871     // R0 is always unneeded
6872     u|=1;uu|=1;
6873     // Save it
6874     unneeded_reg[i]=u;
6875     unneeded_reg_upper[i]=uu;
6876     /*
6877     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6878     printf("U:");
6879     int r;
6880     for(r=1;r<=CCREG;r++) {
6881       if((unneeded_reg[i]>>r)&1) {
6882         if(r==HIREG) printf(" HI");
6883         else if(r==LOREG) printf(" LO");
6884         else printf(" r%d",r);
6885       }
6886     }
6887     printf(" UU:");
6888     for(r=1;r<=CCREG;r++) {
6889       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6890         if(r==HIREG) printf(" HI");
6891         else if(r==LOREG) printf(" LO");
6892         else printf(" r%d",r);
6893       }
6894     }
6895     printf("\n");*/
6896   }
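  // 32-bit-only builds never need the upper register halves, so mark them all
  // unneeded.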
6897 #ifdef FORCE32
6898   for (i=iend;i>=istart;i--)
6899   {
6900     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6901   }
6902 #endif
6903 }
6904
6905 // Identify registers which are likely to contain 32-bit values
6906 // This is used to predict whether any branches will jump to a
6907 // location with 64-bit values in registers.
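// Forward pass: bit r of is32 is set while register r is known to hold a
// sign-extended 32-bit value; the mask after each instruction is saved in
// p32[i].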
6908 static void provisional_32bit()
6909 {
6910   int i,j;
6911   uint64_t is32=1;
6912   uint64_t lastbranch=1;
6913   
6914   for(i=0;i<slen;i++)
6915   {
6916     if(i>0) {
6917       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
6918         if(i>1) is32=lastbranch;
6919         else is32=1;
6920       }
6921     }
6922     if(i>1)
6923     {
6924       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
6925         if(likely[i-2]) {
6926           if(i>2) is32=lastbranch;
6927           else is32=1;
6928         }
6929       }
6930       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
6931       {
6932         if(rs1[i-2]==0||rs2[i-2]==0)
6933         {
6934           if(rs1[i-2]) {
6935             is32|=1LL<<rs1[i-2];
6936           }
6937           if(rs2[i-2]) {
6938             is32|=1LL<<rs2[i-2];
6939           }
6940         }
6941       }
6942     }
6943     // If something jumps here with 64-bit values
6944     // then promote those registers to 64 bits
6945     if(bt[i])
6946     {
6947       uint64_t temp_is32=is32;
6948       for(j=i-1;j>=0;j--)
6949       {
6950         if(ba[j]==start+i*4) 
6951           //temp_is32&=branch_regs[j].is32;
6952           temp_is32&=p32[j];
6953       }
6954       for(j=i;j<slen;j++)
6955       {
6956         if(ba[j]==start+i*4) 
6957           temp_is32=1;
6958       }
6959       is32=temp_is32;
6960     }
6961     int type=itype[i];
6962     int op=opcode[i];
6963     int op2=opcode2[i];
6964     int rt=rt1[i];
6965     int s1=rs1[i];
6966     int s2=rs2[i];
6967     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
6968       // Branches don't write registers, consider the delay slot instead.
6969       type=itype[i+1];
6970       op=opcode[i+1];
6971       op2=opcode2[i+1];
6972       rt=rt1[i+1];
6973       s1=rs1[i+1];
6974       s2=rs2[i+1];
6975       lastbranch=is32;
6976     }
6977     switch(type) {
6978       case LOAD:
6979         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
6980            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
6981           is32&=~(1LL<<rt);
6982         else
6983           is32|=1LL<<rt;
6984         break;
6985       case STORE:
6986       case STORELR:
6987         break;
6988       case LOADLR:
6989         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
6990         if(op==0x22) is32|=1LL<<rt; // LWL
6991         break;
6992       case IMM16:
6993         if (op==0x08||op==0x09|| // ADDI/ADDIU
6994             op==0x0a||op==0x0b|| // SLTI/SLTIU
6995             op==0x0c|| // ANDI
6996             op==0x0f)  // LUI
6997         {
6998           is32|=1LL<<rt;
6999         }
7000         if(op==0x18||op==0x19) { // DADDI/DADDIU
7001           is32&=~(1LL<<rt);
7002           //if(imm[i]==0)
7003           //  is32|=((is32>>s1)&1LL)<<rt;
7004         }
7005         if(op==0x0d||op==0x0e) { // ORI/XORI
7006           uint64_t sr=((is32>>s1)&1LL);
7007           is32&=~(1LL<<rt);
7008           is32|=sr<<rt;
7009         }
7010         break;
7011       case UJUMP:
7012         break;
7013       case RJUMP:
7014         break;
7015       case CJUMP:
7016         break;
7017       case SJUMP:
7018         break;
7019       case FJUMP:
7020         break;
7021       case ALU:
7022         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
7023           is32|=1LL<<rt;
7024         }
7025         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7026           is32|=1LL<<rt;
7027         }
7028         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7029           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
7030           is32&=~(1LL<<rt);
7031           is32|=sr<<rt;
7032         }
7033         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
7034           if(s1==0&&s2==0) {
7035             is32|=1LL<<rt;
7036           }
7037           else if(s2==0) {
7038             uint64_t sr=((is32>>s1)&1LL);
7039             is32&=~(1LL<<rt);
7040             is32|=sr<<rt;
7041           }
7042           else if(s1==0) {
7043             uint64_t sr=((is32>>s2)&1LL);
7044             is32&=~(1LL<<rt);
7045             is32|=sr<<rt;
7046           }
7047           else {
7048             is32&=~(1LL<<rt);
7049           }
7050         }
7051         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
7052           if(s1==0&&s2==0) {
7053             is32|=1LL<<rt;
7054           }
7055           else if(s2==0) {
7056             uint64_t sr=((is32>>s1)&1LL);
7057             is32&=~(1LL<<rt);
7058             is32|=sr<<rt;
7059           }
7060           else {
7061             is32&=~(1LL<<rt);
7062           }
7063         }
7064         break;
7065       case MULTDIV:
7066         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7067           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
7068         }
7069         else {
7070           is32|=(1LL<<HIREG)|(1LL<<LOREG);
7071         }
7072         break;
7073       case MOV:
7074         {
7075           uint64_t sr=((is32>>s1)&1LL);
7076           is32&=~(1LL<<rt);
7077           is32|=sr<<rt;
7078         }
7079         break;
7080       case SHIFT:
7081         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
7082         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
7083         break;
7084       case SHIFTIMM:
7085         is32|=1LL<<rt;
7086         // DSLL/DSRL/DSRA/DSLL32/DSRL32 but not DSRA32 have 64-bit result
7087         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
7088         break;
7089       case COP0:
7090         if(op2==0) is32|=1LL<<rt; // MFC0
7091         break;
7092       case COP1:
7093       case COP2:
7094         if(op2==0) is32|=1LL<<rt; // MFC1
7095         if(op2==1) is32&=~(1LL<<rt); // DMFC1
7096         if(op2==2) is32|=1LL<<rt; // CFC1
7097         break;
7098       case C1LS:
7099       case C2LS:
7100         break;
7101       case FLOAT:
7102       case FCONV:
7103         break;
7104       case FCOMP:
7105         break;
7106       case C2OP:
7107       case SYSCALL:
7108       case HLECALL:
7109         break;
7110       default:
7111         break;
7112     }
7113     is32|=1;
7114     p32[i]=is32;
7115
7116     if(i>0)
7117     {
7118       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
7119       {
7120         if(rt1[i-1]==31) // JAL/JALR
7121         {
7122           // Subroutine call will return here, don't alloc any registers
7123           is32=1;
7124         }
7125         else if(i+1<slen)
7126         {
7127           // Internal branch will jump here, match registers to caller
7128           is32=0x3FFFFFFFFLL;
7129         }
7130       }
7131     }
7132   }
7133 }
7134
7135 // Identify registers which may be assumed to contain 32-bit values
7136 // and where optimizations will rely on this.
7137 // This is used to determine whether backward branches can safely
7138 // jump to a location with 64-bit values in registers.
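// Backward pass: bit r of r32 is set when later code relies on register r
// still holding a 32-bit value; the per-instruction result is saved in
// pr32[i].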
7139 static void provisional_r32()
7140 {
7141   u_int r32=0;
7142   int i;
7143   
7144   for (i=slen-1;i>=0;i--)
7145   {
7146     int hr;
7147     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7148     {
7149       if(ba[i]<start || ba[i]>=(start+slen*4))
7150       {
7151         // Branch out of this block, don't need anything
7152         r32=0;
7153       }
7154       else
7155       {
7156         // Internal branch
7157         // Need whatever matches the target
7158         // (and doesn't get overwritten by the delay slot instruction)
7159         r32=0;
7160         int t=(ba[i]-start)>>2;
7161         if(ba[i]>start+i*4) {
7162           // Forward branch
7163           //if(!(requires_32bit[t]&~regs[i].was32))
7164           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7165           if(!(pr32[t]&~regs[i].was32))
7166             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7167         }else{
7168           // Backward branch
7169           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
7170             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7171         }
7172       }
7173       // Conditional branch may need registers for following instructions
7174       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7175       {
7176         if(i<slen-2) {
7177           //r32|=requires_32bit[i+2];
7178           r32|=pr32[i+2];
7179           r32&=regs[i].was32;
7180           // Mark this address as a branch target since it may be called
7181           // upon return from interrupt
7182           //bt[i+2]=1;
7183         }
7184       }
7185       // Merge in delay slot
7186       if(!likely[i]) {
7187         // These are overwritten unless the branch is "likely"
7188         // and the delay slot is nullified if not taken
7189         r32&=~(1LL<<rt1[i+1]);
7190         r32&=~(1LL<<rt2[i+1]);
7191       }
7192       // Assume these are needed (delay slot)
7193       if(us1[i+1]>0)
7194       {
7195         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
7196       }
7197       if(us2[i+1]>0)
7198       {
7199         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
7200       }
7201       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
7202       {
7203         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
7204       }
7205       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
7206       {
7207         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
7208       }
7209     }
7210     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7211     {
7212       // SYSCALL instruction (software interrupt)
7213       r32=0;
7214     }
7215     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7216     {
7217       // ERET instruction (return from interrupt)
7218       r32=0;
7219     }
7220     // Check 32 bits
7221     r32&=~(1LL<<rt1[i]);
7222     r32&=~(1LL<<rt2[i]);
7223     if(us1[i]>0)
7224     {
7225       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
7226     }
7227     if(us2[i]>0)
7228     {
7229       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
7230     }
7231     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
7232     {
7233       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
7234     }
7235     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
7236     {
7237       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
7238     }
7239     //requires_32bit[i]=r32;
7240     pr32[i]=r32;
7241     
7242     // Dirty registers which are 32-bit, require 32-bit input
7243     // as they will be written as 32-bit values
7244     for(hr=0;hr<HOST_REGS;hr++)
7245     {
7246       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
7247         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
7248           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
7249             pr32[i]|=1LL<<regs[i].regmap_entry[hr];
7250           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
7251         }
7252       }
7253     }
7254   }
7255 }
7256
7257 // Write back dirty registers as soon as we will no longer modify them,
7258 // so that we don't end up with lots of writes at the branches.
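// Backward pass over host registers: will_dirty/wont_dirty are bitmasks (bit r
// = host register r) describing whether a register will, or can no longer, be
// dirtied again before the block is left.  With wr==0 only will_dirty[] /
// wont_dirty[] are computed (used when recursing into branch targets); with
// wr!=0 the masks are also folded into regs[i].dirty and regs[i].wasdirty.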
7259 void clean_registers(int istart,int iend,int wr)
7260 {
7261   int i;
7262   int r;
7263   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
7264   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
7265   if(iend==slen-1) {
7266     will_dirty_i=will_dirty_next=0;
7267     wont_dirty_i=wont_dirty_next=0;
7268   }else{
7269     will_dirty_i=will_dirty_next=will_dirty[iend+1];
7270     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
7271   }
7272   for (i=iend;i>=istart;i--)
7273   {
7274     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7275     {
7276       if(ba[i]<start || ba[i]>=(start+slen*4))
7277       {
7278         // Branch out of this block, flush all regs
7279         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7280         {
7281           // Unconditional branch
7282           will_dirty_i=0;
7283           wont_dirty_i=0;
7284           // Merge in delay slot (will dirty)
7285           for(r=0;r<HOST_REGS;r++) {
7286             if(r!=EXCLUDE_REG) {
7287               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7288               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7289               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7290               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7291               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7292               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7293               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7294               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7295               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7296               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7297               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7298               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7299               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7300               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7301             }
7302           }
7303         }
7304         else
7305         {
7306           // Conditional branch
7307           will_dirty_i=0;
7308           wont_dirty_i=wont_dirty_next;
7309           // Merge in delay slot (will dirty)
7310           for(r=0;r<HOST_REGS;r++) {
7311             if(r!=EXCLUDE_REG) {
7312               if(!likely[i]) {
7313                 // Might not dirty if likely branch is not taken
7314                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7315                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7316                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7317                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7318                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7319                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
7320                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7321                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7322                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7323                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7324                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7325                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7326                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7327                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7328               }
7329             }
7330           }
7331         }
7332         // Merge in delay slot (won't dirty)
7333         for(r=0;r<HOST_REGS;r++) {
7334           if(r!=EXCLUDE_REG) {
7335             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7336             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7337             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7338             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7339             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7340             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7341             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7342             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7343             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7344             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7345           }
7346         }
7347         if(wr) {
7348           #ifndef DESTRUCTIVE_WRITEBACK
7349           branch_regs[i].dirty&=wont_dirty_i;
7350           #endif
7351           branch_regs[i].dirty|=will_dirty_i;
7352         }
7353       }
7354       else
7355       {
7356         // Internal branch
7357         if(ba[i]<=start+i*4) {
7358           // Backward branch
7359           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7360           {
7361             // Unconditional branch
7362             temp_will_dirty=0;
7363             temp_wont_dirty=0;
7364             // Merge in delay slot (will dirty)
7365             for(r=0;r<HOST_REGS;r++) {
7366               if(r!=EXCLUDE_REG) {
7367                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7368                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7369                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7370                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7371                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7372                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7373                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7374                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7375                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7376                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7377                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7378                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7379                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7380                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7381               }
7382             }
7383           } else {
7384             // Conditional branch (not taken case)
7385             temp_will_dirty=will_dirty_next;
7386             temp_wont_dirty=wont_dirty_next;
7387             // Merge in delay slot (will dirty)
7388             for(r=0;r<HOST_REGS;r++) {
7389               if(r!=EXCLUDE_REG) {
7390                 if(!likely[i]) {
7391                   // Might not dirty if likely branch is not taken
7392                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7393                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7394                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7395                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7396                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7397                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7398                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7399                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7400                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7401                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7402                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7403                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7404                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7405                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7406                 }
7407               }
7408             }
7409           }
7410           // Merge in delay slot (won't dirty)
7411           for(r=0;r<HOST_REGS;r++) {
7412             if(r!=EXCLUDE_REG) {
7413               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7414               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7415               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7416               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7417               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7418               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7419               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7420               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7421               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7422               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7423             }
7424           }
7425           // Deal with changed mappings
7426           if(i<iend) {
7427             for(r=0;r<HOST_REGS;r++) {
7428               if(r!=EXCLUDE_REG) {
7429                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7430                   temp_will_dirty&=~(1<<r);
7431                   temp_wont_dirty&=~(1<<r);
7432                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7433                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7434                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7435                   } else {
7436                     temp_will_dirty|=1<<r;
7437                     temp_wont_dirty|=1<<r;
7438                   }
7439                 }
7440               }
7441             }
7442           }
7443           if(wr) {
7444             will_dirty[i]=temp_will_dirty;
7445             wont_dirty[i]=temp_wont_dirty;
7446             clean_registers((ba[i]-start)>>2,i-1,0);
7447           }else{
7448             // Limit recursion.  It can take an excessive amount
7449             // of time if there are a lot of nested loops.
7450             will_dirty[(ba[i]-start)>>2]=0;
7451             wont_dirty[(ba[i]-start)>>2]=-1;
7452           }
7453         }
7454         /*else*/ if(1)
7455         {
7456           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7457           {
7458             // Unconditional branch
7459             will_dirty_i=0;
7460             wont_dirty_i=0;
7461           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7462             for(r=0;r<HOST_REGS;r++) {
7463               if(r!=EXCLUDE_REG) {
7464                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7465                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7466                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7467                 }
7468                 if(branch_regs[i].regmap[r]>=0) {
7469                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7470                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7471                 }
7472               }
7473             }
7474           //}
7475             // Merge in delay slot
7476             for(r=0;r<HOST_REGS;r++) {
7477               if(r!=EXCLUDE_REG) {
7478                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7479                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7480                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7481                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7482                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7483                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7484                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7485                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7486                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7487                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7488                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7489                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7490                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7491                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7492               }
7493             }
7494           } else {
7495             // Conditional branch
7496             will_dirty_i=will_dirty_next;
7497             wont_dirty_i=wont_dirty_next;
7498           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7499             for(r=0;r<HOST_REGS;r++) {
7500               if(r!=EXCLUDE_REG) {
7501                 signed char target_reg=branch_regs[i].regmap[r];
7502                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7503                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7504                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7505                 }
7506                 else if(target_reg>=0) {
7507                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7508                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7509                 }
7510                 // Treat delay slot as part of branch too
7511                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7512                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7513                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7514                 }
7515                 else
7516                 {
7517                   will_dirty[i+1]&=~(1<<r);
7518                 }*/
7519               }
7520             }
7521           //}
7522             // Merge in delay slot
7523             for(r=0;r<HOST_REGS;r++) {
7524               if(r!=EXCLUDE_REG) {
7525                 if(!likely[i]) {
7526                   // Might not dirty if likely branch is not taken
7527                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7528                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7529                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7530                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7531                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7532                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7533                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7534                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7535                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7536                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7537                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7538                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7539                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7540                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7541                 }
7542               }
7543             }
7544           }
7545           // Merge in delay slot (won't dirty)
7546           for(r=0;r<HOST_REGS;r++) {
7547             if(r!=EXCLUDE_REG) {
7548               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7549               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7550               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7551               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7552               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7553               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7554               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7555               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7556               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7557               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7558             }
7559           }
7560           if(wr) {
7561             #ifndef DESTRUCTIVE_WRITEBACK
7562             branch_regs[i].dirty&=wont_dirty_i;
7563             #endif
7564             branch_regs[i].dirty|=will_dirty_i;
7565           }
7566         }
7567       }
7568     }
7569     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7570     {
7571       // SYSCALL instruction (software interrupt)
7572       will_dirty_i=0;
7573       wont_dirty_i=0;
7574     }
7575     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7576     {
7577       // ERET instruction (return from interrupt)
7578       will_dirty_i=0;
7579       wont_dirty_i=0;
7580     }
7581     will_dirty_next=will_dirty_i;
7582     wont_dirty_next=wont_dirty_i;
7583     for(r=0;r<HOST_REGS;r++) {
7584       if(r!=EXCLUDE_REG) {
7585         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7586         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7587         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7588         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7589         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7590         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7591         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7592         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7593         if(i>istart) {
7594           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7595           {
7596             // Don't store a register immediately after writing it,
7597             // may prevent dual-issue.
7598             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7599             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7600           }
7601         }
7602       }
7603     }
7604     // Save it
7605     will_dirty[i]=will_dirty_i;
7606     wont_dirty[i]=wont_dirty_i;
7607     // Mark registers that won't be dirtied as not dirty
7608     if(wr) {
7609       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7610       for(r=0;r<HOST_REGS;r++) {
7611         if((will_dirty_i>>r)&1) {
7612           printf(" r%d",r);
7613         }
7614       }
7615       printf("\n");*/
7616
7617       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7618         regs[i].dirty|=will_dirty_i;
7619         #ifndef DESTRUCTIVE_WRITEBACK
7620         regs[i].dirty&=wont_dirty_i;
7621         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7622         {
7623           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7624             for(r=0;r<HOST_REGS;r++) {
7625               if(r!=EXCLUDE_REG) {
7626                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7627                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7628                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7629               }
7630             }
7631           }
7632         }
7633         else
7634         {
7635           if(i<iend) {
7636             for(r=0;r<HOST_REGS;r++) {
7637               if(r!=EXCLUDE_REG) {
7638                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7639                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7640                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7641               }
7642             }
7643           }
7644         }
7645         #endif
7646       //}
7647     }
7648     // Deal with changed mappings
7649     temp_will_dirty=will_dirty_i;
7650     temp_wont_dirty=wont_dirty_i;
7651     for(r=0;r<HOST_REGS;r++) {
7652       if(r!=EXCLUDE_REG) {
7653         int nr;
7654         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7655           if(wr) {
7656             #ifndef DESTRUCTIVE_WRITEBACK
7657             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7658             #endif
7659             regs[i].wasdirty|=will_dirty_i&(1<<r);
7660           }
7661         }
7662         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7663           // Register moved to a different register
7664           will_dirty_i&=~(1<<r);
7665           wont_dirty_i&=~(1<<r);
7666           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7667           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7668           if(wr) {
7669             #ifndef DESTRUCTIVE_WRITEBACK
7670             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7671             #endif
7672             regs[i].wasdirty|=will_dirty_i&(1<<r);
7673           }
7674         }
7675         else {
7676           will_dirty_i&=~(1<<r);
7677           wont_dirty_i&=~(1<<r);
7678           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7679             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7680             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7681           } else {
7682             wont_dirty_i|=1<<r;
7683                     /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);*/ /*assert(!((will_dirty>>r)&1));*/
7684           }
7685         }
7686       }
7687     }
7688   }
7689 }
7690
7691   /* disassembly */
7692 void disassemble_inst(int i)
7693 {
7694     if (bt[i]) printf("*"); else printf(" ");
7695     switch(itype[i]) {
7696       case UJUMP:
7697         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7698       case CJUMP:
7699         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7700       case SJUMP:
7701         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7702       case FJUMP:
7703         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7704       case RJUMP:
7705         if (opcode[i]==0x9&&rt1[i]!=31)
7706           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
7707         else
7708           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7709         break;
7710       case SPAN:
7711         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7712       case IMM16:
7713         if(opcode[i]==0xf) //LUI
7714           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7715         else
7716           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7717         break;
7718       case LOAD:
7719       case LOADLR:
7720         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7721         break;
7722       case STORE:
7723       case STORELR:
7724         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7725         break;
7726       case ALU:
7727       case SHIFT:
7728         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7729         break;
7730       case MULTDIV:
7731         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7732         break;
7733       case SHIFTIMM:
7734         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7735         break;
7736       case MOV:
7737         if((opcode2[i]&0x1d)==0x10)
7738           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7739         else if((opcode2[i]&0x1d)==0x11)
7740           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7741         else
7742           printf (" %x: %s\n",start+i*4,insn[i]);
7743         break;
7744       case COP0:
7745         if(opcode2[i]==0)
7746           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7747         else if(opcode2[i]==4)
7748           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7749         else printf (" %x: %s\n",start+i*4,insn[i]);
7750         break;
7751       case COP1:
7752         if(opcode2[i]<3)
7753           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7754         else if(opcode2[i]>3)
7755           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7756         else printf (" %x: %s\n",start+i*4,insn[i]);
7757         break;
7758       case COP2:
7759         if(opcode2[i]<3)
7760           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7761         else if(opcode2[i]>3)
7762           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7763         else printf (" %x: %s\n",start+i*4,insn[i]);
7764         break;
7765       case C1LS:
7766         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7767         break;
7768       case C2LS:
7769         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7770         break;
7771       case INTCALL:
7772         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7773         break;
7774       default:
7775         //printf (" %s %8x\n",insn[i],source[i]);
7776         printf (" %x: %s\n",start+i*4,insn[i]);
7777     }
7778 }
7779
7780 // clear the state completely, instead of just marking
7781 // things invalid like invalidate_all_pages() does
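// memory_map[] holds one entry per 4 KB guest page: (host_base-guest_base)>>2
// for mapped RAM, -1 for unmapped pages.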
7782 void new_dynarec_clear_full()
7783 {
7784   int n;
7785   out=(u_char *)BASE_ADDR;
7786   memset(invalid_code,1,sizeof(invalid_code));
7787   memset(hash_table,0xff,sizeof(hash_table));
7788   memset(mini_ht,-1,sizeof(mini_ht));
7789   memset(restore_candidate,0,sizeof(restore_candidate));
7790   memset(shadow,0,sizeof(shadow));
7791   copy=shadow;
7792   expirep=16384; // Expiry pointer, +2 blocks
7793   pending_exception=0;
7794   literalcount=0;
7795   stop_after_jal=0;
7796   // TLB
7797 #ifndef DISABLE_TLB
7798   using_tlb=0;
7799 #endif
7800   sp_in_mirror=0;
7801   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7802     memory_map[n]=-1;
7803   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7804     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7805   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7806     memory_map[n]=-1;
7807   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7808   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7809   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7810 }
7811
7812 void new_dynarec_init()
7813 {
7814   printf("Init new dynarec\n");
7815   out=(u_char *)BASE_ADDR;
7816   if (mmap (out, 1<<TARGET_SIZE_2,
7817             PROT_READ | PROT_WRITE | PROT_EXEC,
7818             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7819             -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
7820 #ifdef MUPEN64
7821   rdword=&readmem_dword;
7822   fake_pc.f.r.rs=&readmem_dword;
7823   fake_pc.f.r.rt=&readmem_dword;
7824   fake_pc.f.r.rd=&readmem_dword;
7825 #endif
7826   int n;
7827   new_dynarec_clear_full();
7828 #ifdef HOST_IMM8
7829   // Copy this into local area so we don't have to put it in every literal pool
7830   invc_ptr=invalid_code;
7831 #endif
7832 #ifdef MUPEN64
7833   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
7834     writemem[n] = write_nomem_new;
7835     writememb[n] = write_nomemb_new;
7836     writememh[n] = write_nomemh_new;
7837 #ifndef FORCE32
7838     writememd[n] = write_nomemd_new;
7839 #endif
7840     readmem[n] = read_nomem_new;
7841     readmemb[n] = read_nomemb_new;
7842     readmemh[n] = read_nomemh_new;
7843 #ifndef FORCE32
7844     readmemd[n] = read_nomemd_new;
7845 #endif
7846   }
7847   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
7848     writemem[n] = write_rdram_new;
7849     writememb[n] = write_rdramb_new;
7850     writememh[n] = write_rdramh_new;
7851 #ifndef FORCE32
7852     writememd[n] = write_rdramd_new;
7853 #endif
7854   }
7855   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
7856     writemem[n] = write_nomem_new;
7857     writememb[n] = write_nomemb_new;
7858     writememh[n] = write_nomemh_new;
7859 #ifndef FORCE32
7860     writememd[n] = write_nomemd_new;
7861 #endif
7862     readmem[n] = read_nomem_new;
7863     readmemb[n] = read_nomemb_new;
7864     readmemh[n] = read_nomemh_new;
7865 #ifndef FORCE32
7866     readmemd[n] = read_nomemd_new;
7867 #endif
7868   }
7869 #endif
7870   tlb_hacks();
7871   arch_init();
7872 }
7873
7874 void new_dynarec_cleanup()
7875 {
7876   int n;
7877   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
7878   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7879   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7880   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7881   #ifdef ROM_COPY
7882   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
7883   #endif
7884 }
7885
7886 int new_recompile_block(int addr)
7887 {
7888 /*
7889   if(addr==0x800cd050) {
7890     int block;
7891     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
7892     int n;
7893     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
7894   }
7895 */
7896   //if(Count==365117028) tracedebug=1;
7897   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7898   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7899   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7900   //if(debug) 
7901   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7902   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7903   /*if(Count>=312978186) {
7904     rlist();
7905   }*/
7906   //rlist();
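  // Work out where the guest code lives on the host: source points at the
  // host copy of the instructions and pagelimit bounds how far this block may
  // extend.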
7907   start = (u_int)addr&~3;
7908   //assert(((u_int)addr&1)==0);
7909 #ifdef PCSX
7910   if(!sp_in_mirror&&(signed int)(psxRegs.GPR.n.sp&0xffe00000)>0x80200000&&
7911      0x10000<=psxRegs.GPR.n.sp&&(psxRegs.GPR.n.sp&~0xe0e00000)<RAM_SIZE) {
7912     printf("SP hack enabled (%08x), @%08x\n", psxRegs.GPR.n.sp, psxRegs.pc);
7913     sp_in_mirror=1;
7914   }
7915   if (Config.HLE && start == 0x80001000) // hlecall
7916   {
7917     // XXX: is this enough? Maybe check hleSoftCall?
7918     u_int beginning=(u_int)out;
7919     u_int page=get_page(start);
7920     invalid_code[start>>12]=0;
7921     emit_movimm(start,0);
7922     emit_writeword(0,(int)&pcaddr);
7923     emit_jmp((int)new_dyna_leave);
7924 #ifdef __arm__
7925     __clear_cache((void *)beginning,out);
7926 #endif
7927     ll_add(jump_in+page,start,(void *)beginning);
7928     return 0;
7929   }
7930   else if ((u_int)addr < 0x00200000 ||
7931     (0xa0000000 <= addr && addr < 0xa0200000)) {
7932     // used for BIOS calls mostly?
7933     source = (u_int *)((u_int)rdram+(start&0x1fffff));
7934     pagelimit = (addr&0xa0000000)|0x00200000;
7935   }
7936   else if (!Config.HLE && (
7937 /*    (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7938     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7939     // BIOS
7940     source = (u_int *)((u_int)psxR+(start&0x7ffff));
7941     pagelimit = (addr&0xfff00000)|0x80000;
7942   }
7943   else
7944 #endif
7945 #ifdef MUPEN64
7946   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
7947     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
7948     pagelimit = 0xa4001000;
7949   }
7950   else
7951 #endif
7952   if ((int)addr >= 0x80000000 && (int)addr < 0x80000000+RAM_SIZE) {
7953     source = (u_int *)((u_int)rdram+start-0x80000000);
7954     pagelimit = 0x80000000+RAM_SIZE;
7955   }
7956 #ifndef DISABLE_TLB
7957   else if ((signed int)addr >= (signed int)0xC0000000) {
7958     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
7959     //if(tlb_LUT_r[start>>12])
7960       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
7961     if((signed int)memory_map[start>>12]>=0) {
7962       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
7963       pagelimit=(start+4096)&0xFFFFF000;
7964       int map=memory_map[start>>12];
7965       int i;
7966       for(i=0;i<5;i++) {
7967         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
7968         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
7969       }
7970       assem_debug("pagelimit=%x\n",pagelimit);
7971       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
7972     }
7973     else {
7974       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
7975       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
7976       return -1; // Caller will invoke exception handler
7977     }
7978     //printf("source= %x\n",(int)source);
7979   }
7980 #endif
7981   else {
7982     printf("Compile at bogus memory address: %x \n", (int)addr);
7983     exit(1);
7984   }
7985
7986   /* Pass 1: disassemble */
7987   /* Pass 2: register dependencies, branch targets */
7988   /* Pass 3: register allocation */
7989   /* Pass 4: branch dependencies */
7990   /* Pass 5: pre-alloc */
7991   /* Pass 6: optimize clean/dirty state */
7992   /* Pass 7: flag 32-bit registers */
7993   /* Pass 8: assembly */
7994   /* Pass 9: linker */
7995   /* Pass 10: garbage collection / free memory */
7996
7997   int i,j;
7998   int done=0;
7999   unsigned int type,op,op2;
8000
8001   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
8002   
8003   /* Pass 1 disassembly */
8004
8005   for(i=0;!done;i++) {
8006     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
8007     minimum_free_regs[i]=0;
8008     opcode[i]=op=source[i]>>26;
8009     switch(op)
8010     {
8011       case 0x00: strcpy(insn[i],"special"); type=NI;
8012         op2=source[i]&0x3f;
8013         switch(op2)
8014         {
8015           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
8016           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
8017           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
8018           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
8019           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
8020           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
8021           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
8022           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
8023           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
8024           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
8025           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
8026           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
8027           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
8028           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
8029           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
8030           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
8031           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
8032           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
8033           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
8034           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
8035           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
8036           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
8037           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
8038           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
8039           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
8040           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
8041           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
8042           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
8043           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
8044           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
8045           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
8046           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
8047           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
8048           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
8049           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
8050 #ifndef FORCE32
8051           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
8052           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
8053           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
8054           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
8055           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
8056           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
8057           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
8058           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
8059           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
8060           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
8061           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
8062           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
8063           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
8064           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
8065           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
8066           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
8067           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
8068 #endif
8069         }
8070         break;
8071       case 0x01: strcpy(insn[i],"regimm"); type=NI;
8072         op2=(source[i]>>16)&0x1f;
8073         switch(op2)
8074         {
8075           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
8076           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
8077           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
8078           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
8079           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
8080           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
8081           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
8082           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
8083           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
8084           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
8085           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
8086           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
8087           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
8088           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
8089         }
8090         break;
8091       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
8092       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
8093       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
8094       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
8095       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
8096       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
8097       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
8098       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
8099       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
8100       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
8101       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
8102       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
8103       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
8104       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
8105       case 0x10: strcpy(insn[i],"cop0"); type=NI;
8106         op2=(source[i]>>21)&0x1f;
8107         switch(op2)
8108         {
8109           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
8110           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
8111           case 0x10: strcpy(insn[i],"tlb"); type=NI;
8112           switch(source[i]&0x3f)
8113           {
8114             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
8115             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
8116             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
8117             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
8118 #ifdef PCSX
8119             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
8120 #else
8121             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
8122 #endif
8123           }
8124         }
8125         break;
8126       case 0x11: strcpy(insn[i],"cop1"); type=NI;
8127         op2=(source[i]>>21)&0x1f;
8128         switch(op2)
8129         {
8130           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
8131           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
8132           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
8133           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
8134           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
8135           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
8136           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
8137           switch((source[i]>>16)&0x3)
8138           {
8139             case 0x00: strcpy(insn[i],"BC1F"); break;
8140             case 0x01: strcpy(insn[i],"BC1T"); break;
8141             case 0x02: strcpy(insn[i],"BC1FL"); break;
8142             case 0x03: strcpy(insn[i],"BC1TL"); break;
8143           }
8144           break;
8145           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
8146           switch(source[i]&0x3f)
8147           {
8148             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
8149             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
8150             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
8151             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
8152             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
8153             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
8154             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
8155             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
8156             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
8157             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
8158             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
8159             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
8160             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
8161             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
8162             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
8163             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
8164             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
8165             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
8166             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
8167             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
8168             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
8169             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
8170             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
8171             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
8172             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
8173             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
8174             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
8175             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
8176             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
8177             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
8178             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
8179             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
8180             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
8181             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
8182             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
8183           }
8184           break;
8185           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
8186           switch(source[i]&0x3f)
8187           {
8188             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
8189             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
8190             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
8191             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
8192             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
8193             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
8194             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
8195             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
8196             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
8197             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
8198             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
8199             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
8200             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
8201             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
8202             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
8203             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
8204             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
8205             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
8206             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
8207             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
8208             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
8209             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
8210             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
8211             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
8212             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
8213             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
8214             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
8215             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
8216             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
8217             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
8218             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
8219             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
8220             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
8221             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
8222             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
8223           }
8224           break;
8225           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
8226           switch(source[i]&0x3f)
8227           {
8228             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
8229             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
8230           }
8231           break;
8232           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
8233           switch(source[i]&0x3f)
8234           {
8235             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
8236             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
8237           }
8238           break;
8239         }
8240         break;
8241 #ifndef FORCE32
8242       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
8243       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
8244       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
8245       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
8246       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
8247       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
8248       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
8249       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
8250 #endif
8251       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
8252       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
8253       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
8254       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
8255       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
8256       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
8257       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
8258 #ifndef FORCE32
8259       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
8260 #endif
8261       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
8262       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
8263       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
8264       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
8265 #ifndef FORCE32
8266       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
8267       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
8268 #endif
8269       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
8270       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
8271       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
8272       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
8273 #ifndef FORCE32
8274       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
8275       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
8276       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
8277 #endif
8278       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
8279       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
8280 #ifndef FORCE32
8281       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
8282       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
8283       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
8284 #endif
8285 #ifdef PCSX
8286       case 0x12: strcpy(insn[i],"COP2"); type=NI;
8287         // note: COP MIPS-1 encoding differs from MIPS32
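        // A nonzero function field selects a GTE operation dispatched via
        // gte_handlers[]; a zero function field selects the MFC2/CFC2/
        // MTC2/CTC2 register moves decoded below.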
8288         op2=(source[i]>>21)&0x1f;
8289         if (source[i]&0x3f) {
8290           if (gte_handlers[source[i]&0x3f]!=NULL) {
8291             snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
8292             type=C2OP;
8293           }
8294         }
8295         else switch(op2)
8296         {
8297           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
8298           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
8299           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
8300           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
8301         }
8302         break;
8303       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
8304       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
8305       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
8306 #endif
8307       default: strcpy(insn[i],"???"); type=NI;
8308         printf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
8309         break;
8310     }
8311     itype[i]=type;
8312     opcode2[i]=op2;
8313     /* Get registers/immediates */
8314     lt1[i]=0;
8315     us1[i]=0;
8316     us2[i]=0;
8317     dep1[i]=0;
8318     dep2[i]=0;
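    // Conventions for the decode below: rs1/rs2 are guest source registers
    // and rt1/rt2 guest destinations (0 means none / $zero); us1/us2 mark
    // sources whose full 64-bit value is needed, dep1/dep2 sources whose
    // upper half the result depends on; imm is the decoded immediate.
    // CCREG, CSREG, FSREG, HIREG and LOREG are internal pseudo-registers
    // (cycle count, COP0 Status, FPU condition, HI, LO).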
8319     switch(type) {
8320       case LOAD:
8321         rs1[i]=(source[i]>>21)&0x1f;
8322         rs2[i]=0;
8323         rt1[i]=(source[i]>>16)&0x1f;
8324         rt2[i]=0;
8325         imm[i]=(short)source[i];
8326         break;
8327       case STORE:
8328       case STORELR:
8329         rs1[i]=(source[i]>>21)&0x1f;
8330         rs2[i]=(source[i]>>16)&0x1f;
8331         rt1[i]=0;
8332         rt2[i]=0;
8333         imm[i]=(short)source[i];
8334         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
8335         break;
8336       case LOADLR:
8337         // LWL/LWR only load part of the register,
8338         // therefore the target register must be treated as a source too
8339         rs1[i]=(source[i]>>21)&0x1f;
8340         rs2[i]=(source[i]>>16)&0x1f;
8341         rt1[i]=(source[i]>>16)&0x1f;
8342         rt2[i]=0;
8343         imm[i]=(short)source[i];
8344         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
8345         if(op==0x26) dep1[i]=rt1[i]; // LWR
8346         break;
8347       case IMM16:
8348         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
8349         else rs1[i]=(source[i]>>21)&0x1f;
8350         rs2[i]=0;
8351         rt1[i]=(source[i]>>16)&0x1f;
8352         rt2[i]=0;
8353         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
8354           imm[i]=(unsigned short)source[i];
8355         }else{
8356           imm[i]=(short)source[i];
8357         }
8358         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
8359         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
8360         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
8361         break;
8362       case UJUMP:
8363         rs1[i]=0;
8364         rs2[i]=0;
8365         rt1[i]=0;
8366         rt2[i]=0;
8367         // The JAL instruction writes to r31.
8368         if (op&1) {
8369           rt1[i]=31;
8370         }
8371         rs2[i]=CCREG;
8372         break;
8373       case RJUMP:
8374         rs1[i]=(source[i]>>21)&0x1f;
8375         rs2[i]=0;
8376         rt1[i]=0;
8377         rt2[i]=0;
8378         // The JALR instruction writes to rd.
8379         if (op2&1) {
8380           rt1[i]=(source[i]>>11)&0x1f;
8381         }
8382         rs2[i]=CCREG;
8383         break;
8384       case CJUMP:
8385         rs1[i]=(source[i]>>21)&0x1f;
8386         rs2[i]=(source[i]>>16)&0x1f;
8387         rt1[i]=0;
8388         rt2[i]=0;
8389         if(op&2) { // BGTZ/BLEZ
8390           rs2[i]=0;
8391         }
8392         us1[i]=rs1[i];
8393         us2[i]=rs2[i];
8394         likely[i]=op>>4;
8395         break;
8396       case SJUMP:
8397         rs1[i]=(source[i]>>21)&0x1f;
8398         rs2[i]=CCREG;
8399         rt1[i]=0;
8400         rt2[i]=0;
8401         us1[i]=rs1[i];
8402         if(op2&0x10) { // BxxAL
8403           rt1[i]=31;
8404           // NOTE: If the branch is not taken, r31 is still overwritten
8405         }
8406         likely[i]=(op2&2)>>1;
8407         break;
8408       case FJUMP:
8409         rs1[i]=FSREG;
8410         rs2[i]=CSREG;
8411         rt1[i]=0;
8412         rt2[i]=0;
8413         likely[i]=((source[i])>>17)&1;
8414         break;
8415       case ALU:
8416         rs1[i]=(source[i]>>21)&0x1f; // source
8417         rs2[i]=(source[i]>>16)&0x1f; // second source operand
8418         rt1[i]=(source[i]>>11)&0x1f; // destination
8419         rt2[i]=0;
8420         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
8421           us1[i]=rs1[i];us2[i]=rs2[i];
8422         }
8423         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
8424           dep1[i]=rs1[i];dep2[i]=rs2[i];
8425         }
8426         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
8427           dep1[i]=rs1[i];dep2[i]=rs2[i];
8428         }
8429         break;
8430       case MULTDIV:
8431         rs1[i]=(source[i]>>21)&0x1f; // source
8432         rs2[i]=(source[i]>>16)&0x1f; // second operand (divisor for DIV/DIVU)
8433         rt1[i]=HIREG;
8434         rt2[i]=LOREG;
8435         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
8436           us1[i]=rs1[i];us2[i]=rs2[i];
8437         }
8438         break;
8439       case MOV:
8440         rs1[i]=0;
8441         rs2[i]=0;
8442         rt1[i]=0;
8443         rt2[i]=0;
8444         if(op2==0x10) rs1[i]=HIREG; // MFHI
8445         if(op2==0x11) rt1[i]=HIREG; // MTHI
8446         if(op2==0x12) rs1[i]=LOREG; // MFLO
8447         if(op2==0x13) rt1[i]=LOREG; // MTLO
8448         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
8449         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
8450         dep1[i]=rs1[i];
8451         break;
8452       case SHIFT:
8453         rs1[i]=(source[i]>>16)&0x1f; // target of shift
8454         rs2[i]=(source[i]>>21)&0x1f; // shift amount
8455         rt1[i]=(source[i]>>11)&0x1f; // destination
8456         rt2[i]=0;
8457         // DSLLV/DSRLV/DSRAV are 64-bit
8458         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
8459         break;
8460       case SHIFTIMM:
8461         rs1[i]=(source[i]>>16)&0x1f;
8462         rs2[i]=0;
8463         rt1[i]=(source[i]>>11)&0x1f;
8464         rt2[i]=0;
8465         imm[i]=(source[i]>>6)&0x1f;
8466         // DSxx32 instructions
8467         if(op2>=0x3c) imm[i]|=0x20;
8468         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
8469         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
8470         break;
8471       case COP0:
8472         rs1[i]=0;
8473         rs2[i]=0;
8474         rt1[i]=0;
8475         rt2[i]=0;
8476         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
8477         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
8478         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8479         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8480         break;
8481       case COP1:
8482       case COP2:
8483         rs1[i]=0;
8484         rs2[i]=0;
8485         rt1[i]=0;
8486         rt2[i]=0;
8487         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8488         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8489         if(op2==5) us1[i]=rs1[i]; // DMTC1
8490         rs2[i]=CSREG;
8491         break;
8492       case C1LS:
8493         rs1[i]=(source[i]>>21)&0x1F;
8494         rs2[i]=CSREG;
8495         rt1[i]=0;
8496         rt2[i]=0;
8497         imm[i]=(short)source[i];
8498         break;
8499       case C2LS:
8500         rs1[i]=(source[i]>>21)&0x1F;
8501         rs2[i]=0;
8502         rt1[i]=0;
8503         rt2[i]=0;
8504         imm[i]=(short)source[i];
8505         break;
8506       case FLOAT:
8507       case FCONV:
8508         rs1[i]=0;
8509         rs2[i]=CSREG;
8510         rt1[i]=0;
8511         rt2[i]=0;
8512         break;
8513       case FCOMP:
8514         rs1[i]=FSREG;
8515         rs2[i]=CSREG;
8516         rt1[i]=FSREG;
8517         rt2[i]=0;
8518         break;
8519       case SYSCALL:
8520       case HLECALL:
8521       case INTCALL:
8522         rs1[i]=CCREG;
8523         rs2[i]=0;
8524         rt1[i]=0;
8525         rt2[i]=0;
8526         break;
8527       default:
8528         rs1[i]=0;
8529         rs2[i]=0;
8530         rt1[i]=0;
8531         rt2[i]=0;
8532     }
8533     /* Calculate branch target addresses */
8534     if(type==UJUMP)
8535       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8536     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8537       ba[i]=start+i*4+8; // Ignore never taken branch
8538     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8539       ba[i]=start+i*4+8; // Ignore never taken branch
8540     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8541       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8542     else ba[i]=-1;
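    // These follow the MIPS encodings: J/JAL replace the low 28 bits of the
    // delay-slot PC with instr_index<<2, conditional branches add
    // sign_extend(imm16)<<2 to the delay-slot PC.  Illustrative example:
    // J encoded as 0x08000100 with its delay slot at 0x80010010 gives
    // ba = (0x80010010&0xF0000000)|(0x100<<2) = 0x80000400.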
8543 #ifdef PCSX
8544     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
8545       int do_in_intrp=0;
8546       // branch in delay slot?
8547       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
8548         // not handled: convert the first branch to an interpreter call in case it's hit
8549         printf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
8550         do_in_intrp=1;
8551       }
8552       // basic load delay detection
8553       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
8554         int t=(ba[i-1]-start)/4;
8555         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
8556           // jump target wants DS result - potential load delay effect
8557           printf("load delay @%08x (%08x)\n", addr + i*4, addr);
8558           do_in_intrp=1;
8559           bt[t+1]=1; // expected return from interpreter
8560         }
8561         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
8562               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
8563           // v0 overwrite like this is a sign of trouble, bail out
8564           printf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
8565           do_in_intrp=1;
8566         }
8567       }
8568       if(do_in_intrp) {
8569         rs1[i-1]=CCREG;
8570         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
8571         ba[i-1]=-1;
8572         itype[i-1]=INTCALL;
8573         done=2;
8574         i--; // don't compile the DS
8575       }
8576     }
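    /* Summary of the checks above: branches in delay slots and suspected
     * load-delay / v0-clobber sequences are not recompiled; the preceding
     * branch is turned into an INTCALL so the interpreter takes over if
     * that path is ever reached. */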
8577 #endif
8578     /* Is this the end of the block? */
8579     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8580       if(rt1[i-1]==0) { // No link register written (not JAL/JALR), so end the block here
8581         done=2;
8582       }
8583       else {
8584         if(stop_after_jal) done=1;
8585         // Stop on BREAK
8586         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8587       }
8588       // Don't recompile stuff that's already compiled
8589       if(check_addr(start+i*4+4)) done=1;
8590       // Don't get too close to the limit
8591       if(i>MAXBLOCK/2) done=1;
8592     }
8593     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
8594     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
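    // done==1 ends the block unconditionally; done==2 is a tentative end
    // which is cancelled below if some earlier branch targets this
    // instruction or one of the next two.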
8595     if(done==2) {
8596       // Does the block continue due to a branch?
8597       for(j=i-1;j>=0;j--)
8598       {
8599         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
8600         if(ba[j]==start+i*4+4) done=j=0;
8601         if(ba[j]==start+i*4+8) done=j=0;
8602       }
8603     }
8604     //assert(i<MAXBLOCK-1);
8605     if(start+i*4==pagelimit-4) done=1;
8606     assert(start+i*4<pagelimit);
8607     if (i==MAXBLOCK-1) done=1;
8608     // Stop if we're compiling junk
8609     if(itype[i]==NI&&opcode[i]==0x11) {
8610       done=stop_after_jal=1;
8611       printf("Disabled speculative precompilation\n");
8612     }
8613   }
8614   slen=i;
8615   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8616     if(start+i*4==pagelimit) {
8617       itype[i-1]=SPAN;
8618     }
8619   }
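  // A branch whose delay slot would land on the next page
  // (start+i*4==pagelimit) is retyped as SPAN and handled specially by
  // pagespan_alloc() in pass 3 and the corresponding assembler pass.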
8620   assert(slen>0);
8621
8622   /* Pass 2 - Register dependencies and branch targets */
8623
8624   unneeded_registers(0,slen-1,0);
8625   
8626   /* Pass 3 - Register allocation */
8627
8628   struct regstat current; // Current register allocations/status
8629   current.is32=1;
8630   current.dirty=0;
8631   current.u=unneeded_reg[0];
8632   current.uu=unneeded_reg_upper[0];
8633   clear_all_regs(current.regmap);
8634   alloc_reg(&current,0,CCREG);
8635   dirty_reg(&current,CCREG);
8636   current.isconst=0;
8637   current.wasconst=0;
8638   int ds=0;
8639   int cc=0;
8640   int hr=-1;
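  // 'current' holds the working register state (guest->host mapping,
  // dirty/constant flags, 32/64-bit status) as the allocator walks the
  // block; regs[i] snapshots it per instruction and branch_regs[i-1]
  // records the state seen on the taken path of the branch at i-1.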
8641
8642 #ifndef FORCE32
8643   provisional_32bit();
8644 #endif
8645   if((u_int)addr&1) {
8646     // First instruction is delay slot
8647     cc=-1;
8648     bt[1]=1;
8649     ds=1;
8650     unneeded_reg[0]=1;
8651     unneeded_reg_upper[0]=1;
8652     current.regmap[HOST_BTREG]=BTREG;
8653   }
8654   
8655   for(i=0;i<slen;i++)
8656   {
8657     if(bt[i])
8658     {
8659       int hr;
8660       for(hr=0;hr<HOST_REGS;hr++)
8661       {
8662         // Is this really necessary?
8663         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8664       }
8665       current.isconst=0;
8666     }
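    // BNE/BNEL against $0 two instructions back: on its fall-through path
    // the compared register is known to be zero, so it is marked 32-bit and
    // its upper-half mapping dropped.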
8667     if(i>1)
8668     {
8669       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8670       {
8671         if(rs1[i-2]==0||rs2[i-2]==0)
8672         {
8673           if(rs1[i-2]) {
8674             current.is32|=1LL<<rs1[i-2];
8675             int hr=get_reg(current.regmap,rs1[i-2]|64);
8676             if(hr>=0) current.regmap[hr]=-1;
8677           }
8678           if(rs2[i-2]) {
8679             current.is32|=1LL<<rs2[i-2];
8680             int hr=get_reg(current.regmap,rs2[i-2]|64);
8681             if(hr>=0) current.regmap[hr]=-1;
8682           }
8683         }
8684       }
8685     }
8686 #ifndef FORCE32
8687     // If something jumps here with 64-bit values
8688     // then promote those registers to 64 bits
8689     if(bt[i])
8690     {
8691       uint64_t temp_is32=current.is32;
8692       for(j=i-1;j>=0;j--)
8693       {
8694         if(ba[j]==start+i*4) 
8695           temp_is32&=branch_regs[j].is32;
8696       }
8697       for(j=i;j<slen;j++)
8698       {
8699         if(ba[j]==start+i*4) 
8700           //temp_is32=1;
8701           temp_is32&=p32[j];
8702       }
8703       if(temp_is32!=current.is32) {
8704         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8705         #ifndef DESTRUCTIVE_WRITEBACK
8706         if(ds)
8707         #endif
8708         for(hr=0;hr<HOST_REGS;hr++)
8709         {
8710           int r=current.regmap[hr];
8711           if(r>0&&r<64)
8712           {
8713             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8714               temp_is32|=1LL<<r;
8715               //printf("restore %d\n",r);
8716             }
8717           }
8718         }
8719         current.is32=temp_is32;
8720       }
8721     }
8722 #else
8723     current.is32=-1LL;
8724 #endif
8725
8726     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8727     regs[i].wasconst=current.isconst;
8728     regs[i].was32=current.is32;
8729     regs[i].wasdirty=current.dirty;
8730     #if defined(DESTRUCTIVE_WRITEBACK) && !defined(FORCE32)
8731     // To change a dirty register from 32 to 64 bits, we must write
8732     // it out during the previous cycle (for branches, 2 cycles)
8733     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8734     {
8735       uint64_t temp_is32=current.is32;
8736       for(j=i-1;j>=0;j--)
8737       {
8738         if(ba[j]==start+i*4+4) 
8739           temp_is32&=branch_regs[j].is32;
8740       }
8741       for(j=i;j<slen;j++)
8742       {
8743         if(ba[j]==start+i*4+4) 
8744           //temp_is32=1;
8745           temp_is32&=p32[j];
8746       }
8747       if(temp_is32!=current.is32) {
8748         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8749         for(hr=0;hr<HOST_REGS;hr++)
8750         {
8751           int r=current.regmap[hr];
8752           if(r>0)
8753           {
8754             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8755               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
8756               {
8757                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
8758                 {
8759                   //printf("dump %d/r%d\n",hr,r);
8760                   current.regmap[hr]=-1;
8761                   if(get_reg(current.regmap,r|64)>=0) 
8762                     current.regmap[get_reg(current.regmap,r|64)]=-1;
8763                 }
8764               }
8765             }
8766           }
8767         }
8768       }
8769     }
8770     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
8771     {
8772       uint64_t temp_is32=current.is32;
8773       for(j=i-1;j>=0;j--)
8774       {
8775         if(ba[j]==start+i*4+8) 
8776           temp_is32&=branch_regs[j].is32;
8777       }
8778       for(j=i;j<slen;j++)
8779       {
8780         if(ba[j]==start+i*4+8) 
8781           //temp_is32=1;
8782           temp_is32&=p32[j];
8783       }
8784       if(temp_is32!=current.is32) {
8785         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8786         for(hr=0;hr<HOST_REGS;hr++)
8787         {
8788           int r=current.regmap[hr];
8789           if(r>0)
8790           {
8791             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8792               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
8793               {
8794                 //printf("dump %d/r%d\n",hr,r);
8795                 current.regmap[hr]=-1;
8796                 if(get_reg(current.regmap,r|64)>=0) 
8797                   current.regmap[get_reg(current.regmap,r|64)]=-1;
8798               }
8799             }
8800           }
8801         }
8802       }
8803     }
8804     #endif
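    // current.u / current.uu are bitmasks of guest registers whose value
    // (resp. upper 32 bits) is no longer needed after this point; bit 0 is
    // always set since $zero never needs preserving.  They start from the
    // pass 2 results and clear the bits for whatever this instruction (and,
    // for branches, its delay slot) still reads.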
8805     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8806       if(i+1<slen) {
8807         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8808         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8809         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8810         current.u|=1;
8811         current.uu|=1;
8812       } else {
8813         current.u=1;
8814         current.uu=1;
8815       }
8816     } else {
8817       if(i+1<slen) {
8818         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8819         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8820         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8821         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8822         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8823         current.u|=1;
8824         current.uu|=1;
8825       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
8826     }
8827     is_ds[i]=ds;
8828     if(ds) {
8829       ds=0; // Skip delay slot, already allocated as part of branch
8830       // ...but we need to alloc it in case something jumps here
8831       if(i+1<slen) {
8832         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8833         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8834       }else{
8835         current.u=branch_unneeded_reg[i-1];
8836         current.uu=branch_unneeded_reg_upper[i-1];
8837       }
8838       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8839       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8840       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8841       current.u|=1;
8842       current.uu|=1;
8843       struct regstat temp;
8844       memcpy(&temp,&current,sizeof(current));
8845       temp.wasdirty=temp.dirty;
8846       temp.was32=temp.is32;
8847       // TODO: Take into account unconditional branches, as below
8848       delayslot_alloc(&temp,i);
8849       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8850       regs[i].wasdirty=temp.wasdirty;
8851       regs[i].was32=temp.was32;
8852       regs[i].dirty=temp.dirty;
8853       regs[i].is32=temp.is32;
8854       regs[i].isconst=0;
8855       regs[i].wasconst=0;
8856       current.isconst=0;
8857       // Create entry (branch target) regmap
8858       for(hr=0;hr<HOST_REGS;hr++)
8859       {
8860         int r=temp.regmap[hr];
8861         if(r>=0) {
8862           if(r!=regmap_pre[i][hr]) {
8863             regs[i].regmap_entry[hr]=-1;
8864           }
8865           else
8866           {
8867             if(r<64){
8868               if((current.u>>r)&1) {
8869                 regs[i].regmap_entry[hr]=-1;
8870                 regs[i].regmap[hr]=-1;
8871                 //Don't clear regs in the delay slot as the branch might need them
8872                 //current.regmap[hr]=-1;
8873               }else
8874                 regs[i].regmap_entry[hr]=r;
8875             }
8876             else {
8877               if((current.uu>>(r&63))&1) {
8878                 regs[i].regmap_entry[hr]=-1;
8879                 regs[i].regmap[hr]=-1;
8880                 //Don't clear regs in the delay slot as the branch might need them
8881                 //current.regmap[hr]=-1;
8882               }else
8883                 regs[i].regmap_entry[hr]=r;
8884             }
8885           }
8886         } else {
8887           // First instruction expects CCREG to be allocated
8888           if(i==0&&hr==HOST_CCREG) 
8889             regs[i].regmap_entry[hr]=CCREG;
8890           else
8891             regs[i].regmap_entry[hr]=-1;
8892         }
8893       }
8894     }
8895     else { // Not delay slot
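      // Each instruction type has its own allocator below; it reserves host
      // registers for the guest sources/destinations decoded in pass 1 and
      // updates the constant/dirty state.  Branch types also decide whether
      // the delay slot can be allocated together with the branch (ooo[i]=1)
      // or must be kept separate because it clobbers a branch operand.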
8896       switch(itype[i]) {
8897         case UJUMP:
8898           //current.isconst=0; // DEBUG
8899           //current.wasconst=0; // DEBUG
8900           //regs[i].wasconst=0; // DEBUG
8901           clear_const(&current,rt1[i]);
8902           alloc_cc(&current,i);
8903           dirty_reg(&current,CCREG);
8904           if (rt1[i]==31) {
8905             alloc_reg(&current,i,31);
8906             dirty_reg(&current,31);
8907             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8908             //assert(rt1[i+1]!=rt1[i]);
8909             #ifdef REG_PREFETCH
8910             alloc_reg(&current,i,PTEMP);
8911             #endif
8912             //current.is32|=1LL<<rt1[i];
8913           }
8914           ooo[i]=1;
8915           delayslot_alloc(&current,i+1);
8916           //current.isconst=0; // DEBUG
8917           ds=1;
8918           //printf("i=%d, isconst=%x\n",i,current.isconst);
8919           break;
8920         case RJUMP:
8921           //current.isconst=0;
8922           //current.wasconst=0;
8923           //regs[i].wasconst=0;
8924           clear_const(&current,rs1[i]);
8925           clear_const(&current,rt1[i]);
8926           alloc_cc(&current,i);
8927           dirty_reg(&current,CCREG);
8928           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8929             alloc_reg(&current,i,rs1[i]);
8930             if (rt1[i]!=0) {
8931               alloc_reg(&current,i,rt1[i]);
8932               dirty_reg(&current,rt1[i]);
8933               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8934               assert(rt1[i+1]!=rt1[i]);
8935               #ifdef REG_PREFETCH
8936               alloc_reg(&current,i,PTEMP);
8937               #endif
8938             }
8939             #ifdef USE_MINI_HT
8940             if(rs1[i]==31) { // JALR
8941               alloc_reg(&current,i,RHASH);
8942               #ifndef HOST_IMM_ADDR32
8943               alloc_reg(&current,i,RHTBL);
8944               #endif
8945             }
8946             #endif
8947             delayslot_alloc(&current,i+1);
8948           } else {
8949             // The delay slot overwrites our source register,
8950             // allocate a temporary register to hold the old value.
8951             current.isconst=0;
8952             current.wasconst=0;
8953             regs[i].wasconst=0;
8954             delayslot_alloc(&current,i+1);
8955             current.isconst=0;
8956             alloc_reg(&current,i,RTEMP);
8957           }
8958           //current.isconst=0; // DEBUG
8959           ooo[i]=1;
8960           ds=1;
8961           break;
8962         case CJUMP:
8963           //current.isconst=0;
8964           //current.wasconst=0;
8965           //regs[i].wasconst=0;
8966           clear_const(&current,rs1[i]);
8967           clear_const(&current,rs2[i]);
8968           if((opcode[i]&0x3E)==4) // BEQ/BNE
8969           {
8970             alloc_cc(&current,i);
8971             dirty_reg(&current,CCREG);
8972             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8973             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8974             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8975             {
8976               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8977               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8978             }
8979             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8980                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8981               // The delay slot overwrites one of our conditions.
8982               // Allocate the branch condition registers instead.
8983               current.isconst=0;
8984               current.wasconst=0;
8985               regs[i].wasconst=0;
8986               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8987               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8988               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8989               {
8990                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8991                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8992               }
8993             }
8994             else
8995             {
8996               ooo[i]=1;
8997               delayslot_alloc(&current,i+1);
8998             }
8999           }
9000           else
9001           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
9002           {
9003             alloc_cc(&current,i);
9004             dirty_reg(&current,CCREG);
9005             alloc_reg(&current,i,rs1[i]);
9006             if(!(current.is32>>rs1[i]&1))
9007             {
9008               alloc_reg64(&current,i,rs1[i]);
9009             }
9010             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
9011               // The delay slot overwrites one of our conditions.
9012               // Allocate the branch condition registers instead.
9013               current.isconst=0;
9014               current.wasconst=0;
9015               regs[i].wasconst=0;
9016               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9017               if(!((current.is32>>rs1[i])&1))
9018               {
9019                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9020               }
9021             }
9022             else
9023             {
9024               ooo[i]=1;
9025               delayslot_alloc(&current,i+1);
9026             }
9027           }
9028           else
9029           // Don't alloc the delay slot yet because we might not execute it
9030           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
9031           {
9032             current.isconst=0;
9033             current.wasconst=0;
9034             regs[i].wasconst=0;
9035             alloc_cc(&current,i);
9036             dirty_reg(&current,CCREG);
9037             alloc_reg(&current,i,rs1[i]);
9038             alloc_reg(&current,i,rs2[i]);
9039             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9040             {
9041               alloc_reg64(&current,i,rs1[i]);
9042               alloc_reg64(&current,i,rs2[i]);
9043             }
9044           }
9045           else
9046           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
9047           {
9048             current.isconst=0;
9049             current.wasconst=0;
9050             regs[i].wasconst=0;
9051             alloc_cc(&current,i);
9052             dirty_reg(&current,CCREG);
9053             alloc_reg(&current,i,rs1[i]);
9054             if(!(current.is32>>rs1[i]&1))
9055             {
9056               alloc_reg64(&current,i,rs1[i]);
9057             }
9058           }
9059           ds=1;
9060           //current.isconst=0;
9061           break;
9062         case SJUMP:
9063           //current.isconst=0;
9064           //current.wasconst=0;
9065           //regs[i].wasconst=0;
9066           clear_const(&current,rs1[i]);
9067           clear_const(&current,rt1[i]);
9068           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
9069           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
9070           {
9071             alloc_cc(&current,i);
9072             dirty_reg(&current,CCREG);
9073             alloc_reg(&current,i,rs1[i]);
9074             if(!(current.is32>>rs1[i]&1))
9075             {
9076               alloc_reg64(&current,i,rs1[i]);
9077             }
9078             if (rt1[i]==31) { // BLTZAL/BGEZAL
9079               alloc_reg(&current,i,31);
9080               dirty_reg(&current,31);
9081               //#ifdef REG_PREFETCH
9082               //alloc_reg(&current,i,PTEMP);
9083               //#endif
9084               //current.is32|=1LL<<rt1[i];
9085             }
9086             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
9087                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
9088               // Allocate the branch condition registers instead.
9089               current.isconst=0;
9090               current.wasconst=0;
9091               regs[i].wasconst=0;
9092               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9093               if(!((current.is32>>rs1[i])&1))
9094               {
9095                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9096               }
9097             }
9098             else
9099             {
9100               ooo[i]=1;
9101               delayslot_alloc(&current,i+1);
9102             }
9103           }
9104           else
9105           // Don't alloc the delay slot yet because we might not execute it
9106           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
9107           {
9108             current.isconst=0;
9109             current.wasconst=0;
9110             regs[i].wasconst=0;
9111             alloc_cc(&current,i);
9112             dirty_reg(&current,CCREG);
9113             alloc_reg(&current,i,rs1[i]);
9114             if(!(current.is32>>rs1[i]&1))
9115             {
9116               alloc_reg64(&current,i,rs1[i]);
9117             }
9118           }
9119           ds=1;
9120           //current.isconst=0;
9121           break;
9122         case FJUMP:
9123           current.isconst=0;
9124           current.wasconst=0;
9125           regs[i].wasconst=0;
9126           if(likely[i]==0) // BC1F/BC1T
9127           {
9128             // TODO: Theoretically we can run out of registers here on x86.
9129             // The delay slot can allocate up to six, and we need to check
9130             // CSREG before executing the delay slot.  Possibly we can drop
9131             // the cycle count and then reload it after checking that the
9132             // FPU is in a usable state, or don't do out-of-order execution.
9133             alloc_cc(&current,i);
9134             dirty_reg(&current,CCREG);
9135             alloc_reg(&current,i,FSREG);
9136             alloc_reg(&current,i,CSREG);
9137             if(itype[i+1]==FCOMP) {
9138               // The delay slot overwrites the branch condition.
9139               // Allocate the branch condition registers instead.
9140               alloc_cc(&current,i);
9141               dirty_reg(&current,CCREG);
9142               alloc_reg(&current,i,CSREG);
9143               alloc_reg(&current,i,FSREG);
9144             }
9145             else {
9146               ooo[i]=1;
9147               delayslot_alloc(&current,i+1);
9148               alloc_reg(&current,i+1,CSREG);
9149             }
9150           }
9151           else
9152           // Don't alloc the delay slot yet because we might not execute it
9153           if(likely[i]) // BC1FL/BC1TL
9154           {
9155             alloc_cc(&current,i);
9156             dirty_reg(&current,CCREG);
9157             alloc_reg(&current,i,CSREG);
9158             alloc_reg(&current,i,FSREG);
9159           }
9160           ds=1;
9161           current.isconst=0;
9162           break;
9163         case IMM16:
9164           imm16_alloc(&current,i);
9165           break;
9166         case LOAD:
9167         case LOADLR:
9168           load_alloc(&current,i);
9169           break;
9170         case STORE:
9171         case STORELR:
9172           store_alloc(&current,i);
9173           break;
9174         case ALU:
9175           alu_alloc(&current,i);
9176           break;
9177         case SHIFT:
9178           shift_alloc(&current,i);
9179           break;
9180         case MULTDIV:
9181           multdiv_alloc(&current,i);
9182           break;
9183         case SHIFTIMM:
9184           shiftimm_alloc(&current,i);
9185           break;
9186         case MOV:
9187           mov_alloc(&current,i);
9188           break;
9189         case COP0:
9190           cop0_alloc(&current,i);
9191           break;
9192         case COP1:
9193         case COP2:
9194           cop1_alloc(&current,i);
9195           break;
9196         case C1LS:
9197           c1ls_alloc(&current,i);
9198           break;
9199         case C2LS:
9200           c2ls_alloc(&current,i);
9201           break;
9202         case C2OP:
9203           c2op_alloc(&current,i);
9204           break;
9205         case FCONV:
9206           fconv_alloc(&current,i);
9207           break;
9208         case FLOAT:
9209           float_alloc(&current,i);
9210           break;
9211         case FCOMP:
9212           fcomp_alloc(&current,i);
9213           break;
9214         case SYSCALL:
9215         case HLECALL:
9216         case INTCALL:
9217           syscall_alloc(&current,i);
9218           break;
9219         case SPAN:
9220           pagespan_alloc(&current,i);
9221           break;
9222       }
9223       
9224       // Drop the upper half of registers that have become 32-bit
9225       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
9226       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9227         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9228         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9229         current.uu|=1;
9230       } else {
9231         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
9232         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9233         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9234         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9235         current.uu|=1;
9236       }
9237
9238       // Create entry (branch target) regmap
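      // regs[i].regmap_entry records the mapping a jump into this
      // instruction has to provide; -1 entries are (re)loaded locally, so
      // only the entries >= 0 need to match at branch targets.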
9239       for(hr=0;hr<HOST_REGS;hr++)
9240       {
9241         int r,or,er;
9242         r=current.regmap[hr];
9243         if(r>=0) {
9244           if(r!=regmap_pre[i][hr]) {
9245             // TODO: delay slot (?)
9246             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
9247             if(or<0||(r&63)>=TEMPREG){
9248               regs[i].regmap_entry[hr]=-1;
9249             }
9250             else
9251             {
9252               // Just move it to a different register
9253               regs[i].regmap_entry[hr]=r;
9254               // If it was dirty before, it's still dirty
9255               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
9256             }
9257           }
9258           else
9259           {
9260             // Unneeded
9261             if(r==0){
9262               regs[i].regmap_entry[hr]=0;
9263             }
9264             else
9265             if(r<64){
9266               if((current.u>>r)&1) {
9267                 regs[i].regmap_entry[hr]=-1;
9268                 //regs[i].regmap[hr]=-1;
9269                 current.regmap[hr]=-1;
9270               }else
9271                 regs[i].regmap_entry[hr]=r;
9272             }
9273             else {
9274               if((current.uu>>(r&63))&1) {
9275                 regs[i].regmap_entry[hr]=-1;
9276                 //regs[i].regmap[hr]=-1;
9277                 current.regmap[hr]=-1;
9278               }else
9279                 regs[i].regmap_entry[hr]=r;
9280             }
9281           }
9282         } else {
9283           // Branches expect CCREG to be allocated at the target
9284           if(regmap_pre[i][hr]==CCREG) 
9285             regs[i].regmap_entry[hr]=CCREG;
9286           else
9287             regs[i].regmap_entry[hr]=-1;
9288         }
9289       }
9290       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
9291     }
9292     /* Branch post-alloc */
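    /* branch_regs[i-1] is filled in here with the register state seen on
     * the taken path of the branch at i-1 (branch plus delay slot), while
     * 'current' continues to describe the fall-through path where one
     * exists. */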
9293     if(i>0)
9294     {
9295       current.was32=current.is32;
9296       current.wasdirty=current.dirty;
9297       switch(itype[i-1]) {
9298         case UJUMP:
9299           memcpy(&branch_regs[i-1],&current,sizeof(current));
9300           branch_regs[i-1].isconst=0;
9301           branch_regs[i-1].wasconst=0;
9302           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9303           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9304           alloc_cc(&branch_regs[i-1],i-1);
9305           dirty_reg(&branch_regs[i-1],CCREG);
9306           if(rt1[i-1]==31) { // JAL
9307             alloc_reg(&branch_regs[i-1],i-1,31);
9308             dirty_reg(&branch_regs[i-1],31);
9309             branch_regs[i-1].is32|=1LL<<31;
9310           }
9311           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9312           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9313           break;
9314         case RJUMP:
9315           memcpy(&branch_regs[i-1],&current,sizeof(current));
9316           branch_regs[i-1].isconst=0;
9317           branch_regs[i-1].wasconst=0;
9318           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9319           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9320           alloc_cc(&branch_regs[i-1],i-1);
9321           dirty_reg(&branch_regs[i-1],CCREG);
9322           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
9323           if(rt1[i-1]!=0) { // JALR
9324             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
9325             dirty_reg(&branch_regs[i-1],rt1[i-1]);
9326             branch_regs[i-1].is32|=1LL<<rt1[i-1];
9327           }
9328           #ifdef USE_MINI_HT
9329           if(rs1[i-1]==31) { // JALR
9330             alloc_reg(&branch_regs[i-1],i-1,RHASH);
9331             #ifndef HOST_IMM_ADDR32
9332             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
9333             #endif
9334           }
9335           #endif
9336           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9337           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9338           break;
9339         case CJUMP:
9340           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
9341           {
9342             alloc_cc(&current,i-1);
9343             dirty_reg(&current,CCREG);
9344             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
9345                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
9346               // The delay slot overwrote one of our conditions
9347               // Delay slot goes after the test (in order)
9348               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9349               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9350               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9351               current.u|=1;
9352               current.uu|=1;
9353               delayslot_alloc(&current,i);
9354               current.isconst=0;
9355             }
9356             else
9357             {
9358               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9359               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9360               // Alloc the branch condition registers
9361               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
9362               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
9363               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
9364               {
9365                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
9366                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
9367               }
9368             }
9369             memcpy(&branch_regs[i-1],&current,sizeof(current));
9370             branch_regs[i-1].isconst=0;
9371             branch_regs[i-1].wasconst=0;
9372             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9373             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9374           }
9375           else
9376           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
9377           {
9378             alloc_cc(&current,i-1);
9379             dirty_reg(&current,CCREG);
9380             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9381               // The delay slot overwrote the branch condition
9382               // Delay slot goes after the test (in order)
9383               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9384               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9385               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9386               current.u|=1;
9387               current.uu|=1;
9388               delayslot_alloc(&current,i);
9389               current.isconst=0;
9390             }
9391             else
9392             {
9393               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9394               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9395               // Alloc the branch condition register
9396               alloc_reg(&current,i-1,rs1[i-1]);
9397               if(!(current.is32>>rs1[i-1]&1))
9398               {
9399                 alloc_reg64(&current,i-1,rs1[i-1]);
9400               }
9401             }
9402             memcpy(&branch_regs[i-1],&current,sizeof(current));
9403             branch_regs[i-1].isconst=0;
9404             branch_regs[i-1].wasconst=0;
9405             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9406             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9407           }
9408           else
9409           // Alloc the delay slot in case the branch is taken
9410           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
9411           {
9412             memcpy(&branch_regs[i-1],&current,sizeof(current));
9413             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9414             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9415             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9416             alloc_cc(&branch_regs[i-1],i);
9417             dirty_reg(&branch_regs[i-1],CCREG);
9418             delayslot_alloc(&branch_regs[i-1],i);
9419             branch_regs[i-1].isconst=0;
9420             alloc_reg(&current,i,CCREG); // Not taken path
9421             dirty_reg(&current,CCREG);
9422             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9423           }
9424           else
9425           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
9426           {
9427             memcpy(&branch_regs[i-1],&current,sizeof(current));
9428             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9429             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9430             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9431             alloc_cc(&branch_regs[i-1],i);
9432             dirty_reg(&branch_regs[i-1],CCREG);
9433             delayslot_alloc(&branch_regs[i-1],i);
9434             branch_regs[i-1].isconst=0;
9435             alloc_reg(&current,i,CCREG); // Not taken path
9436             dirty_reg(&current,CCREG);
9437             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9438           }
9439           break;
9440         case SJUMP:
9441           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
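               // (the 0x0E mask also matches BLTZAL/BGEZAL; their link
               // register $31 is allocated in the BxxZAL case below)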
9442           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
9443           {
9444             alloc_cc(&current,i-1);
9445             dirty_reg(&current,CCREG);
9446             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9447               // The delay slot overwrote the branch condition
9448               // Delay slot goes after the test (in order)
9449               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9450               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9451               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9452               current.u|=1;
9453               current.uu|=1;
9454               delayslot_alloc(&current,i);
9455               current.isconst=0;
9456             }
9457             else
9458             {
9459               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9460               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9461               // Alloc the branch condition register
9462               alloc_reg(&current,i-1,rs1[i-1]);
9463               if(!(current.is32>>rs1[i-1]&1))
9464               {
9465                 alloc_reg64(&current,i-1,rs1[i-1]);
9466               }
9467             }
9468             memcpy(&branch_regs[i-1],&current,sizeof(current));
9469             branch_regs[i-1].isconst=0;
9470             branch_regs[i-1].wasconst=0;
9471             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9472             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9473           }
9474           else
9475           // Alloc the delay slot in case the branch is taken
9476           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
9477           {
9478             memcpy(&branch_regs[i-1],&current,sizeof(current));
9479             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9480             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9481             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9482             alloc_cc(&branch_regs[i-1],i);
9483             dirty_reg(&branch_regs[i-1],CCREG);
9484             delayslot_alloc(&branch_regs[i-1],i);
9485             branch_regs[i-1].isconst=0;
9486             alloc_reg(&current,i,CCREG); // Not taken path
9487             dirty_reg(&current,CCREG);
9488             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9489           }
9490           // FIXME: BLTZAL/BGEZAL
9491           if(opcode2[i-1]&0x10) { // BxxZAL
9492             alloc_reg(&branch_regs[i-1],i-1,31);
9493             dirty_reg(&branch_regs[i-1],31);
9494             branch_regs[i-1].is32|=1LL<<31;
9495           }
9496           break;
9497         case FJUMP:
9498           if(likely[i-1]==0) // BC1F/BC1T
9499           {
9500             alloc_cc(&current,i-1);
9501             dirty_reg(&current,CCREG);
9502             if(itype[i]==FCOMP) {
9503               // The delay slot overwrote the branch condition
9504               // Delay slot goes after the test (in order)
9505               delayslot_alloc(&current,i);
9506               current.isconst=0;
9507             }
9508             else
9509             {
9510               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9511               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9512               // Alloc the branch condition register
9513               alloc_reg(&current,i-1,FSREG);
9514             }
9515             memcpy(&branch_regs[i-1],&current,sizeof(current));
9516             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9517           }
9518           else // BC1FL/BC1TL
9519           {
9520             // Alloc the delay slot in case the branch is taken
9521             memcpy(&branch_regs[i-1],&current,sizeof(current));
9522             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9523             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9524             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9525             alloc_cc(&branch_regs[i-1],i);
9526             dirty_reg(&branch_regs[i-1],CCREG);
9527             delayslot_alloc(&branch_regs[i-1],i);
9528             branch_regs[i-1].isconst=0;
9529             alloc_reg(&current,i,CCREG); // Not taken path
9530             dirty_reg(&current,CCREG);
9531             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9532           }
9533           break;
9534       }
9535
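           // (source>>16)==0x1000 is an unconditional "beq $zero,$zero" (b),
           // which is treated like an unconditional jump here.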
9536       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
9537       {
9538         if(rt1[i-1]==31) // JAL/JALR
9539         {
9540           // Subroutine call will return here, don't alloc any registers
9541           current.is32=1;
9542           current.dirty=0;
9543           clear_all_regs(current.regmap);
9544           alloc_reg(&current,i,CCREG);
9545           dirty_reg(&current,CCREG);
9546         }
9547         else if(i+1<slen)
9548         {
9549           // Internal branch will jump here, match registers to caller
9550           current.is32=0x3FFFFFFFFLL;
9551           current.dirty=0;
9552           clear_all_regs(current.regmap);
9553           alloc_reg(&current,i,CCREG);
9554           dirty_reg(&current,CCREG);
9555           for(j=i-1;j>=0;j--)
9556           {
9557             if(ba[j]==start+i*4+4) {
9558               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9559               current.is32=branch_regs[j].is32;
9560               current.dirty=branch_regs[j].dirty;
9561               break;
9562             }
9563           }
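               // If more than one internal branch targets this address, keep
               // only the host-register assignments they all agree on and
               // intersect their is32/dirty state.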
9564           while(j>=0) {
9565             if(ba[j]==start+i*4+4) {
9566               for(hr=0;hr<HOST_REGS;hr++) {
9567                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9568                   current.regmap[hr]=-1;
9569                 }
9570                 current.is32&=branch_regs[j].is32;
9571                 current.dirty&=branch_regs[j].dirty;
9572               }
9573             }
9574             j--;
9575           }
9576         }
9577       }
9578     }
9579
9580     // Count cycles in between branches
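         // ccadj[i] holds the cycle count accumulated since the previous
         // branch; it is reset after branches (and at SYSCALL/HLECALL) and
         // otherwise grows by one per instruction, with extra penalties for
         // stores and coprocessor loads/stores on PCSX.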
9581     ccadj[i]=cc;
9582     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
9583     {
9584       cc=0;
9585     }
9586 #ifdef PCSX
9587     else if(/*itype[i]==LOAD||*/itype[i]==STORE||itype[i]==C1LS) // load causes weird timing issues
9588     {
9589       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
9590     }
9591     else if(itype[i]==C2LS)
9592     {
9593       cc+=4;
9594     }
9595 #endif
9596     else
9597     {
9598       cc++;
9599     }
9600
9601     flush_dirty_uppers(&current);
9602     if(!is_ds[i]) {
9603       regs[i].is32=current.is32;
9604       regs[i].dirty=current.dirty;
9605       regs[i].isconst=current.isconst;
9606       memcpy(constmap[i],current.constmap,sizeof(current.constmap));
9607     }
9608     for(hr=0;hr<HOST_REGS;hr++) {
9609       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9610         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9611           regs[i].wasconst&=~(1<<hr);
9612         }
9613       }
9614     }
9615     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9616   }
9617   
9618   /* Pass 4 - Cull unused host registers */
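       // Walk the block backwards, keeping a bitmap 'nr' of host registers
       // whose contents are still needed: sources of upcoming instructions,
       // branch-target entry mappings, CCREG at branches and INVCP for
       // stores.  Anything not marked as needed is deallocated below.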
9619   
9620   uint64_t nr=0;
9621   
9622   for (i=slen-1;i>=0;i--)
9623   {
9624     int hr;
9625     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9626     {
9627       if(ba[i]<start || ba[i]>=(start+slen*4))
9628       {
9629         // Branch out of this block, don't need anything
9630         nr=0;
9631       }
9632       else
9633       {
9634         // Internal branch
9635         // Need whatever matches the target
9636         nr=0;
9637         int t=(ba[i]-start)>>2;
9638         for(hr=0;hr<HOST_REGS;hr++)
9639         {
9640           if(regs[i].regmap_entry[hr]>=0) {
9641             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9642           }
9643         }
9644       }
9645       // Conditional branch may need registers for following instructions
9646       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9647       {
9648         if(i<slen-2) {
9649           nr|=needed_reg[i+2];
9650           for(hr=0;hr<HOST_REGS;hr++)
9651           {
9652             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9653             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9654           }
9655         }
9656       }
9657       // Don't need stuff which is overwritten
9658       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9659       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9660       // Merge in delay slot
9661       for(hr=0;hr<HOST_REGS;hr++)
9662       {
9663         if(!likely[i]) {
9664           // These are overwritten unless the branch is "likely"
9665           // and the delay slot is nullified if not taken
9666           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9667           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9668         }
9669         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9670         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9671         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9672         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9673         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9674         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9675         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9676         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9677         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9678           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9679           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9680         }
9681         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9682           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9683           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9684         }
9685         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
9686           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9687           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9688         }
9689       }
9690     }
9691     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
9692     {
9693       // SYSCALL instruction (software interrupt)
9694       nr=0;
9695     }
9696     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9697     {
9698       // ERET instruction (return from interrupt)
9699       nr=0;
9700     }
9701     else // Non-branch
9702     {
9703       if(i<slen-1) {
9704         for(hr=0;hr<HOST_REGS;hr++) {
9705           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9706           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9707           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9708           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9709         }
9710       }
9711     }
9712     for(hr=0;hr<HOST_REGS;hr++)
9713     {
9714       // Overwritten registers are not needed
9715       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9716       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9717       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9718       // Source registers are needed
9719       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9720       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9721       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9722       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9723       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9724       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9725       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9726       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9727       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9728         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9729         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9730       }
9731       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9732         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9733         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9734       }
9735       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
9736         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9737         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9738       }
9739       // Don't store a register immediately after writing it,
9740       // as that may prevent dual-issue.
9741       // But do so if this is a branch target; otherwise we
9742       // might have to load the register before the branch.
9743       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9744         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9745            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9746           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9747           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9748         }
9749         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9750            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9751           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9752           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9753         }
9754       }
9755     }
9756     // Cycle count is needed at branches.  Assume it is needed at the target too.
9757     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9758       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9759       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9760     }
9761     // Save it
9762     needed_reg[i]=nr;
9763     
9764     // Deallocate unneeded registers
9765     for(hr=0;hr<HOST_REGS;hr++)
9766     {
9767       if(!((nr>>hr)&1)) {
9768         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9769         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9770            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9771            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9772         {
9773           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9774           {
9775             if(likely[i]) {
9776               regs[i].regmap[hr]=-1;
9777               regs[i].isconst&=~(1<<hr);
9778               if(i<slen-2) {
9779                 regmap_pre[i+2][hr]=-1;
9780                 regs[i+2].wasconst&=~(1<<hr);
9781               }
9782             }
9783           }
9784         }
9785         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9786         {
9787           int d1=0,d2=0,map=0,temp=0;
9788           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9789           {
9790             d1=dep1[i+1];
9791             d2=dep2[i+1];
9792           }
9793           if(using_tlb) {
9794             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
9795                itype[i+1]==STORE || itype[i+1]==STORELR ||
9796                itype[i+1]==C1LS || itype[i+1]==C2LS)
9797             map=TLREG;
9798           } else
9799           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
9800              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9801             map=INVCP;
9802           }
9803           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9804              itype[i+1]==C1LS || itype[i+1]==C2LS)
9805             temp=FTEMP;
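               // A host register at a branch can only be freed if neither the
               // branch nor its delay slot reads or writes it, and it is not
               // one of the allocator's special registers (map/temp/hash/CCREG).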
9806           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9807              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9808              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9809              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9810              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9811              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9812              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9813              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9814              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9815              regs[i].regmap[hr]!=map )
9816           {
9817             regs[i].regmap[hr]=-1;
9818             regs[i].isconst&=~(1<<hr);
9819             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9820                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9821                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9822                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9823                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9824                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9825                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9826                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9827                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9828                branch_regs[i].regmap[hr]!=map)
9829             {
9830               branch_regs[i].regmap[hr]=-1;
9831               branch_regs[i].regmap_entry[hr]=-1;
9832               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9833               {
9834                 if(!likely[i]&&i<slen-2) {
9835                   regmap_pre[i+2][hr]=-1;
9836                   regs[i+2].wasconst&=~(1<<hr);
9837                 }
9838               }
9839             }
9840           }
9841         }
9842         else
9843         {
9844           // Non-branch
9845           if(i>0)
9846           {
9847             int d1=0,d2=0,map=-1,temp=-1;
9848             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9849             {
9850               d1=dep1[i];
9851               d2=dep2[i];
9852             }
9853             if(using_tlb) {
9854               if(itype[i]==LOAD || itype[i]==LOADLR ||
9855                  itype[i]==STORE || itype[i]==STORELR ||
9856                  itype[i]==C1LS || itype[i]==C2LS)
9857               map=TLREG;
9858             } else if(itype[i]==STORE || itype[i]==STORELR ||
9859                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9860               map=INVCP;
9861             }
9862             if(itype[i]==LOADLR || itype[i]==STORELR ||
9863                itype[i]==C1LS || itype[i]==C2LS)
9864               temp=FTEMP;
9865             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9866                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9867                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9868                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9869                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9870                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9871             {
9872               if(i<slen-1&&!is_ds[i]) {
9873                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9874                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9875                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9876                 {
9877                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9878                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9879                 }
9880                 regmap_pre[i+1][hr]=-1;
9881                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9882                 regs[i+1].wasconst&=~(1<<hr);
9883               }
9884               regs[i].regmap[hr]=-1;
9885               regs[i].isconst&=~(1<<hr);
9886             }
9887           }
9888         }
9889       }
9890     }
9891   }
9892   
9893   /* Pass 5 - Pre-allocate registers */
9894   
9895   // If a register is allocated during a loop, try to allocate it for the
9896   // entire loop, if possible.  This avoids loading/storing registers
9897   // inside of the loop.
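     // For a backward branch at instruction i with target t, f_regmap records
     // the host-register assignment seen at the branch; the code below then
     // tries to extend that assignment over the whole range t..i so the value
     // stays in one host register for the entire loop body.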
9898   
9899   signed char f_regmap[HOST_REGS];
9900   clear_all_regs(f_regmap);
9901   for(i=0;i<slen-1;i++)
9902   {
9903     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9904     {
9905       if(ba[i]>=start && ba[i]<(start+i*4)) 
9906       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9907       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9908       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9909       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9910       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9911       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9912       {
9913         int t=(ba[i]-start)>>2;
9914         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9915         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
9916         for(hr=0;hr<HOST_REGS;hr++)
9917         {
9918           if(regs[i].regmap[hr]>64) {
9919             if(!((regs[i].dirty>>hr)&1))
9920               f_regmap[hr]=regs[i].regmap[hr];
9921             else f_regmap[hr]=-1;
9922           }
9923           else if(regs[i].regmap[hr]>=0) {
9924             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9925               // dealloc old register
9926               int n;
9927               for(n=0;n<HOST_REGS;n++)
9928               {
9929                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9930               }
9931               // and alloc new one
9932               f_regmap[hr]=regs[i].regmap[hr];
9933             }
9934           }
9935           if(branch_regs[i].regmap[hr]>64) {
9936             if(!((branch_regs[i].dirty>>hr)&1))
9937               f_regmap[hr]=branch_regs[i].regmap[hr];
9938             else f_regmap[hr]=-1;
9939           }
9940           else if(branch_regs[i].regmap[hr]>=0) {
9941             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9942               // dealloc old register
9943               int n;
9944               for(n=0;n<HOST_REGS;n++)
9945               {
9946                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9947               }
9948               // and alloc new one
9949               f_regmap[hr]=branch_regs[i].regmap[hr];
9950             }
9951           }
9952           if(ooo[i]) {
9953             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) 
9954               f_regmap[hr]=branch_regs[i].regmap[hr];
9955           }else{
9956             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) 
9957               f_regmap[hr]=branch_regs[i].regmap[hr];
9958           }
9959           // Avoid dirty->clean transition
9960           #ifdef DESTRUCTIVE_WRITEBACK
9961           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9962           #endif
9963           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9964           // case above, but it's always a good idea.  We can't hoist the
9965           // load if the register was already allocated, so there's no point
9966           // wasting time analyzing most of these cases.  It only "succeeds"
9967           // when the mapping was different and the load can be replaced with
9968           // a mov, which is of negligible benefit.  So such cases are
9969           // skipped below.
9970           if(f_regmap[hr]>0) {
9971             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
9972               int r=f_regmap[hr];
9973               for(j=t;j<=i;j++)
9974               {
9975                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9976                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9977                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9978                 if(r>63) {
9979                   // NB This can exclude the case where the upper-half
9980                   // register is lower numbered than the lower-half
9981                   // register.  Not sure if it's worth fixing...
9982                   if(get_reg(regs[j].regmap,r&63)<0) break;
9983                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9984                   if(regs[j].is32&(1LL<<(r&63))) break;
9985                 }
9986                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9987                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9988                   int k;
9989                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9990                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9991                     if(r>63) {
9992                       if(get_reg(regs[i].regmap,r&63)<0) break;
9993                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9994                     }
9995                     k=i;
9996                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9997                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9998                         //printf("no free regs for store %x\n",start+(k-1)*4);
9999                         break;
10000                       }
10001                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
10002                         //printf("no-match due to different register\n");
10003                         break;
10004                       }
10005                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
10006                         //printf("no-match due to branch\n");
10007                         break;
10008                       }
10009                       // call/ret fast path assumes no registers allocated
10010                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
10011                         break;
10012                       }
10013                       if(r>63) {
10014                         // NB This can exclude the case where the upper-half
10015                         // register is lower numbered than the lower-half
10016                         // register.  Not sure if it's worth fixing...
10017                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
10018                         if(regs[k-1].is32&(1LL<<(r&63))) break;
10019                       }
10020                       k--;
10021                     }
10022                     if(i<slen-1) {
10023                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
10024                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
10025                         //printf("bad match after branch\n");
10026                         break;
10027                       }
10028                     }
10029                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
10030                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
10031                       while(k<i) {
10032                         regs[k].regmap_entry[hr]=f_regmap[hr];
10033                         regs[k].regmap[hr]=f_regmap[hr];
10034                         regmap_pre[k+1][hr]=f_regmap[hr];
10035                         regs[k].wasdirty&=~(1<<hr);
10036                         regs[k].dirty&=~(1<<hr);
10037                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
10038                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
10039                         regs[k].wasconst&=~(1<<hr);
10040                         regs[k].isconst&=~(1<<hr);
10041                         k++;
10042                       }
10043                     }
10044                     else {
10045                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
10046                       break;
10047                     }
10048                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
10049                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
10050                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
10051                       regs[i].regmap_entry[hr]=f_regmap[hr];
10052                       regs[i].regmap[hr]=f_regmap[hr];
10053                       regs[i].wasdirty&=~(1<<hr);
10054                       regs[i].dirty&=~(1<<hr);
10055                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
10056                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
10057                       regs[i].wasconst&=~(1<<hr);
10058                       regs[i].isconst&=~(1<<hr);
10059                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
10060                       branch_regs[i].wasdirty&=~(1<<hr);
10061                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
10062                       branch_regs[i].regmap[hr]=f_regmap[hr];
10063                       branch_regs[i].dirty&=~(1<<hr);
10064                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
10065                       branch_regs[i].wasconst&=~(1<<hr);
10066                       branch_regs[i].isconst&=~(1<<hr);
10067                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
10068                         regmap_pre[i+2][hr]=f_regmap[hr];
10069                         regs[i+2].wasdirty&=~(1<<hr);
10070                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
10071                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
10072                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
10073                       }
10074                     }
10075                   }
10076                   for(k=t;k<j;k++) {
10077                     // Alloc register clean at beginning of loop,
10078                     // but may dirty it in pass 6
10079                     regs[k].regmap_entry[hr]=f_regmap[hr];
10080                     regs[k].regmap[hr]=f_regmap[hr];
10081                     regs[k].dirty&=~(1<<hr);
10082                     regs[k].wasconst&=~(1<<hr);
10083                     regs[k].isconst&=~(1<<hr);
10084                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
10085                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
10086                       branch_regs[k].regmap[hr]=f_regmap[hr];
10087                       branch_regs[k].dirty&=~(1<<hr);
10088                       branch_regs[k].wasconst&=~(1<<hr);
10089                       branch_regs[k].isconst&=~(1<<hr);
10090                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
10091                         regmap_pre[k+2][hr]=f_regmap[hr];
10092                         regs[k+2].wasdirty&=~(1<<hr);
10093                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
10094                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
10095                       }
10096                     }
10097                     else
10098                     {
10099                       regmap_pre[k+1][hr]=f_regmap[hr];
10100                       regs[k+1].wasdirty&=~(1<<hr);
10101                     }
10102                   }
10103                   if(regs[j].regmap[hr]==f_regmap[hr])
10104                     regs[j].regmap_entry[hr]=f_regmap[hr];
10105                   break;
10106                 }
10107                 if(j==i) break;
10108                 if(regs[j].regmap[hr]>=0)
10109                   break;
10110                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
10111                   //printf("no-match due to different register\n");
10112                   break;
10113                 }
10114                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
10115                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
10116                   break;
10117                 }
10118                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10119                 {
10120                   // Stop on unconditional branch
10121                   break;
10122                 }
10123                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
10124                 {
10125                   if(ooo[j]) {
10126                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) 
10127                       break;
10128                   }else{
10129                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) 
10130                       break;
10131                   }
10132                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
10133                     //printf("no-match due to different register (branch)\n");
10134                     break;
10135                   }
10136                 }
10137                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10138                   //printf("No free regs for store %x\n",start+j*4);
10139                   break;
10140                 }
10141                 if(f_regmap[hr]>=64) {
10142                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
10143                     break;
10144                   }
10145                   else
10146                   {
10147                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
10148                       break;
10149                     }
10150                   }
10151                 }
10152               }
10153             }
10154           }
10155         }
10156       }
10157     }else{
10158       // Non-branch or undetermined branch target
10159       for(hr=0;hr<HOST_REGS;hr++)
10160       {
10161         if(hr!=EXCLUDE_REG) {
10162           if(regs[i].regmap[hr]>64) {
10163             if(!((regs[i].dirty>>hr)&1))
10164               f_regmap[hr]=regs[i].regmap[hr];
10165           }
10166           else if(regs[i].regmap[hr]>=0) {
10167             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10168               // dealloc old register
10169               int n;
10170               for(n=0;n<HOST_REGS;n++)
10171               {
10172                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10173               }
10174               // and alloc new one
10175               f_regmap[hr]=regs[i].regmap[hr];
10176             }
10177           }
10178         }
10179       }
10180       // Try to restore cycle count at branch targets
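            // Scan forward from the branch target to the next instruction
            // that already maps HOST_CCREG; if that mapping is CCREG, keep
            // CCREG live across the gap (and extend backwards when possible).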
10181       if(bt[i]) {
10182         for(j=i;j<slen-1;j++) {
10183           if(regs[j].regmap[HOST_CCREG]!=-1) break;
10184           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10185             //printf("no free regs for store %x\n",start+j*4);
10186             break;
10187           }
10188         }
10189         if(regs[j].regmap[HOST_CCREG]==CCREG) {
10190           int k=i;
10191           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
10192           while(k<j) {
10193             regs[k].regmap_entry[HOST_CCREG]=CCREG;
10194             regs[k].regmap[HOST_CCREG]=CCREG;
10195             regmap_pre[k+1][HOST_CCREG]=CCREG;
10196             regs[k+1].wasdirty|=1<<HOST_CCREG;
10197             regs[k].dirty|=1<<HOST_CCREG;
10198             regs[k].wasconst&=~(1<<HOST_CCREG);
10199             regs[k].isconst&=~(1<<HOST_CCREG);
10200             k++;
10201           }
10202           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
10203         }
10204         // Work backwards from the branch target
10205         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
10206         {
10207           //printf("Extend backwards\n");
10208           int k;
10209           k=i;
10210           while(regs[k-1].regmap[HOST_CCREG]==-1) {
10211             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10212               //printf("no free regs for store %x\n",start+(k-1)*4);
10213               break;
10214             }
10215             k--;
10216           }
10217           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
10218             //printf("Extend CC, %x ->\n",start+k*4);
10219             while(k<=i) {
10220               regs[k].regmap_entry[HOST_CCREG]=CCREG;
10221               regs[k].regmap[HOST_CCREG]=CCREG;
10222               regmap_pre[k+1][HOST_CCREG]=CCREG;
10223               regs[k+1].wasdirty|=1<<HOST_CCREG;
10224               regs[k].dirty|=1<<HOST_CCREG;
10225               regs[k].wasconst&=~(1<<HOST_CCREG);
10226               regs[k].isconst&=~(1<<HOST_CCREG);
10227               k++;
10228             }
10229           }
10230           else {
10231             //printf("Fail Extend CC, %x ->\n",start+k*4);
10232           }
10233         }
10234       }
10235       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
10236          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
10237          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
10238          itype[i]!=FCONV&&itype[i]!=FCOMP)
10239       {
10240         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
10241       }
10242     }
10243   }
10244   
10245   // Cache memory offset or tlb map pointer if a register is available
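        // For each host register, score[] counts how many nearby loads/stores
        // could reuse a cached map/offset value, earliest_available[] is the
        // first instruction at which the register is free, and loop_start[]/
        // end[] bound the range it would cover.  The highest-scoring register
        // (score>1) is then mapped to 'reg' (MMREG or ROREG) over that range.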
10246   #ifndef HOST_IMM_ADDR32
10247   #ifndef RAM_OFFSET
10248   if(using_tlb)
10249   #endif
10250   {
10251     int earliest_available[HOST_REGS];
10252     int loop_start[HOST_REGS];
10253     int score[HOST_REGS];
10254     int end[HOST_REGS];
10255     int reg=using_tlb?MMREG:ROREG;
10256
10257     // Init
10258     for(hr=0;hr<HOST_REGS;hr++) {
10259       score[hr]=0;earliest_available[hr]=0;
10260       loop_start[hr]=MAXBLOCK;
10261     }
10262     for(i=0;i<slen-1;i++)
10263     {
10264       // Can't do anything if no registers are available
10265       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
10266         for(hr=0;hr<HOST_REGS;hr++) {
10267           score[hr]=0;earliest_available[hr]=i+1;
10268           loop_start[hr]=MAXBLOCK;
10269         }
10270       }
10271       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10272         if(!ooo[i]) {
10273           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
10274             for(hr=0;hr<HOST_REGS;hr++) {
10275               score[hr]=0;earliest_available[hr]=i+1;
10276               loop_start[hr]=MAXBLOCK;
10277             }
10278           }
10279         }else{
10280           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
10281             for(hr=0;hr<HOST_REGS;hr++) {
10282               score[hr]=0;earliest_available[hr]=i+1;
10283               loop_start[hr]=MAXBLOCK;
10284             }
10285           }
10286         }
10287       }
10288       // Mark unavailable registers
10289       for(hr=0;hr<HOST_REGS;hr++) {
10290         if(regs[i].regmap[hr]>=0) {
10291           score[hr]=0;earliest_available[hr]=i+1;
10292           loop_start[hr]=MAXBLOCK;
10293         }
10294         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10295           if(branch_regs[i].regmap[hr]>=0) {
10296             score[hr]=0;earliest_available[hr]=i+2;
10297             loop_start[hr]=MAXBLOCK;
10298           }
10299         }
10300       }
10301       // No register allocations after unconditional jumps
10302       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10303       {
10304         for(hr=0;hr<HOST_REGS;hr++) {
10305           score[hr]=0;earliest_available[hr]=i+2;
10306           loop_start[hr]=MAXBLOCK;
10307         }
10308         i++; // Skip delay slot too
10309         //printf("skip delay slot: %x\n",start+i*4);
10310       }
10311       else
10312       // Possible match
10313       if(itype[i]==LOAD||itype[i]==LOADLR||
10314          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
10315         for(hr=0;hr<HOST_REGS;hr++) {
10316           if(hr!=EXCLUDE_REG) {
10317             end[hr]=i-1;
10318             for(j=i;j<slen-1;j++) {
10319               if(regs[j].regmap[hr]>=0) break;
10320               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10321                 if(branch_regs[j].regmap[hr]>=0) break;
10322                 if(ooo[j]) {
10323                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
10324                 }else{
10325                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
10326                 }
10327               }
10328               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
10329               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10330                 int t=(ba[j]-start)>>2;
10331                 if(t<j&&t>=earliest_available[hr]) {
10332                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
10333                     // Score a point for hoisting loop invariant
10334                     if(t<loop_start[hr]) loop_start[hr]=t;
10335                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
10336                     score[hr]++;
10337                     end[hr]=j;
10338                   }
10339                 }
10340                 else if(t<j) {
10341                   if(regs[t].regmap[hr]==reg) {
10342                     // Score a point if the branch target matches this register
10343                     score[hr]++;
10344                     end[hr]=j;
10345                   }
10346                 }
10347                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
10348                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
10349                   score[hr]++;
10350                   end[hr]=j;
10351                 }
10352               }
10353               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10354               {
10355                 // Stop on unconditional branch
10356                 break;
10357               }
10358               else
10359               if(itype[j]==LOAD||itype[j]==LOADLR||
10360                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
10361                 score[hr]++;
10362                 end[hr]=j;
10363               }
10364             }
10365           }
10366         }
10367         // Find highest score and allocate that register
10368         int maxscore=0;
10369         for(hr=0;hr<HOST_REGS;hr++) {
10370           if(hr!=EXCLUDE_REG) {
10371             if(score[hr]>score[maxscore]) {
10372               maxscore=hr;
10373               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
10374             }
10375           }
10376         }
10377         if(score[maxscore]>1)
10378         {
10379           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
10380           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
10381             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
10382             assert(regs[j].regmap[maxscore]<0);
10383             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
10384             regs[j].regmap[maxscore]=reg;
10385             regs[j].dirty&=~(1<<maxscore);
10386             regs[j].wasconst&=~(1<<maxscore);
10387             regs[j].isconst&=~(1<<maxscore);
10388             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10389               branch_regs[j].regmap[maxscore]=reg;
10390               branch_regs[j].wasdirty&=~(1<<maxscore);
10391               branch_regs[j].dirty&=~(1<<maxscore);
10392               branch_regs[j].wasconst&=~(1<<maxscore);
10393               branch_regs[j].isconst&=~(1<<maxscore);
10394               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
10395                 regmap_pre[j+2][maxscore]=reg;
10396                 regs[j+2].wasdirty&=~(1<<maxscore);
10397               }
10398               // loop optimization (loop_preload)
10399               int t=(ba[j]-start)>>2;
10400               if(t==loop_start[maxscore]) {
10401                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
10402                   regs[t].regmap_entry[maxscore]=reg;
10403               }
10404             }
10405             else
10406             {
10407               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
10408                 regmap_pre[j+1][maxscore]=reg;
10409                 regs[j+1].wasdirty&=~(1<<maxscore);
10410               }
10411             }
10412           }
10413           i=j-1;
10414           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
10415           for(hr=0;hr<HOST_REGS;hr++) {
10416             score[hr]=0;earliest_available[hr]=i+i;
10417             loop_start[hr]=MAXBLOCK;
10418           }
10419         }
10420       }
10421     }
10422   }
10423   #endif
10424   
10425   // This allocates registers (if possible) one instruction prior
10426   // to use, which can avoid a load-use penalty on certain CPUs.
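        // e.g. if instruction i+1 reads a source that is not yet live in a
        // host register, the register chosen for it at i+1 is also mapped at
        // i, so the value is loaded one instruction ahead of its first use.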
10427   for(i=0;i<slen-1;i++)
10428   {
10429     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
10430     {
10431       if(!bt[i+1])
10432       {
10433         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
10434            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
10435         {
10436           if(rs1[i+1]) {
10437             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
10438             {
10439               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10440               {
10441                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10442                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10443                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10444                 regs[i].isconst&=~(1<<hr);
10445                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10446                 constmap[i][hr]=constmap[i+1][hr];
10447                 regs[i+1].wasdirty&=~(1<<hr);
10448                 regs[i].dirty&=~(1<<hr);
10449               }
10450             }
10451           }
10452           if(rs2[i+1]) {
10453             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
10454             {
10455               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10456               {
10457                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10458                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10459                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10460                 regs[i].isconst&=~(1<<hr);
10461                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10462                 constmap[i][hr]=constmap[i+1][hr];
10463                 regs[i+1].wasdirty&=~(1<<hr);
10464                 regs[i].dirty&=~(1<<hr);
10465               }
10466             }
10467           }
10468           // Preload target address for load instruction (non-constant)
10469           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10470             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10471             {
10472               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10473               {
10474                 regs[i].regmap[hr]=rs1[i+1];
10475                 regmap_pre[i+1][hr]=rs1[i+1];
10476                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10477                 regs[i].isconst&=~(1<<hr);
10478                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10479                 constmap[i][hr]=constmap[i+1][hr];
10480                 regs[i+1].wasdirty&=~(1<<hr);
10481                 regs[i].dirty&=~(1<<hr);
10482               }
10483             }
10484           }
10485           // Load source into target register 
10486           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10487             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10488             {
10489               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10490               {
10491                 regs[i].regmap[hr]=rs1[i+1];
10492                 regmap_pre[i+1][hr]=rs1[i+1];
10493                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10494                 regs[i].isconst&=~(1<<hr);
10495                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10496                 constmap[i][hr]=constmap[i+1][hr];
10497                 regs[i+1].wasdirty&=~(1<<hr);
10498                 regs[i].dirty&=~(1<<hr);
10499               }
10500             }
10501           }
10502           // Preload map address
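                // If the address source for i+1 is a known constant, the
                // memory-map lookup (TLREG) can be generated one instruction
                // early using MGEN1+((i+1)&1), moving TLREG to another host
                // register if necessary.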
10503           #ifndef HOST_IMM_ADDR32
10504           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
10505             hr=get_reg(regs[i+1].regmap,TLREG);
10506             if(hr>=0) {
10507               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
10508               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
10509                 int nr;
10510                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10511                 {
10512                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
10513                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
10514                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
10515                   regs[i].isconst&=~(1<<hr);
10516                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10517                   constmap[i][hr]=constmap[i+1][hr];
10518                   regs[i+1].wasdirty&=~(1<<hr);
10519                   regs[i].dirty&=~(1<<hr);
10520                 }
10521                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10522                 {
10523                   // move it to another register
10524                   regs[i+1].regmap[hr]=-1;
10525                   regmap_pre[i+2][hr]=-1;
10526                   regs[i+1].regmap[nr]=TLREG;
10527                   regmap_pre[i+2][nr]=TLREG;
10528                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
10529                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
10530                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
10531                   regs[i].isconst&=~(1<<nr);
10532                   regs[i+1].isconst&=~(1<<nr);
10533                   regs[i].dirty&=~(1<<nr);
10534                   regs[i+1].wasdirty&=~(1<<nr);
10535                   regs[i+1].dirty&=~(1<<nr);
10536                   regs[i+2].wasdirty&=~(1<<nr);
10537                 }
10538               }
10539             }
10540           }
10541           #endif
10542           // Address for store instruction (non-constant)
10543           if(itype[i+1]==STORE||itype[i+1]==STORELR
10544              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
10545             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10546               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
10547               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10548               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
10549               assert(hr>=0);
10550               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10551               {
10552                 regs[i].regmap[hr]=rs1[i+1];
10553                 regmap_pre[i+1][hr]=rs1[i+1];
10554                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10555                 regs[i].isconst&=~(1<<hr);
10556                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10557                 constmap[i][hr]=constmap[i+1][hr];
10558                 regs[i+1].wasdirty&=~(1<<hr);
10559                 regs[i].dirty&=~(1<<hr);
10560               }
10561             }
10562           }
10563           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
10564             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10565               int nr;
10566               hr=get_reg(regs[i+1].regmap,FTEMP);
10567               assert(hr>=0);
10568               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10569               {
10570                 regs[i].regmap[hr]=rs1[i+1];
10571                 regmap_pre[i+1][hr]=rs1[i+1];
10572                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10573                 regs[i].isconst&=~(1<<hr);
10574                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10575                 constmap[i][hr]=constmap[i+1][hr];
10576                 regs[i+1].wasdirty&=~(1<<hr);
10577                 regs[i].dirty&=~(1<<hr);
10578               }
10579               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10580               {
10581                 // move it to another register
10582                 regs[i+1].regmap[hr]=-1;
10583                 regmap_pre[i+2][hr]=-1;
10584                 regs[i+1].regmap[nr]=FTEMP;
10585                 regmap_pre[i+2][nr]=FTEMP;
10586                 regs[i].regmap[nr]=rs1[i+1];
10587                 regmap_pre[i+1][nr]=rs1[i+1];
10588                 regs[i+1].regmap_entry[nr]=rs1[i+1];
10589                 regs[i].isconst&=~(1<<nr);
10590                 regs[i+1].isconst&=~(1<<nr);
10591                 regs[i].dirty&=~(1<<nr);
10592                 regs[i+1].wasdirty&=~(1<<nr);
10593                 regs[i+1].dirty&=~(1<<nr);
10594                 regs[i+2].wasdirty&=~(1<<nr);
10595               }
10596             }
10597           }
10598           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
10599             if(itype[i+1]==LOAD) 
10600               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
10601             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
10602               hr=get_reg(regs[i+1].regmap,FTEMP);
10603             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
10604               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
10605               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10606             }
10607             if(hr>=0&&regs[i].regmap[hr]<0) {
10608               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
10609               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
10610                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
10611                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
10612                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
10613                 regs[i].isconst&=~(1<<hr);
10614                 regs[i+1].wasdirty&=~(1<<hr);
10615                 regs[i].dirty&=~(1<<hr);
10616               }
10617             }
10618           }
10619         }
10620       }
10621     }
10622   }
10623   
10624   /* Pass 6 - Optimize clean/dirty state */
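        // clean_registers() essentially decides, per instruction, which dirty host
        // registers really have to be written back to the register file and which
        // writebacks can be skipped because no later code observes the old value.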
10625   clean_registers(0,slen-1,1);
10626   
10627   /* Pass 7 - Identify 32-bit registers */
10628 #ifndef FORCE32
10629   provisional_r32();
10630
10631   u_int r32=0;
10632   
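        // Walk the block backwards.  r32 accumulates the registers that must still
        // be known to be 32-bit (sign-extension of their low word) when instruction
        // i runs; the per-instruction snapshot, requires_32bit[], is later used to
        // restrict which cached register states may enter the block (see the
        // jump_in handling further down).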
10633   for (i=slen-1;i>=0;i--)
10634   {
10635     int hr;
10636     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10637     {
10638       if(ba[i]<start || ba[i]>=(start+slen*4))
10639       {
10640         // Branch out of this block, don't need anything
10641         r32=0;
10642       }
10643       else
10644       {
10645         // Internal branch
10646         // Need whatever matches the target
10647         // (and doesn't get overwritten by the delay slot instruction)
10648         r32=0;
10649         int t=(ba[i]-start)>>2;
10650         if(ba[i]>start+i*4) {
10651           // Forward branch
10652           if(!(requires_32bit[t]&~regs[i].was32))
10653             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10654         }else{
10655           // Backward branch
10656           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
10657           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10658           if(!(pr32[t]&~regs[i].was32))
10659             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10660         }
10661       }
10662       // Conditional branch may need registers for following instructions
10663       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10664       {
10665         if(i<slen-2) {
10666           r32|=requires_32bit[i+2];
10667           r32&=regs[i].was32;
10668           // Mark this address as a branch target since it may be called
10669           // upon return from interrupt
10670           bt[i+2]=1;
10671         }
10672       }
10673       // Merge in delay slot
10674       if(!likely[i]) {
10675         // These are overwritten unless the branch is "likely"
10676         // and the delay slot is nullified if not taken
10677         r32&=~(1LL<<rt1[i+1]);
10678         r32&=~(1LL<<rt2[i+1]);
10679       }
10680       // Assume these are needed (delay slot)
10681       if(us1[i+1]>0)
10682       {
10683         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
10684       }
10685       if(us2[i+1]>0)
10686       {
10687         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
10688       }
10689       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
10690       {
10691         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
10692       }
10693       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
10694       {
10695         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
10696       }
10697     }
10698     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
10699     {
10700       // SYSCALL instruction (software interrupt)
10701       r32=0;
10702     }
10703     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
10704     {
10705       // ERET instruction (return from interrupt)
10706       r32=0;
10707     }
10708     // Update the 32-bit requirement set for this instruction:
10708     // destination registers drop out, source registers known to be 32-bit are added
10709     r32&=~(1LL<<rt1[i]);
10710     r32&=~(1LL<<rt2[i]);
10711     if(us1[i]>0)
10712     {
10713       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
10714     }
10715     if(us2[i]>0)
10716     {
10717       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
10718     }
10719     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
10720     {
10721       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
10722     }
10723     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
10724     {
10725       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
10726     }
10727     requires_32bit[i]=r32;
10728     
10729     // Dirty registers which are 32-bit require 32-bit input,
10730     // as they will be written back as 32-bit values
10731     for(hr=0;hr<HOST_REGS;hr++)
10732     {
10733       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
10734         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
10735           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
10736           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
10737         }
10738       }
10739     }
10740     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
10741   }
10742 #else
10743   for (i=slen-1;i>=0;i--)
10744   {
10745     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10746     {
10747       // Conditional branch
10748       if((source[i]>>16)!=0x1000&&i<slen-2) {
10749         // Mark this address as a branch target since it may be called
10750         // upon return from interrupt
10751         bt[i+2]=1;
10752       }
10753     }
10754   }
10755 #endif
10756
10757   if(itype[slen-1]==SPAN) {
10758     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
10759   }
10760   
10761   /* Debug/disassembly */
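        // When assem_debug is routed to printf, dump the allocator's view of each
        // instruction: unneeded registers, 32-bit sets, the pre/entry/exit register
        // maps, dirty flags and known constants, plus the disassembly itself.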
10762   if((void*)assem_debug==(void*)printf) 
10763   for(i=0;i<slen;i++)
10764   {
10765     printf("U:");
10766     int r;
10767     for(r=1;r<=CCREG;r++) {
10768       if((unneeded_reg[i]>>r)&1) {
10769         if(r==HIREG) printf(" HI");
10770         else if(r==LOREG) printf(" LO");
10771         else printf(" r%d",r);
10772       }
10773     }
10774 #ifndef FORCE32
10775     printf(" UU:");
10776     for(r=1;r<=CCREG;r++) {
10777       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
10778         if(r==HIREG) printf(" HI");
10779         else if(r==LOREG) printf(" LO");
10780         else printf(" r%d",r);
10781       }
10782     }
10783     printf(" 32:");
10784     for(r=0;r<=CCREG;r++) {
10785       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10786       if((regs[i].was32>>r)&1) {
10787         if(r==CCREG) printf(" CC");
10788         else if(r==HIREG) printf(" HI");
10789         else if(r==LOREG) printf(" LO");
10790         else printf(" r%d",r);
10791       }
10792     }
10793 #endif
10794     printf("\n");
10795     #if defined(__i386__) || defined(__x86_64__)
10796     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
10797     #endif
10798     #ifdef __arm__
10799     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
10800     #endif
10801     printf("needs: ");
10802     if(needed_reg[i]&1) printf("eax ");
10803     if((needed_reg[i]>>1)&1) printf("ecx ");
10804     if((needed_reg[i]>>2)&1) printf("edx ");
10805     if((needed_reg[i]>>3)&1) printf("ebx ");
10806     if((needed_reg[i]>>5)&1) printf("ebp ");
10807     if((needed_reg[i]>>6)&1) printf("esi ");
10808     if((needed_reg[i]>>7)&1) printf("edi ");
10809     printf("r:");
10810     for(r=0;r<=CCREG;r++) {
10811       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10812       if((requires_32bit[i]>>r)&1) {
10813         if(r==CCREG) printf(" CC");
10814         else if(r==HIREG) printf(" HI");
10815         else if(r==LOREG) printf(" LO");
10816         else printf(" r%d",r);
10817       }
10818     }
10819     printf("\n");
10820     /*printf("pr:");
10821     for(r=0;r<=CCREG;r++) {
10822       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10823       if((pr32[i]>>r)&1) {
10824         if(r==CCREG) printf(" CC");
10825         else if(r==HIREG) printf(" HI");
10826         else if(r==LOREG) printf(" LO");
10827         else printf(" r%d",r);
10828       }
10829     }
10830     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
10831     printf("\n");*/
10832     #if defined(__i386__) || defined(__x86_64__)
10833     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
10834     printf("dirty: ");
10835     if(regs[i].wasdirty&1) printf("eax ");
10836     if((regs[i].wasdirty>>1)&1) printf("ecx ");
10837     if((regs[i].wasdirty>>2)&1) printf("edx ");
10838     if((regs[i].wasdirty>>3)&1) printf("ebx ");
10839     if((regs[i].wasdirty>>5)&1) printf("ebp ");
10840     if((regs[i].wasdirty>>6)&1) printf("esi ");
10841     if((regs[i].wasdirty>>7)&1) printf("edi ");
10842     #endif
10843     #ifdef __arm__
10844     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
10845     printf("dirty: ");
10846     if(regs[i].wasdirty&1) printf("r0 ");
10847     if((regs[i].wasdirty>>1)&1) printf("r1 ");
10848     if((regs[i].wasdirty>>2)&1) printf("r2 ");
10849     if((regs[i].wasdirty>>3)&1) printf("r3 ");
10850     if((regs[i].wasdirty>>4)&1) printf("r4 ");
10851     if((regs[i].wasdirty>>5)&1) printf("r5 ");
10852     if((regs[i].wasdirty>>6)&1) printf("r6 ");
10853     if((regs[i].wasdirty>>7)&1) printf("r7 ");
10854     if((regs[i].wasdirty>>8)&1) printf("r8 ");
10855     if((regs[i].wasdirty>>9)&1) printf("r9 ");
10856     if((regs[i].wasdirty>>10)&1) printf("r10 ");
10857     if((regs[i].wasdirty>>12)&1) printf("r12 ");
10858     #endif
10859     printf("\n");
10860     disassemble_inst(i);
10861     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
10862     #if defined(__i386__) || defined(__x86_64__)
10863     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
10864     if(regs[i].dirty&1) printf("eax ");
10865     if((regs[i].dirty>>1)&1) printf("ecx ");
10866     if((regs[i].dirty>>2)&1) printf("edx ");
10867     if((regs[i].dirty>>3)&1) printf("ebx ");
10868     if((regs[i].dirty>>5)&1) printf("ebp ");
10869     if((regs[i].dirty>>6)&1) printf("esi ");
10870     if((regs[i].dirty>>7)&1) printf("edi ");
10871     #endif
10872     #ifdef __arm__
10873     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
10874     if(regs[i].dirty&1) printf("r0 ");
10875     if((regs[i].dirty>>1)&1) printf("r1 ");
10876     if((regs[i].dirty>>2)&1) printf("r2 ");
10877     if((regs[i].dirty>>3)&1) printf("r3 ");
10878     if((regs[i].dirty>>4)&1) printf("r4 ");
10879     if((regs[i].dirty>>5)&1) printf("r5 ");
10880     if((regs[i].dirty>>6)&1) printf("r6 ");
10881     if((regs[i].dirty>>7)&1) printf("r7 ");
10882     if((regs[i].dirty>>8)&1) printf("r8 ");
10883     if((regs[i].dirty>>9)&1) printf("r9 ");
10884     if((regs[i].dirty>>10)&1) printf("r10 ");
10885     if((regs[i].dirty>>12)&1) printf("r12 ");
10886     #endif
10887     printf("\n");
10888     if(regs[i].isconst) {
10889       printf("constants: ");
10890       #if defined(__i386__) || defined(__x86_64__)
10891       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
10892       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
10893       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
10894       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
10895       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
10896       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
10897       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
10898       #endif
10899       #ifdef __arm__
10900       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
10901       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
10902       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
10903       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
10904       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
10905       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
10906       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
10907       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
10908       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
10909       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
10910       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
10911       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
10912       #endif
10913       printf("\n");
10914     }
10915 #ifndef FORCE32
10916     printf(" 32:");
10917     for(r=0;r<=CCREG;r++) {
10918       if((regs[i].is32>>r)&1) {
10919         if(r==CCREG) printf(" CC");
10920         else if(r==HIREG) printf(" HI");
10921         else if(r==LOREG) printf(" LO");
10922         else printf(" r%d",r);
10923       }
10924     }
10925     printf("\n");
10926 #endif
10927     /*printf(" p32:");
10928     for(r=0;r<=CCREG;r++) {
10929       if((p32[i]>>r)&1) {
10930         if(r==CCREG) printf(" CC");
10931         else if(r==HIREG) printf(" HI");
10932         else if(r==LOREG) printf(" LO");
10933         else printf(" r%d",r);
10934       }
10935     }
10936     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
10937     else printf("\n");*/
10938     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10939       #if defined(__i386__) || defined(__x86_64__)
10940       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10941       if(branch_regs[i].dirty&1) printf("eax ");
10942       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10943       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10944       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10945       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10946       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10947       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10948       #endif
10949       #ifdef __arm__
10950       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10951       if(branch_regs[i].dirty&1) printf("r0 ");
10952       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10953       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10954       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10955       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10956       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10957       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10958       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10959       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10960       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10961       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10962       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10963       #endif
10964 #ifndef FORCE32
10965       printf(" 32:");
10966       for(r=0;r<=CCREG;r++) {
10967         if((branch_regs[i].is32>>r)&1) {
10968           if(r==CCREG) printf(" CC");
10969           else if(r==HIREG) printf(" HI");
10970           else if(r==LOREG) printf(" LO");
10971           else printf(" r%d",r);
10972         }
10973       }
10974       printf("\n");
10975 #endif
10976     }
10977   }
10978
10979   /* Pass 8 - Assembly */
10980   linkcount=0;stubcount=0;
10981   ds=0;is_delayslot=0;
10982   cop1_usable=0;
10983   uint64_t is32_pre=0;
10984   u_int dirty_pre=0;
10985   u_int beginning=(u_int)out;
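        // An odd entry address appears to mark the special case of compiling the
        // delay slot of a branch that spans a page boundary; pagespan_ds() emits
        // the entry code for it.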
10986   if((u_int)addr&1) {
10987     ds=1;
10988     pagespan_ds();
10989   }
10990   u_int instr_addr0_override=0;
10991
10992 #ifdef PCSX
10993   if (start == 0x80030000) {
10994     // nasty hack for fastbios thing
10995     // override block entry to this code
10996     instr_addr0_override=(u_int)out;
10997     emit_movimm(start,0);
10998     // abuse io address var as a flag that we
10999     // have already returned here once
11000     emit_readword((int)&address,1);
11001     emit_writeword(0,(int)&pcaddr);
11002     emit_writeword(0,(int)&address);
11003     emit_cmp(0,1);
11004     emit_jne((int)new_dyna_leave);
11005   }
11006 #endif
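        // Main assembly loop.  For each instruction: write back host registers that
        // no longer match the planned entry state, record the native entry point
        // for branch targets, load the registers chosen by the allocator, generate
        // addresses and constants, then dispatch on itype[] to the per-type
        // assembler.  Branch types set ds=1 so the following delay slot (emitted as
        // part of the branch) is skipped on the next iteration.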
11007   for(i=0;i<slen;i++)
11008   {
11009     //if(ds) printf("ds: ");
11010     if((void*)assem_debug==(void*)printf) disassemble_inst(i);
11011     if(ds) {
11012       ds=0; // Skip delay slot
11013       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
11014       instr_addr[i]=0;
11015     } else {
11016       #ifndef DESTRUCTIVE_WRITEBACK
11017       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11018       {
11019         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
11020               unneeded_reg[i],unneeded_reg_upper[i]);
11021         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
11022               unneeded_reg[i],unneeded_reg_upper[i]);
11023       }
11024       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
11025         is32_pre=branch_regs[i].is32;
11026         dirty_pre=branch_regs[i].dirty;
11027       }else{
11028         is32_pre=regs[i].is32;
11029         dirty_pre=regs[i].dirty;
11030       }
11031       #endif
11032       // write back
11033       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11034       {
11035         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
11036                       unneeded_reg[i],unneeded_reg_upper[i]);
11037         loop_preload(regmap_pre[i],regs[i].regmap_entry);
11038       }
11039       // branch target entry point
11040       instr_addr[i]=(u_int)out;
11041       assem_debug("<->\n");
11042       // load regs
11043       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
11044         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
11045       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
11046       address_generation(i,&regs[i],regs[i].regmap_entry);
11047       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
11048       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
11049       {
11050         // Load the delay slot registers if necessary
11051         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
11052           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11053         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
11054           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11055         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
11056           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11057       }
11058       else if(i+1<slen)
11059       {
11060         // Preload registers for following instruction
11061         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
11062           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
11063             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11064         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
11065           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
11066             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11067       }
11068       // TODO: if(is_ooo(i)) address_generation(i+1);
11069       if(itype[i]==CJUMP||itype[i]==FJUMP)
11070         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
11071       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
11072         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11073       if(bt[i]) cop1_usable=0;
11074       // assemble
11075       switch(itype[i]) {
11076         case ALU:
11077           alu_assemble(i,&regs[i]);break;
11078         case IMM16:
11079           imm16_assemble(i,&regs[i]);break;
11080         case SHIFT:
11081           shift_assemble(i,&regs[i]);break;
11082         case SHIFTIMM:
11083           shiftimm_assemble(i,&regs[i]);break;
11084         case LOAD:
11085           load_assemble(i,&regs[i]);break;
11086         case LOADLR:
11087           loadlr_assemble(i,&regs[i]);break;
11088         case STORE:
11089           store_assemble(i,&regs[i]);break;
11090         case STORELR:
11091           storelr_assemble(i,&regs[i]);break;
11092         case COP0:
11093           cop0_assemble(i,&regs[i]);break;
11094         case COP1:
11095           cop1_assemble(i,&regs[i]);break;
11096         case C1LS:
11097           c1ls_assemble(i,&regs[i]);break;
11098         case COP2:
11099           cop2_assemble(i,&regs[i]);break;
11100         case C2LS:
11101           c2ls_assemble(i,&regs[i]);break;
11102         case C2OP:
11103           c2op_assemble(i,&regs[i]);break;
11104         case FCONV:
11105           fconv_assemble(i,&regs[i]);break;
11106         case FLOAT:
11107           float_assemble(i,&regs[i]);break;
11108         case FCOMP:
11109           fcomp_assemble(i,&regs[i]);break;
11110         case MULTDIV:
11111           multdiv_assemble(i,&regs[i]);break;
11112         case MOV:
11113           mov_assemble(i,&regs[i]);break;
11114         case SYSCALL:
11115           syscall_assemble(i,&regs[i]);break;
11116         case HLECALL:
11117           hlecall_assemble(i,&regs[i]);break;
11118         case INTCALL:
11119           intcall_assemble(i,&regs[i]);break;
11120         case UJUMP:
11121           ujump_assemble(i,&regs[i]);ds=1;break;
11122         case RJUMP:
11123           rjump_assemble(i,&regs[i]);ds=1;break;
11124         case CJUMP:
11125           cjump_assemble(i,&regs[i]);ds=1;break;
11126         case SJUMP:
11127           sjump_assemble(i,&regs[i]);ds=1;break;
11128         case FJUMP:
11129           fjump_assemble(i,&regs[i]);ds=1;break;
11130         case SPAN:
11131           pagespan_assemble(i,&regs[i]);break;
11132       }
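            // Give the literal pool a chance to drain: after an unconditional
            // jump it can be dumped in place, otherwise literal_pool_jumpover()
            // only emits it (branching around it) once it has grown large enough.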
11133       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
11134         literal_pool(1024);
11135       else
11136         literal_pool_jumpover(256);
11137     }
11138   }
11139   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
11140   // If the block did not end with an unconditional branch,
11141   // add a jump to the next instruction.
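        // Three cases for the fall-through: the second-to-last instruction was not
        // a branch at all (flush registers and charge the cycle count here), it was
        // a conditional branch that is not "likely" (branch_regs[i-2] already
        // describes the fall-through state), or it was a "likely" branch (the delay
        // slot is skipped on fall-through, so regs[i-2] applies).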
11142   if(i>1) {
11143     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
11144       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11145       assert(i==slen);
11146       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
11147         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11148         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11149           emit_loadreg(CCREG,HOST_CCREG);
11150         emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
11151       }
11152       else if(!likely[i-2])
11153       {
11154         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
11155         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
11156       }
11157       else
11158       {
11159         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
11160         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
11161       }
11162       add_to_linker((int)out,start+i*4,0);
11163       emit_jmp(0);
11164     }
11165   }
11166   else
11167   {
11168     assert(i>0);
11169     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11170     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11171     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11172       emit_loadreg(CCREG,HOST_CCREG);
11173     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
11174     add_to_linker((int)out,start+i*4,0);
11175     emit_jmp(0);
11176   }
11177
11178   // TODO: delay slot stubs?
11179   // Stubs
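        // Stubs are the out-of-line slow paths recorded while the block body was
        // assembled: memory accesses that miss the fast path, cycle-count checks,
        // self-modifying-code (invalidation) checks, unaligned stores and
        // FPU-unusable traps.  Emitting them here keeps the common case linear.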
11180   for(i=0;i<stubcount;i++)
11181   {
11182     switch(stubs[i][0])
11183     {
11184       case LOADB_STUB:
11185       case LOADH_STUB:
11186       case LOADW_STUB:
11187       case LOADD_STUB:
11188       case LOADBU_STUB:
11189       case LOADHU_STUB:
11190         do_readstub(i);break;
11191       case STOREB_STUB:
11192       case STOREH_STUB:
11193       case STOREW_STUB:
11194       case STORED_STUB:
11195         do_writestub(i);break;
11196       case CC_STUB:
11197         do_ccstub(i);break;
11198       case INVCODE_STUB:
11199         do_invstub(i);break;
11200       case FP_STUB:
11201         do_cop1stub(i);break;
11202       case STORELR_STUB:
11203         do_unalignedwritestub(i);break;
11204     }
11205   }
11206
11207   if (instr_addr0_override)
11208     instr_addr[0] = instr_addr0_override;
11209
11210   /* Pass 9 - Linker */
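        // Resolve every branch recorded by add_to_linker(): link_addr[i][0] is the
        // location to patch, [1] the target virtual address, and [2] is nonzero
        // when the target lies inside this block.  External targets either jump
        // straight to already-compiled code found by check_addr() or fall back to
        // an emit_extjump() stub that resolves the target at run time.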
11211   for(i=0;i<linkcount;i++)
11212   {
11213     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
11214     literal_pool(64);
11215     if(!link_addr[i][2])
11216     {
11217       void *stub=out;
11218       void *addr=check_addr(link_addr[i][1]);
11219       emit_extjump(link_addr[i][0],link_addr[i][1]);
11220       if(addr) {
11221         set_jump_target(link_addr[i][0],(int)addr);
11222         add_link(link_addr[i][1],stub);
11223       }
11224       else set_jump_target(link_addr[i][0],(int)stub);
11225     }
11226     else
11227     {
11228       // Internal branch
11229       int target=(link_addr[i][1]-start)>>2;
11230       assert(target>=0&&target<slen);
11231       assert(instr_addr[target]);
11232       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11233       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
11234       //#else
11235       set_jump_target(link_addr[i][0],instr_addr[target]);
11236       //#endif
11237     }
11238   }
11239   // External Branch Targets (jump_in)
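        // Register an entry point for the block start and for every branch target.
        // Each entry gets a dirty-check stub (do_dirty_stub) which verifies the
        // MIPS source against the pristine copy kept in the shadow buffer before
        // the translated code is entered; the copy pointer wraps around when the
        // shadow buffer would overflow.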
11240   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
11241   for(i=0;i<slen;i++)
11242   {
11243     if(bt[i]||i==0)
11244     {
11245       if(instr_addr[i]) // TODO - delay slots (=null)
11246       {
11247         u_int vaddr=start+i*4;
11248         u_int page=get_page(vaddr);
11249         u_int vpage=get_vpage(vaddr);
11250         literal_pool(256);
11251         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
11252 #ifndef FORCE32
11253         if(!requires_32bit[i])
11254 #else
11255         if(1)
11256 #endif
11257         {
11258           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11259           assem_debug("jump_in: %x\n",start+i*4);
11260           ll_add(jump_dirty+vpage,vaddr,(void *)out);
11261           int entry_point=do_dirty_stub(i);
11262           ll_add(jump_in+page,vaddr,(void *)entry_point);
11263           // If there was an existing entry in the hash table,
11264           // replace it with the new address.
11265           // Don't add new entries.  We'll insert the
11266           // ones that actually get used in check_addr().
11267           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
11268           if(ht_bin[0]==vaddr) {
11269             ht_bin[1]=entry_point;
11270           }
11271           if(ht_bin[2]==vaddr) {
11272             ht_bin[3]=entry_point;
11273           }
11274         }
11275         else
11276         {
11277           u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
11278           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11279           assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
11280           //int entry_point=(int)out;
11281           ////assem_debug("entry_point: %x\n",entry_point);
11282           //load_regs_entry(i);
11283           //if(entry_point==(int)out)
11284           //  entry_point=instr_addr[i];
11285           //else
11286           //  emit_jmp(instr_addr[i]);
11287           //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11288           ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
11289           int entry_point=do_dirty_stub(i);
11290           ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11291         }
11292       }
11293     }
11294   }
11295   // Write out the literal pool if necessary
11296   literal_pool(0);
11297   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11298   // Align code
11299   if(((u_int)out)&7) emit_addnop(13);
11300   #endif
11301   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
11302   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
11303   memcpy(copy,source,slen*4);
11304   copy+=slen*4;
11305   
11306   #ifdef __arm__
11307   __clear_cache((void *)beginning,out);
11308   #endif
11309   
11310   // If we're within 256K of the end of the buffer,
11311   // start over from the beginning. (Is 256K enough?)
11312   if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
11313   
11314   // Trap writes to any of the pages we compiled
11315   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
11316     invalid_code[i]=0;
11317 #ifndef DISABLE_TLB
11318     memory_map[i]|=0x40000000;
11319     if((signed int)start>=(signed int)0xC0000000) {
11320       assert(using_tlb);
11321       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
11322       invalid_code[j]=0;
11323       memory_map[j]|=0x40000000;
11324       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
11325     }
11326 #endif
11327   }
11328 #ifdef PCSX
11329   // PCSX maps invalid_code checks for all RAM mirrors to 0x80000000..0x80000000+RAM_SIZE
11330   if(get_page(start)<(RAM_SIZE>>12))
11331     for(i=start>>12;i<=(start+slen*4)>>12;i++)
11332       invalid_code[((u_int)0x80000000>>12)|i]=0;
11333 #endif
11334   
11335   /* Pass 10 - Free memory by expiring oldest blocks */
11336   
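        // The translation cache is reused as a ring buffer.  expirep sweeps a
        // 65536-step cycle kept ahead of the output pointer, and each step clears
        // a small slice of the lookup structures (jump_in/jump_dirty lists,
        // jump_out links, hash-table entries) that reference the region about to
        // be overwritten, so stale blocks can no longer be entered.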
11337   int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
11338   while(expirep!=end)
11339   {
11340     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
11341     int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
11342     inv_debug("EXP: Phase %d\n",expirep);
11343     switch((expirep>>11)&3)
11344     {
11345       case 0:
11346         // Clear jump_in and jump_dirty
11347         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
11348         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
11349         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
11350         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
11351         break;
11352       case 1:
11353         // Clear pointers
11354         ll_kill_pointers(jump_out[expirep&2047],base,shift);
11355         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
11356         break;
11357       case 2:
11358         // Clear hash table
11359         for(i=0;i<32;i++) {
11360           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
11361           if((ht_bin[3]>>shift)==(base>>shift) ||
11362              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11363             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
11364             ht_bin[2]=ht_bin[3]=-1;
11365           }
11366           if((ht_bin[1]>>shift)==(base>>shift) ||
11367              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11368             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
11369             ht_bin[0]=ht_bin[2];
11370             ht_bin[1]=ht_bin[3];
11371             ht_bin[2]=ht_bin[3]=-1;
11372           }
11373         }
11374         break;
11375       case 3:
11376         // Clear jump_out
11377         #ifdef __arm__
11378         if((expirep&2047)==0) 
11379           do_clear_cache();
11380         #endif
11381         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
11382         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
11383         break;
11384     }
11385     expirep=(expirep+1)&65535;
11386   }
11387   return 0;
11388 }
11389
11390 // vim:shiftwidth=2:expandtab