drc: start in more consistent state
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2010 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24
25 #include "emu_if.h" //emulator interface
26
27 #include <sys/mman.h>
28
29 #ifdef __i386__
30 #include "assem_x86.h"
31 #endif
32 #ifdef __x86_64__
33 #include "assem_x64.h"
34 #endif
35 #ifdef __arm__
36 #include "assem_arm.h"
37 #endif
38
39 #define MAXBLOCK 4096
40 #define MAX_OUTPUT_BLOCK_SIZE 262144
41 #define CLOCK_DIVIDER 2
42
// Register-allocation state tracked by the recompiler.
// regmap/regmap_entry/constmap are indexed by host register number; an entry
// below zero means the host register is free (see count_free_regs()).
// NOTE(review): the exact meaning of was32/wasdirty/wasconst/u/uu is not
// visible in this part of the file -- confirm against their users.
43 struct regstat
44 {
45   signed char regmap_entry[HOST_REGS]; // host->guest register map on entry (searched by loop_reg())
46   signed char regmap[HOST_REGS]; // host->guest register map; values >=64 are handled as upper halves (see flush_dirty_uppers())
47   uint64_t was32; // presumably the previous value of is32 -- TODO confirm
48   uint64_t is32; // one bit per guest register, tested as is32>>(reg&63)
49   uint64_t wasdirty; // presumably the previous value of dirty -- TODO confirm
50   uint64_t dirty; // one bit per host register, set by dirty_reg()
51   uint64_t u; // NOTE(review): usage not visible here
52   uint64_t uu; // NOTE(review): usage not visible here
53   u_int wasconst; // presumably the previous value of isconst -- TODO confirm
54   u_int isconst; // one bit per host register, set by set_const()
55   uint64_t constmap[HOST_REGS]; // constant held by each host register when its isconst bit is set
56 };
57
// Node of a singly linked list mapping a guest virtual address to an address
// returned by the lookup functions (get_addr() and friends).
58 struct ll_entry
59 {
60   u_int vaddr; // guest virtual address used as the lookup key
61   u_int reg32; // flag word; get_addr() only accepts entries where this is 0, get_addr_32() masks it with its flags argument
62   void *addr; // value handed back to the caller on a match
63   struct ll_entry *next; // next node, NULL terminates the list
64 };
65
// File-scope recompiler state.  The MAXBLOCK-sized arrays are indexed by
// instruction position inside the block being analysed (they are accessed as
// xxx[i+j] by lsn(), needed_again() and loop_reg()).
66   u_int start; // base address of the block; branch targets are converted to indices with (ba-start)>>2
67   u_int *source; // instruction words of the block (source[i]>>16 is inspected for opcode patterns)
68   u_int pagelimit;
69   char insn[MAXBLOCK][10];
70   u_char itype[MAXBLOCK]; // instruction type, one of the "instruction types" constants
71   u_char opcode[MAXBLOCK];
72   u_char opcode2[MAXBLOCK];
73   u_char bt[MAXBLOCK];
74   u_char rs1[MAXBLOCK]; // source guest registers (0 = none/zero register)
75   u_char rs2[MAXBLOCK];
76   u_char rt1[MAXBLOCK]; // target guest registers
77   u_char rt2[MAXBLOCK];
78   u_char us1[MAXBLOCK];
79   u_char us2[MAXBLOCK];
80   u_char dep1[MAXBLOCK];
81   u_char dep2[MAXBLOCK];
82   u_char lt1[MAXBLOCK];
83   int imm[MAXBLOCK];
84   u_int ba[MAXBLOCK]; // branch target address of instruction i
85   char likely[MAXBLOCK];
86   char is_ds[MAXBLOCK];
87   char ooo[MAXBLOCK];
88   uint64_t unneeded_reg[MAXBLOCK]; // bit r set: guest register r is treated as not needed at instruction i
89   uint64_t unneeded_reg_upper[MAXBLOCK]; // presumably the same for upper 32-bit halves -- TODO confirm
90   uint64_t branch_unneeded_reg[MAXBLOCK];
91   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
92   uint64_t p32[MAXBLOCK];
93   uint64_t pr32[MAXBLOCK];
94   signed char regmap_pre[MAXBLOCK][HOST_REGS];
95   signed char regmap[MAXBLOCK][HOST_REGS];
96   signed char regmap_entry[MAXBLOCK][HOST_REGS];
97   uint64_t constmap[MAXBLOCK][HOST_REGS];
98   struct regstat regs[MAXBLOCK];
99   struct regstat branch_regs[MAXBLOCK];
100   signed char minimum_free_regs[MAXBLOCK];
101   u_int needed_reg[MAXBLOCK];
102   uint64_t requires_32bit[MAXBLOCK];
103   u_int wont_dirty[MAXBLOCK];
104   u_int will_dirty[MAXBLOCK];
105   int ccadj[MAXBLOCK];
106   int slen; // number of instructions in the block (start+slen*4 is used as its end address)
107   u_int instr_addr[MAXBLOCK];
108   u_int link_addr[MAXBLOCK][3];
109   int linkcount;
110   u_int stubs[MAXBLOCK*3][8];
111   int stubcount;
112   u_int literals[1024][2];
113   int literalcount;
114   int is_delayslot;
115   int cop1_usable;
116   u_char *out; // output position; the lookup functions use it to judge how soon a block will expire
117   struct ll_entry *jump_in[4096]; // per-page lists searched first by get_addr(); indexed by get_page()
118   struct ll_entry *jump_out[4096];
119   struct ll_entry *jump_dirty[4096]; // per-page lists of blocks needing verify_dirty(); indexed by get_vpage()
120   u_int hash_table[65536][4]  __attribute__((aligned(16))); // each bin: {vaddr0, addr0, vaddr1, addr1}; -1 marks an empty slot
121   char shadow[1048576]  __attribute__((aligned(16)));
122   void *copy;
123   int expirep;
124 #ifndef PCSX
125   u_int using_tlb;
126 #else
127   static const u_int using_tlb=0;
128 #endif
129   static u_int sp_in_mirror;
130   u_int stop_after_jal;
131   extern u_char restore_candidate[512]; // bitmap, one bit per page (set in get_addr())
132   extern int cycle_count;
133
134   /* registers that may be allocated */
135   /* 1-31 gpr */
  /* These ids share one number space with the general purpose registers.
     TEMPREG and FTEMP deliberately(?) share the value 40 -- NOTE(review):
     confirm they are never live at the same time.  MAXREG sizes the hsn[]
     array used by needed_again(); the ids above MAXREG are temporaries. */
136 #define HIREG 32 // hi
137 #define LOREG 33 // lo
138 #define FSREG 34 // FPU status (FCSR)
139 #define CSREG 35 // Coprocessor status
140 #define CCREG 36 // Cycle count
141 #define INVCP 37 // Pointer to invalid_code
142 #define MMREG 38 // Pointer to memory_map
143 #define ROREG 39 // ram offset (if rdram!=0x80000000)
144 #define TEMPREG 40
145 #define FTEMP 40 // FPU temporary register
146 #define PTEMP 41 // Prefetch temporary register
147 #define TLREG 42 // TLB mapping offset
148 #define RHASH 43 // Return address hash
149 #define RHTBL 44 // Return address hash table address
150 #define RTEMP 45 // JR/JALR address register
151 #define MAXREG 45
152 #define AGEN1 46 // Address generation temporary register
153 #define AGEN2 47 // Address generation temporary register
154 #define MGEN1 48 // Maptable address generation temporary register
155 #define MGEN2 49 // Maptable address generation temporary register
156 #define BTREG 50 // Branch target temporary register
157
158   /* instruction types */
  /* Values stored in itype[] and compared against by the analysis passes. */
159 #define NOP 0     // No operation
160 #define LOAD 1    // Load
161 #define STORE 2   // Store
162 #define LOADLR 3  // Unaligned load
163 #define STORELR 4 // Unaligned store
164 #define MOV 5     // Move 
165 #define ALU 6     // Arithmetic/logic
166 #define MULTDIV 7 // Multiply/divide
167 #define SHIFT 8   // Shift by register
168 #define SHIFTIMM 9// Shift by immediate
169 #define IMM16 10  // 16-bit immediate
170 #define RJUMP 11  // Unconditional jump to register
171 #define UJUMP 12  // Unconditional jump
172 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
173 #define SJUMP 14  // Conditional branch (regimm format)
174 #define COP0 15   // Coprocessor 0
175 #define COP1 16   // Coprocessor 1
176 #define C1LS 17   // Coprocessor 1 load/store
177 #define FJUMP 18  // Conditional branch (floating point)
178 #define FLOAT 19  // Floating point unit
179 #define FCONV 20  // Convert integer to float
180 #define FCOMP 21  // Floating point compare (sets FSREG)
181 #define SYSCALL 22// SYSCALL
182 #define OTHER 23  // Other
183 #define SPAN 24   // Branch/delay slot spans 2 pages
184 #define NI 25     // Not implemented
185 #define HLECALL 26// PCSX fake opcodes for HLE
186 #define COP2 27   // Coprocessor 2 move
187 #define C2LS 28   // Coprocessor 2 load/store
188 #define C2OP 29   // Coprocessor 2 operation
189 #define INTCALL 30// Call interpreter to handle rare corner cases
190
191   /* stubs */
  /* NOTE(review): presumably the stub kind recorded in stubs[][]; the code
     that consumes these values is outside this part of the file. */
192 #define CC_STUB 1
193 #define FP_STUB 2
194 #define LOADB_STUB 3
195 #define LOADH_STUB 4
196 #define LOADW_STUB 5
197 #define LOADD_STUB 6
198 #define LOADBU_STUB 7
199 #define LOADHU_STUB 8
200 #define STOREB_STUB 9
201 #define STOREH_STUB 10
202 #define STOREW_STUB 11
203 #define STORED_STUB 12
204 #define STORELR_STUB 13
205 #define INVCODE_STUB 14
206
207   /* branch codes */
208 #define TAKEN 1
209 #define NOTTAKEN 2
210 #define NULLDS 3
211
212 // asm linkage
// Functions shared with the assembly side.  Those declared with an empty
// parameter list are unprototyped; NOTE(review): presumably they are only
// reached from generated/assembly code and never called from C -- confirm.
213 int new_recompile_block(int addr); // get_addr() treats a return value of 0 as success
214 void *get_addr_ht(u_int vaddr);
215 void invalidate_block(u_int block);
216 void invalidate_addr(u_int addr);
217 void remove_hash(int vaddr);
218 void jump_vaddr();
219 void dyna_linker();
220 void dyna_linker_ds();
221 void verify_code();
222 void verify_code_vm();
223 void verify_code_ds();
224 void cc_interrupt();
225 void fp_exception();
226 void fp_exception_ds();
227 void jump_syscall();
228 void jump_syscall_hle();
229 void jump_eret();
230 void jump_hlecall();
231 void jump_intcall();
232 void new_dyna_leave();
233
234 // TLB
235 void TLBWI_new();
236 void TLBWR_new();
237 void read_nomem_new();
238 void read_nomemb_new();
239 void read_nomemh_new();
240 void read_nomemd_new();
241 void write_nomem_new();
242 void write_nomemb_new();
243 void write_nomemh_new();
244 void write_nomemd_new();
245 void write_rdram_new();
246 void write_rdramb_new();
247 void write_rdramh_new();
248 void write_rdramd_new();
249 extern u_int memory_map[1048576]; // indexed by vaddr>>12; get_addr() ORs 0x40000000 into an entry when it restores a block
250
251 // Needed by assembler
// Forward declarations for helpers defined later in this file and used by
// the architecture-specific assem_*.c that is #included further down.
252 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
253 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
254 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
255 void load_all_regs(signed char i_regmap[]);
256 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
257 void load_regs_entry(int t);
258 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
259
260 int tracedebug=0;
261
262 //#define DEBUG_CYCLE_COUNT 1
263
// Debug output switches.  assem_debug/inv_debug expand to nullf, which does
// nothing; nullf is declared with an empty (unprototyped) parameter list so
// the printf-style argument lists at the call sites still compile.  Swap in
// the commented-out definitions to route the messages to printf instead.
264 void nullf() {}
265 //#define assem_debug printf
266 //#define inv_debug printf
267 #define assem_debug nullf
268 #define inv_debug nullf
269
270 static void tlb_hacks()
271 {
272 #ifndef DISABLE_TLB
273   // Goldeneye hack
274   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
275   {
276     u_int addr;
277     int n;
278     switch (ROM_HEADER->Country_code&0xFF) 
279     {
280       case 0x45: // U
281         addr=0x34b30;
282         break;                   
283       case 0x4A: // J 
284         addr=0x34b70;    
285         break;    
286       case 0x50: // E 
287         addr=0x329f0;
288         break;                        
289       default: 
290         // Unknown country code
291         addr=0;
292         break;
293     }
294     u_int rom_addr=(u_int)rom;
295     #ifdef ROM_COPY
296     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
297     // in the lower 4G of memory to use this hack.  Copy it if necessary.
298     if((void *)rom>(void *)0xffffffff) {
299       munmap(ROM_COPY, 67108864);
300       if(mmap(ROM_COPY, 12582912,
301               PROT_READ | PROT_WRITE,
302               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
303               -1, 0) <= 0) {printf("mmap() failed\n");}
304       memcpy(ROM_COPY,rom,12582912);
305       rom_addr=(u_int)ROM_COPY;
306     }
307     #endif
308     if(addr) {
309       for(n=0x7F000;n<0x80000;n++) {
310         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
311       }
312     }
313   }
314 #endif
315 }
316
317 static u_int get_page(u_int vaddr)
318 {
319 #ifndef PCSX
320   u_int page=(vaddr^0x80000000)>>12;
321 #else
322   u_int page=vaddr&~0xe0000000;
323   if (page < 0x1000000)
324     page &= ~0x0e00000; // RAM mirrors
325   page>>=12;
326 #endif
327 #ifndef DISABLE_TLB
328   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
329 #endif
330   if(page>2048) page=2048+(page&2047);
331   return page;
332 }
333
334 static u_int get_vpage(u_int vaddr)
335 {
336   u_int vpage=(vaddr^0x80000000)>>12;
337 #ifndef DISABLE_TLB
338   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
339 #endif
340   if(vpage>2048) vpage=2048+(vpage&2047);
341   return vpage;
342 }
343
344 // Get address from virtual address
345 // This is called from the recompiled JR/JALR instructions
//
// Search order:
//   1. jump_in[page]: a match (with reg32==0) is pushed into the hash table
//      as the primary entry and returned.
//   2. jump_dirty[vpage]: a match is only used if it is not about to expire
//      from the cache and verify_dirty() accepts it; the page is then marked
//      valid again and flagged in restore_candidate.
//   3. Otherwise the block is recompiled and the lookup is repeated.
// If recompilation fails (non-zero result) the Status/Cause/EPC/BadVAddr/
// Context/EntryHi registers are set up for a fault and the code for address
// 0x80000000 is returned instead.
346 void *get_addr(u_int vaddr)
347 {
348   u_int page=get_page(vaddr);
349   u_int vpage=get_vpage(vaddr);
350   struct ll_entry *head;
351   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
352   head=jump_in[page];
353   while(head!=NULL) {
354     if(head->vaddr==vaddr&&head->reg32==0) {
355   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
356       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
      // Demote the current primary slot to secondary, then install the hit.
357       ht_bin[3]=ht_bin[1];
358       ht_bin[2]=ht_bin[0];
359       ht_bin[1]=(int)head->addr;
360       ht_bin[0]=vaddr;
361       return head->addr;
362     }
363     head=head->next;
364   }
365   head=jump_dirty[vpage];
366   while(head!=NULL) {
367     if(head->vaddr==vaddr&&head->reg32==0) {
368       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
369       // Don't restore blocks which are about to expire from the cache
      // (the two ifs below are nested: verify_dirty() is only called when
      // the expiry test passes)
370       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
371       if(verify_dirty(head->addr)) {
372         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
373         invalid_code[vaddr>>12]=0;
374         memory_map[vaddr>>12]|=0x40000000;
375         if(vpage<2048) {
376 #ifndef DISABLE_TLB
377           if(tlb_LUT_r[vaddr>>12]) {
378             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
379             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
380           }
381 #endif
382           restore_candidate[vpage>>3]|=1<<(vpage&7);
383         }
384         else restore_candidate[page>>3]|=1<<(page&7);
385         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
386         if(ht_bin[0]==vaddr) {
387           ht_bin[1]=(int)head->addr; // Replace existing entry
388         }
389         else
390         {
391           ht_bin[3]=ht_bin[1];
392           ht_bin[2]=ht_bin[0];
393           ht_bin[1]=(int)head->addr;
394           ht_bin[0]=vaddr;
395         }
396         return head->addr;
397       }
398     }
399     head=head->next;
400   }
401   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
402   int r=new_recompile_block(vaddr);
403   if(r==0) return get_addr(vaddr);
404   // Execute in unmapped page, generate pagefault exception
405   Status|=2;
406   Cause=(vaddr<<31)|0x8;
407   EPC=(vaddr&1)?vaddr-5:vaddr;
408   BadVAddr=(vaddr&~1);
409   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
410   EntryHi=BadVAddr&0xFFFFE000;
411   return get_addr_ht(0x80000000);
412 }
413 // Look up address in hash table first
414 void *get_addr_ht(u_int vaddr)
415 {
416   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
417   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
418   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
419   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
420   return get_addr(vaddr);
421 }
422
// Variant of get_addr() that takes an additional flags word.
// With FORCE32 defined it simply forwards to get_addr().  Otherwise:
//   - the hash table is probed first;
//   - list entries match when (reg32 & flags) == 0 rather than reg32 == 0;
//   - a hit is only cached in the hash table when its reg32 is 0, and only
//     into a slot that is currently empty (-1); existing entries are never
//     evicted here.
// NOTE(review): after a successful recompile the retry goes through
// get_addr(), which ignores flags -- confirm this is intended.
423 void *get_addr_32(u_int vaddr,u_int flags)
424 {
425 #ifdef FORCE32
426   return get_addr(vaddr);
427 #else
428   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
429   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
430   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
431   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
432   u_int page=get_page(vaddr);
433   u_int vpage=get_vpage(vaddr);
434   struct ll_entry *head;
435   head=jump_in[page];
436   while(head!=NULL) {
437     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
438       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
439       if(head->reg32==0) {
440         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
441         if(ht_bin[0]==-1) {
442           ht_bin[1]=(int)head->addr;
443           ht_bin[0]=vaddr;
444         }else if(ht_bin[2]==-1) {
445           ht_bin[3]=(int)head->addr;
446           ht_bin[2]=vaddr;
447         }
448         //ht_bin[3]=ht_bin[1];
449         //ht_bin[2]=ht_bin[0];
450         //ht_bin[1]=(int)head->addr;
451         //ht_bin[0]=vaddr;
452       }
453       return head->addr;
454     }
455     head=head->next;
456   }
457   head=jump_dirty[vpage];
458   while(head!=NULL) {
459     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
460       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
461       // Don't restore blocks which are about to expire from the cache
      // (nested ifs: verify_dirty() runs only when the expiry test passes)
462       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
463       if(verify_dirty(head->addr)) {
464         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
465         invalid_code[vaddr>>12]=0;
466         memory_map[vaddr>>12]|=0x40000000;
467         if(vpage<2048) {
468 #ifndef DISABLE_TLB
469           if(tlb_LUT_r[vaddr>>12]) {
470             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
471             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
472           }
473 #endif
474           restore_candidate[vpage>>3]|=1<<(vpage&7);
475         }
476         else restore_candidate[page>>3]|=1<<(page&7);
477         if(head->reg32==0) {
478           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
479           if(ht_bin[0]==-1) {
480             ht_bin[1]=(int)head->addr;
481             ht_bin[0]=vaddr;
482           }else if(ht_bin[2]==-1) {
483             ht_bin[3]=(int)head->addr;
484             ht_bin[2]=vaddr;
485           }
486           //ht_bin[3]=ht_bin[1];
487           //ht_bin[2]=ht_bin[0];
488           //ht_bin[1]=(int)head->addr;
489           //ht_bin[0]=vaddr;
490         }
491         return head->addr;
492       }
493     }
494     head=head->next;
495   }
496   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
497   int r=new_recompile_block(vaddr);
498   if(r==0) return get_addr(vaddr);
499   // Execute in unmapped page, generate pagefault exception
500   Status|=2;
501   Cause=(vaddr<<31)|0x8;
502   EPC=(vaddr&1)?vaddr-5:vaddr;
503   BadVAddr=(vaddr&~1);
504   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
505   EntryHi=BadVAddr&0xFFFFE000;
506   return get_addr_ht(0x80000000);
507 #endif
508 }
509
510 void clear_all_regs(signed char regmap[])
511 {
512   int hr;
513   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
514 }
515
516 signed char get_reg(signed char regmap[],int r)
517 {
518   int hr;
519   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
520   return -1;
521 }
522
523 // Find a register that is available for two consecutive cycles
524 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
525 {
526   int hr;
527   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
528   return -1;
529 }
530
531 int count_free_regs(signed char regmap[])
532 {
533   int count=0;
534   int hr;
535   for(hr=0;hr<HOST_REGS;hr++)
536   {
537     if(hr!=EXCLUDE_REG) {
538       if(regmap[hr]<0) count++;
539     }
540   }
541   return count;
542 }
543
544 void dirty_reg(struct regstat *cur,signed char reg)
545 {
546   int hr;
547   if(!reg) return;
548   for (hr=0;hr<HOST_REGS;hr++) {
549     if((cur->regmap[hr]&63)==reg) {
550       cur->dirty|=1<<hr;
551     }
552   }
553 }
554
555 // If we dirty the lower half of a 64 bit register which is now being
556 // sign-extended, we need to dump the upper half.
557 // Note: Do this only after completion of the instruction, because
558 // some instructions may need to read the full 64-bit value even if
559 // overwriting it (eg SLTI, DSRA32).
560 static void flush_dirty_uppers(struct regstat *cur)
561 {
562   int hr,reg;
563   for (hr=0;hr<HOST_REGS;hr++) {
564     if((cur->dirty>>hr)&1) {
565       reg=cur->regmap[hr];
566       if(reg>=64) 
567         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
568     }
569   }
570 }
571
572 void set_const(struct regstat *cur,signed char reg,uint64_t value)
573 {
574   int hr;
575   if(!reg) return;
576   for (hr=0;hr<HOST_REGS;hr++) {
577     if(cur->regmap[hr]==reg) {
578       cur->isconst|=1<<hr;
579       cur->constmap[hr]=value;
580     }
581     else if((cur->regmap[hr]^64)==reg) {
582       cur->isconst|=1<<hr;
583       cur->constmap[hr]=value>>32;
584     }
585   }
586 }
587
588 void clear_const(struct regstat *cur,signed char reg)
589 {
590   int hr;
591   if(!reg) return;
592   for (hr=0;hr<HOST_REGS;hr++) {
593     if((cur->regmap[hr]&63)==reg) {
594       cur->isconst&=~(1<<hr);
595     }
596   }
597 }
598
599 int is_const(struct regstat *cur,signed char reg)
600 {
601   int hr;
602   if(!reg) return 1;
603   for (hr=0;hr<HOST_REGS;hr++) {
604     if((cur->regmap[hr]&63)==reg) {
605       return (cur->isconst>>hr)&1;
606     }
607   }
608   return 0;
609 }
610 uint64_t get_const(struct regstat *cur,signed char reg)
611 {
612   int hr;
613   if(!reg) return 0;
614   for (hr=0;hr<HOST_REGS;hr++) {
615     if(cur->regmap[hr]==reg) {
616       return cur->constmap[hr];
617     }
618   }
619   printf("Unknown constant in r%d\n",reg);
620   exit(1);
621 }
622
623 // Least soon needed registers
624 // Look at the next ten instructions and see which registers
625 // will be used.  Try not to reallocate these.
//
// hsn[] is indexed by guest register id (including the special ids such as
// CCREG or INVCP).  On return hsn[r] is the distance, in instructions from
// i, at which r is next used; entries that are not touched keep whatever
// the caller stored (needed_again() pre-fills with 10).  A value of 0 or 1
// pins the register for the current instruction / delay slot.
// preferred_reg is not read or written anywhere in this function.
626 void lsn(u_char hsn[], int i, int *preferred_reg)
627 {
628   int j;
629   int b=-1;
  // Find how far ahead to look: stop at the end of the block or just after
  // an unconditional jump.
  // NOTE(review): if this loop runs to completion j is 9 when the backward
  // pass below starts, so index i+9 is read without having been checked
  // against slen -- confirm this is harmless.
630   for(j=0;j<9;j++)
631   {
632     if(i+j>=slen) {
633       j=slen-i-1;
634       break;
635     }
636     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
637     {
638       // Don't go past an unconditional jump
639       j++;
640       break;
641     }
642   }
  // Walk backwards so that the nearest use of each register wins.
643   for(;j>=0;j--)
644   {
645     if(rs1[i+j]) hsn[rs1[i+j]]=j;
646     if(rs2[i+j]) hsn[rs2[i+j]]=j;
647     if(rt1[i+j]) hsn[rt1[i+j]]=j;
648     if(rt2[i+j]) hsn[rt2[i+j]]=j;
649     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
650       // Stores can allocate zero
651       hsn[rs1[i+j]]=j;
652       hsn[rs2[i+j]]=j;
653     }
654     // On some architectures stores need invc_ptr
655     #if defined(HOST_IMM8)
656     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
657       hsn[INVCP]=j;
658     }
659     #endif
    // Branches need the cycle count; b ends up as the nearest branch.
660     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
661     {
662       hsn[CCREG]=j;
663       b=j;
664     }
665   }
666   if(b>=0)
667   {
668     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
669     {
670       // Follow first branch
      // Only source registers at the target are considered, and they can
      // only lower an existing distance.
671       int t=(ba[i+b]-start)>>2;
672       j=7-b;if(t+j>=slen) j=slen-t-1;
673       for(;j>=0;j--)
674       {
675         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
676         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
677         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
678         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
679       }
680     }
681     // TODO: preferred register based on backward branch
682   }
683   // Delay slot should preferably not overwrite branch conditions or cycle count
684   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
685     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
686     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
687     hsn[CCREG]=1;
688     // ...or hash tables
689     hsn[RHASH]=1;
690     hsn[RHTBL]=1;
691   }
692   // Coprocessor load/store needs FTEMP, even if not declared
693   if(itype[i]==C1LS||itype[i]==C2LS) {
694     hsn[FTEMP]=0;
695   }
696   // Load L/R also uses FTEMP as a temporary register
697   if(itype[i]==LOADLR) {
698     hsn[FTEMP]=0;
699   }
700   // Also SWL/SWR/SDL/SDR
701   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
702     hsn[FTEMP]=0;
703   }
704   // Don't remove the TLB registers either
705   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS || itype[i]==C2LS) {
706     hsn[TLREG]=0;
707   }
708   // Don't remove the miniht registers
709   if(itype[i]==UJUMP||itype[i]==RJUMP)
710   {
711     hsn[RHASH]=0;
712     hsn[RHTBL]=0;
713   }
714 }
715
716 // We only want to allocate registers if we're going to use them again soon
//
// Returns 1 when guest register r is read again within the look-ahead
// window sooner than one of the entries in hsn[] (see the final loop),
// 0 otherwise.  Also returns 0 straight away when instruction i sits in the
// delay slot of an unconditional jump that leaves the block.
// rn is the distance to the nearest later read of r (10 = none found).
717 int needed_again(int r, int i)
718 {
719   int j;
720   int b=-1;
721   int rn=10;
722   int hr;
723   u_char hsn[MAXREG+1];
724   int preferred_reg;
725   
726   memset(hsn,10,sizeof(hsn));
727   lsn(hsn,i,&preferred_reg);
728   
729   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
730   {
731     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
732       return 0; // Don't need any registers if exiting the block
733   }
  // Determine the look-ahead distance; besides jumps, system calls and
  // similar instructions also end the window.
734   for(j=0;j<9;j++)
735   {
736     if(i+j>=slen) {
737       j=slen-i-1;
738       break;
739     }
740     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
741     {
742       // Don't go past an unconditional jump
743       j++;
744       break;
745     }
746     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
747     {
748       break;
749     }
750   }
  // Scan backwards from the end of the window down to i+1 (instruction i
  // itself is excluded).  A later "unneeded" marking resets the distance.
751   for(;j>=1;j--)
752   {
753     if(rs1[i+j]==r) rn=j;
754     if(rs2[i+j]==r) rn=j;
755     if((unneeded_reg[i+j]>>r)&1) rn=10;
756     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
757     {
      // b is only consumed by the disabled branch-following code below.
758       b=j;
759     }
760   }
761   /*
762   if(b>=0)
763   {
764     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
765     {
766       // Follow first branch
767       int o=rn;
768       int t=(ba[i+b]-start)>>2;
769       j=7-b;if(t+j>=slen) j=slen-t-1;
770       for(;j>=0;j--)
771       {
772         if(!((unneeded_reg[t+j]>>r)&1)) {
773           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
774           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
775         }
776         else rn=o;
777       }
778     }
779   }*/
  // NOTE(review): hsn[] is filled by lsn() using guest register ids, yet it
  // is indexed here with host register numbers -- confirm this is intended.
780   for(hr=0;hr<HOST_REGS;hr++) {
781     if(hr!=EXCLUDE_REG) {
782       if(rn<hsn[hr]) return 1;
783     }
784   }
785   return 0;
786 }
787
788 // Try to match register allocations at the end of a loop with those
789 // at the beginning
790 int loop_reg(int i, int r, int hr)
791 {
792   int j,k;
793   for(j=0;j<9;j++)
794   {
795     if(i+j>=slen) {
796       j=slen-i-1;
797       break;
798     }
799     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
800     {
801       // Don't go past an unconditonal jump
802       j++;
803       break;
804     }
805   }
806   k=0;
807   if(i>0){
808     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
809       k--;
810   }
811   for(;k<j;k++)
812   {
813     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
814     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
815     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
816     {
817       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
818       {
819         int t=(ba[i+k]-start)>>2;
820         int reg=get_reg(regs[t].regmap_entry,r);
821         if(reg>=0) return reg;
822         //reg=get_reg(regs[t+1].regmap_entry,r);
823         //if(reg>=0) return reg;
824       }
825     }
826   }
827   return hr;
828 }
829
830
831 // Allocate every register, preserving source/target regs
832 void alloc_all(struct regstat *cur,int i)
833 {
834   int hr;
835   
836   for(hr=0;hr<HOST_REGS;hr++) {
837     if(hr!=EXCLUDE_REG) {
838       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
839          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
840       {
841         cur->regmap[hr]=-1;
842         cur->dirty&=~(1<<hr);
843       }
844       // Don't need zeros
845       if((cur->regmap[hr]&63)==0)
846       {
847         cur->regmap[hr]=-1;
848         cur->dirty&=~(1<<hr);
849       }
850     }
851   }
852 }
853
854
855 void div64(int64_t dividend,int64_t divisor)
856 {
857   lo=dividend/divisor;
858   hi=dividend%divisor;
859   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
860   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
861 }
862 void divu64(uint64_t dividend,uint64_t divisor)
863 {
864   lo=dividend/divisor;
865   hi=dividend%divisor;
866   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
867   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
868 }
869
870 void mult64(uint64_t m1,uint64_t m2)
871 {
872    unsigned long long int op1, op2, op3, op4;
873    unsigned long long int result1, result2, result3, result4;
874    unsigned long long int temp1, temp2, temp3, temp4;
875    int sign = 0;
876    
877    if (m1 < 0)
878      {
879     op2 = -m1;
880     sign = 1 - sign;
881      }
882    else op2 = m1;
883    if (m2 < 0)
884      {
885     op4 = -m2;
886     sign = 1 - sign;
887      }
888    else op4 = m2;
889    
890    op1 = op2 & 0xFFFFFFFF;
891    op2 = (op2 >> 32) & 0xFFFFFFFF;
892    op3 = op4 & 0xFFFFFFFF;
893    op4 = (op4 >> 32) & 0xFFFFFFFF;
894    
895    temp1 = op1 * op3;
896    temp2 = (temp1 >> 32) + op1 * op4;
897    temp3 = op2 * op3;
898    temp4 = (temp3 >> 32) + op2 * op4;
899    
900    result1 = temp1 & 0xFFFFFFFF;
901    result2 = temp2 + (temp3 & 0xFFFFFFFF);
902    result3 = (result2 >> 32) + temp4;
903    result4 = (result3 >> 32);
904    
905    lo = result1 | (result2 << 32);
906    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
907    if (sign)
908      {
909     hi = ~hi;
910     if (!lo) hi++;
911     else lo = ~lo + 1;
912      }
913 }
914
915 void multu64(uint64_t m1,uint64_t m2)
916 {
917    unsigned long long int op1, op2, op3, op4;
918    unsigned long long int result1, result2, result3, result4;
919    unsigned long long int temp1, temp2, temp3, temp4;
920    
921    op1 = m1 & 0xFFFFFFFF;
922    op2 = (m1 >> 32) & 0xFFFFFFFF;
923    op3 = m2 & 0xFFFFFFFF;
924    op4 = (m2 >> 32) & 0xFFFFFFFF;
925    
926    temp1 = op1 * op3;
927    temp2 = (temp1 >> 32) + op1 * op4;
928    temp3 = op2 * op3;
929    temp4 = (temp3 >> 32) + op2 * op4;
930    
931    result1 = temp1 & 0xFFFFFFFF;
932    result2 = temp2 + (temp3 & 0xFFFFFFFF);
933    result3 = (result2 >> 32) + temp4;
934    result4 = (result3 >> 32);
935    
936    lo = result1 | (result2 << 32);
937    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
938    
939   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
940   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
941 }
942
// Merge for a left-side partial 64-bit load: the low `bits` bits of original
// are kept and loaded, shifted left by `bits`, fills the rest.  With
// bits == 0 the result is simply loaded.  bits must be below 64.
uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
{
  uint64_t kept;
  if (bits == 0) {
    return loaded;
  }
  kept = (original << (64 - bits)) >> (64 - bits);
  return kept | (loaded << bits);
}
// Merge for a right-side partial 64-bit load.  With shift = bits^56, the
// high `shift` bits of original are kept and loaded, shifted right by
// `shift`, fills the rest.  With bits == 56 the result is simply loaded.
uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
{
  const u_int shift = bits ^ 56;
  uint64_t kept;
  if (shift == 0) {
    return loaded;
  }
  kept = (original >> (64 - shift)) << (64 - shift);
  return kept | (loaded >> shift);
}
965
966 #ifdef __i386__
967 #include "assem_x86.c"
968 #endif
969 #ifdef __x86_64__
970 #include "assem_x64.c"
971 #endif
972 #ifdef __arm__
973 #include "assem_arm.c"
974 #endif
975
976 // Add virtual address mapping to linked list
977 void ll_add(struct ll_entry **head,int vaddr,void *addr)
978 {
979   struct ll_entry *new_entry;
980   new_entry=malloc(sizeof(struct ll_entry));
981   assert(new_entry!=NULL);
982   new_entry->vaddr=vaddr;
983   new_entry->reg32=0;
984   new_entry->addr=addr;
985   new_entry->next=*head;
986   *head=new_entry;
987 }
988
989 // Add virtual address mapping for 32-bit compiled block
990 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
991 {
992   ll_add(head,vaddr,addr);
993 #ifndef FORCE32
994   (*head)->reg32=reg32;
995 #endif
996 }
997
998 // Check if an address is already compiled
999 // but don't return addresses which are about to expire from the cache
1000 void *check_addr(u_int vaddr)
1001 {
1002   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
1003   if(ht_bin[0]==vaddr) {
1004     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1005       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
1006   }
1007   if(ht_bin[2]==vaddr) {
1008     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1009       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
1010   }
1011   u_int page=get_page(vaddr);
1012   struct ll_entry *head;
1013   head=jump_in[page];
1014   while(head!=NULL) {
1015     if(head->vaddr==vaddr&&head->reg32==0) {
1016       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1017         // Update existing entry with current address
1018         if(ht_bin[0]==vaddr) {
1019           ht_bin[1]=(int)head->addr;
1020           return head->addr;
1021         }
1022         if(ht_bin[2]==vaddr) {
1023           ht_bin[3]=(int)head->addr;
1024           return head->addr;
1025         }
1026         // Insert into hash table with low priority.
1027         // Don't evict existing entries, as they are probably
1028         // addresses that are being accessed frequently.
1029         if(ht_bin[0]==-1) {
1030           ht_bin[1]=(int)head->addr;
1031           ht_bin[0]=vaddr;
1032         }else if(ht_bin[2]==-1) {
1033           ht_bin[3]=(int)head->addr;
1034           ht_bin[2]=vaddr;
1035         }
1036         return head->addr;
1037       }
1038     }
1039     head=head->next;
1040   }
1041   return 0;
1042 }
1043
1044 void remove_hash(int vaddr)
1045 {
1046   //printf("remove hash: %x\n",vaddr);
1047   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1048   if(ht_bin[2]==vaddr) {
1049     ht_bin[2]=ht_bin[3]=-1;
1050   }
1051   if(ht_bin[0]==vaddr) {
1052     ht_bin[0]=ht_bin[2];
1053     ht_bin[1]=ht_bin[3];
1054     ht_bin[2]=ht_bin[3]=-1;
1055   }
1056 }
1057
1058 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1059 {
1060   struct ll_entry *next;
1061   while(*head) {
1062     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1063        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1064     {
1065       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1066       remove_hash((*head)->vaddr);
1067       next=(*head)->next;
1068       free(*head);
1069       *head=next;
1070     }
1071     else
1072     {
1073       head=&((*head)->next);
1074     }
1075   }
1076 }
1077
1078 // Remove all entries from linked list
1079 void ll_clear(struct ll_entry **head)
1080 {
1081   struct ll_entry *cur;
1082   struct ll_entry *next;
1083   if(cur=*head) {
1084     *head=0;
1085     while(cur) {
1086       next=cur->next;
1087       free(cur);
1088       cur=next;
1089     }
1090   }
1091 }
1092
1093 // Dereference the pointers and remove if it matches
1094 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1095 {
1096   while(head) {
1097     int ptr=get_pointer(head->addr);
1098     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1099     if(((ptr>>shift)==(addr>>shift)) ||
1100        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1101     {
1102       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1103       u_int host_addr=(u_int)kill_pointer(head->addr);
1104       #ifdef __arm__
1105         needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1106       #endif
1107     }
1108     head=head->next;
1109   }
1110 }
1111
1112 // This is called when we write to a compiled block (see do_invstub)
1113 void invalidate_page(u_int page)
1114 {
1115   struct ll_entry *head;
1116   struct ll_entry *next;
1117   head=jump_in[page];
1118   jump_in[page]=0;
1119   while(head!=NULL) {
1120     inv_debug("INVALIDATE: %x\n",head->vaddr);
1121     remove_hash(head->vaddr);
1122     next=head->next;
1123     free(head);
1124     head=next;
1125   }
1126   head=jump_out[page];
1127   jump_out[page]=0;
1128   while(head!=NULL) {
1129     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1130     u_int host_addr=(u_int)kill_pointer(head->addr);
1131     #ifdef __arm__
1132       needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1133     #endif
1134     next=head->next;
1135     free(head);
1136     head=next;
1137   }
1138 }
// Invalidate the compiled code for 4K block number `block` (the code uses
// block<<12 as the address of the block).
//
// Steps, as performed below:
//  1. Walk jump_dirty[vpage] and use get_bounds() on each matching entry to
//     find the range of pages [first,last] covered by code that overlaps
//     this page.
//  2. Call invalidate_page() on this page and on neighbouring pages in that
//     range.
//  3. Set invalid_code[] for the block so that writes are no longer trapped,
//     and (unless DISABLE_TLB) rewrite the matching memory_map entries.
//  4. Reset mini_ht when USE_MINI_HT is defined.
1139 void invalidate_block(u_int block)
1140 {
1141   u_int page=get_page(block<<12);
1142   u_int vpage=get_vpage(block<<12);
1143   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1144   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
       // first/last: lowest and highest page found so far; both start at `page`
1145   u_int first,last;
1146   first=last=page;
1147   struct ll_entry *head;
1148   head=jump_dirty[vpage];
1149   //printf("page=%d vpage=%d\n",page,vpage);
1150   while(head!=NULL) {
1151     u_int start,end;
1152     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
           // start/end: bounds reported by get_bounds() for this entry
1153       get_bounds((int)head->addr,&start,&end);
1154       //printf("start: %x end: %x\n",start,end);
           // Bounds inside [0x80000000, 0x80000000+RAM_SIZE): widen first/last
           // if the range (taken relative to rdram) overlaps this page
1155       if(page<2048&&start>=0x80000000&&end<0x80000000+RAM_SIZE) {
1156         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1157           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1158           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1159         }
1160       }
1161 #ifndef DISABLE_TLB
           // Same widening for bounds at or above 0xC0000000 (signed compare),
           // with the addresses adjusted through memory_map[] first
1162       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1163         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1164           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1165           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1166         }
1167       }
1168 #endif
1169     }
1170     head=head->next;
1171   }
1172   //printf("first=%d last=%d\n",first,last);
1173   invalidate_page(page);
1174   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1175   assert(last<page+5);
1176   // Invalidate the adjacent pages if a block crosses a 4K boundary
1177   while(first<page) {
1178     invalidate_page(first);
1179     first++;
1180   }
       // NOTE(review): the bound is first<last, so page `last` itself is not
       // passed to invalidate_page() here - confirm this is intended.
1181   for(first=page+1;first<last;first++) {
1182     invalidate_page(first);
1183   }
1184   #ifdef __arm__
1185     do_clear_cache();
1186   #endif
1187   
1188   // Don't trap writes
1189   invalid_code[block]=1;
1190 #ifdef PCSX
       // Also mark the entry indexed by (0x80000000>>12)|page
1191   invalid_code[((u_int)0x80000000>>12)|page]=1;
1192 #endif
1193 #ifndef DISABLE_TLB
1194   // If there is a valid TLB entry for this page, remove write protect
1195   if(tlb_LUT_w[block]) {
1196     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1197     // CHECK: Is this right?
1198     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
         // real_block: block number taken from the TLB write entry
1199     u_int real_block=tlb_LUT_w[block]>>12;
1200     invalid_code[real_block]=1;
1201     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1202   }
1203   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1204 #endif
1205
1206   #ifdef USE_MINI_HT
       // Fill mini_ht with 0xFF bytes
1207   memset(mini_ht,-1,sizeof(mini_ht));
1208   #endif
1209 }
1210 void invalidate_addr(u_int addr)
1211 {
1212   invalidate_block(addr>>12);
1213 }
1214 // This is called when loading a save state.
1215 // Anything could have changed, so invalidate everything.
1216 void invalidate_all_pages()
1217 {
1218   u_int page,n;
1219   for(page=0;page<4096;page++)
1220     invalidate_page(page);
1221   for(page=0;page<1048576;page++)
1222     if(!invalid_code[page]) {
1223       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1224       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1225     }
1226   #ifdef __arm__
1227   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1228   #endif
1229   #ifdef USE_MINI_HT
1230   memset(mini_ht,-1,sizeof(mini_ht));
1231   #endif
1232   #ifndef DISABLE_TLB
1233   // TLB
1234   for(page=0;page<0x100000;page++) {
1235     if(tlb_LUT_r[page]) {
1236       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1237       if(!tlb_LUT_w[page]||!invalid_code[page])
1238         memory_map[page]|=0x40000000; // Write protect
1239     }
1240     else memory_map[page]=-1;
1241     if(page==0x80000) page=0xC0000;
1242   }
1243   tlb_hacks();
1244   #endif
1245 }
1246
1247 // Add an entry to jump_out after making a link
1248 void add_link(u_int vaddr,void *src)
1249 {
1250   u_int page=get_page(vaddr);
1251   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1252   ll_add(jump_out+page,vaddr,src);
1253   //int ptr=get_pointer(src);
1254   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1255 }
1256
1257 // If a code block was found to be unmodified (bit was set in
1258 // restore_candidate) and it remains unmodified (bit is clear
1259 // in invalid_code) then move the entries for that 4K page from
1260 // the dirty list to the clean list.
//
// In terms of the code below: every entry of jump_dirty[page] that passes
// all checks is added to a jump_in list (with its clean address from
// get_clean_addr) and any hash table slot already holding its vaddr is
// pointed at the clean address.  Entries are not removed from jump_dirty.
1261 void clean_blocks(u_int page)
1262 {
1263   struct ll_entry *head;
1264   inv_debug("INV: clean_blocks page=%d\n",page);
1265   head=jump_dirty[page];
1266   while(head!=NULL) {
         // Skip entries whose own 4K block is currently flagged in invalid_code
1267     if(!invalid_code[head->vaddr>>12]) {
1268       // Don't restore blocks which are about to expire from the cache
1269       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1270         u_int start,end;
1271         if(verify_dirty((int)head->addr)) {
1272           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1273           u_int i;
               // inv becomes non-zero if any check below rejects the entry
1274           u_int inv=0;
1275           get_bounds((int)head->addr,&start,&end);
               // Bounds inside rdram: OR together invalid_code[] for every
               // 4K block the range covers
1276           if(start-(u_int)rdram<RAM_SIZE) {
1277             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1278               inv|=invalid_code[i];
1279             }
1280           }
               // vaddr at or above 0xC0000000 (signed compare): reject unless
               // the address obtained through memory_map lies within [start,end)
1281           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1282             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1283             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1284             if(addr<start||addr>=end) inv=1;
1285           }
               // Otherwise reject any vaddr at or above 0x80000000+RAM_SIZE
1286           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1287             inv=1;
1288           }
1289           if(!inv) {
1290             void * clean_addr=(void *)get_clean_addr((int)head->addr);
                 // Apply the same "about to expire" test to the clean address
1291             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
                   // ppage: index of the jump_in list that receives the entry
1292               u_int ppage=page;
1293 #ifndef DISABLE_TLB
1294               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1295 #endif
1296               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1297               //printf("page=%x, addr=%x\n",page,head->vaddr);
1298               //assert(head->vaddr>>12==(page|0x80000));
1299               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1300               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
                   // Only entries with reg32==0 update the hash table
1301               if(!head->reg32) {
1302                 if(ht_bin[0]==head->vaddr) {
1303                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1304                 }
1305                 if(ht_bin[2]==head->vaddr) {
1306                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1307                 }
1308               }
1309             }
1310           }
1311         }
1312       }
1313     }
1314     head=head->next;
1315   }
1316 }
1317
1318
1319 void mov_alloc(struct regstat *current,int i)
1320 {
1321   // Note: Don't need to actually alloc the source registers
1322   if((~current->is32>>rs1[i])&1) {
1323     //alloc_reg64(current,i,rs1[i]);
1324     alloc_reg64(current,i,rt1[i]);
1325     current->is32&=~(1LL<<rt1[i]);
1326   } else {
1327     //alloc_reg(current,i,rs1[i]);
1328     alloc_reg(current,i,rt1[i]);
1329     current->is32|=(1LL<<rt1[i]);
1330   }
1331   clear_const(current,rs1[i]);
1332   clear_const(current,rt1[i]);
1333   dirty_reg(current,rt1[i]);
1334 }
1335
1336 void shiftimm_alloc(struct regstat *current,int i)
1337 {
1338   clear_const(current,rs1[i]);
1339   clear_const(current,rt1[i]);
1340   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1341   {
1342     if(rt1[i]) {
1343       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1344       else lt1[i]=rs1[i];
1345       alloc_reg(current,i,rt1[i]);
1346       current->is32|=1LL<<rt1[i];
1347       dirty_reg(current,rt1[i]);
1348     }
1349   }
1350   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1351   {
1352     if(rt1[i]) {
1353       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1354       alloc_reg64(current,i,rt1[i]);
1355       current->is32&=~(1LL<<rt1[i]);
1356       dirty_reg(current,rt1[i]);
1357     }
1358   }
1359   if(opcode2[i]==0x3c) // DSLL32
1360   {
1361     if(rt1[i]) {
1362       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1363       alloc_reg64(current,i,rt1[i]);
1364       current->is32&=~(1LL<<rt1[i]);
1365       dirty_reg(current,rt1[i]);
1366     }
1367   }
1368   if(opcode2[i]==0x3e) // DSRL32
1369   {
1370     if(rt1[i]) {
1371       alloc_reg64(current,i,rs1[i]);
1372       if(imm[i]==32) {
1373         alloc_reg64(current,i,rt1[i]);
1374         current->is32&=~(1LL<<rt1[i]);
1375       } else {
1376         alloc_reg(current,i,rt1[i]);
1377         current->is32|=1LL<<rt1[i];
1378       }
1379       dirty_reg(current,rt1[i]);
1380     }
1381   }
1382   if(opcode2[i]==0x3f) // DSRA32
1383   {
1384     if(rt1[i]) {
1385       alloc_reg64(current,i,rs1[i]);
1386       alloc_reg(current,i,rt1[i]);
1387       current->is32|=1LL<<rt1[i];
1388       dirty_reg(current,rt1[i]);
1389     }
1390   }
1391 }
1392
1393 void shift_alloc(struct regstat *current,int i)
1394 {
1395   if(rt1[i]) {
1396     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1397     {
1398       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1399       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1400       alloc_reg(current,i,rt1[i]);
1401       if(rt1[i]==rs2[i]) {
1402         alloc_reg_temp(current,i,-1);
1403         minimum_free_regs[i]=1;
1404       }
1405       current->is32|=1LL<<rt1[i];
1406     } else { // DSLLV/DSRLV/DSRAV
1407       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1408       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1409       alloc_reg64(current,i,rt1[i]);
1410       current->is32&=~(1LL<<rt1[i]);
1411       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1412       {
1413         alloc_reg_temp(current,i,-1);
1414         minimum_free_regs[i]=1;
1415       }
1416     }
1417     clear_const(current,rs1[i]);
1418     clear_const(current,rs2[i]);
1419     clear_const(current,rt1[i]);
1420     dirty_reg(current,rt1[i]);
1421   }
1422 }
1423
// Register allocation for register-register ALU instructions, selected by
// opcode2[i]: ADD/ADDU/SUB/SUBU, SLT/SLTU, AND/OR/XOR/NOR and
// DADD/DADDU/DSUB/DSUBU.  Each group allocates host registers for the
// sources and the target rt1[i] and updates the target's bit in
// current->is32.  At the end, for every opcode2 value, the constant state
// of rs1/rs2/rt1 is cleared and rt1 is marked dirty.
1424 void alu_alloc(struct regstat *current,int i)
1425 {
1426   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1427     if(rt1[i]) {
1428       if(rs1[i]&&rs2[i]) {
1429         alloc_reg(current,i,rs1[i]);
1430         alloc_reg(current,i,rs2[i]);
1431       }
1432       else {
             // At most one non-zero source: only allocate it if needed_again() says so
1433         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1434         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1435       }
1436       alloc_reg(current,i,rt1[i]);
1437     }
         // Result is flagged 32-bit (done even when rt1[i] is zero)
1438     current->is32|=1LL<<rt1[i];
1439   }
1440   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1441     if(rt1[i]) {
           // If either source is not flagged 32-bit, allocate both as 64-bit
1442       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1443       {
1444         alloc_reg64(current,i,rs1[i]);
1445         alloc_reg64(current,i,rs2[i]);
1446         alloc_reg(current,i,rt1[i]);
1447       } else {
1448         alloc_reg(current,i,rs1[i]);
1449         alloc_reg(current,i,rs2[i]);
1450         alloc_reg(current,i,rt1[i]);
1451       }
1452     }
         // The target is always flagged 32-bit
1453     current->is32|=1LL<<rt1[i];
1454   }
1455   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1456     if(rt1[i]) {
1457       if(rs1[i]&&rs2[i]) {
1458         alloc_reg(current,i,rs1[i]);
1459         alloc_reg(current,i,rs2[i]);
1460       }
1461       else
1462       {
1463         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1464         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1465       }
1466       alloc_reg(current,i,rt1[i]);
           // If either source is not flagged 32-bit, the result is not either
1467       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1468       {
             // Allocate the 64-bit target unless its bit is set in current->uu
1469         if(!((current->uu>>rt1[i])&1)) {
1470           alloc_reg64(current,i,rt1[i]);
1471         }
             // Sources are only widened if the target has its rt1|64 register mapped
1472         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1473           if(rs1[i]&&rs2[i]) {
1474             alloc_reg64(current,i,rs1[i]);
1475             alloc_reg64(current,i,rs2[i]);
1476           }
1477           else
1478           {
1479             // Is it really worth it to keep 64-bit values in registers?
1480             #ifdef NATIVE_64BIT
1481             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1482             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1483             #endif
1484           }
1485         }
1486         current->is32&=~(1LL<<rt1[i]);
1487       } else {
1488         current->is32|=1LL<<rt1[i];
1489       }
1490     }
1491   }
1492   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1493     if(rt1[i]) {
1494       if(rs1[i]&&rs2[i]) {
             // Two real sources: go 64-bit unless the target's uu bit is set
             // and its rt1|64 register is not mapped
1495         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1496           alloc_reg64(current,i,rs1[i]);
1497           alloc_reg64(current,i,rs2[i]);
1498           alloc_reg64(current,i,rt1[i]);
1499         } else {
1500           alloc_reg(current,i,rs1[i]);
1501           alloc_reg(current,i,rs2[i]);
1502           alloc_reg(current,i,rt1[i]);
1503         }
1504       }
1505       else {
1506         alloc_reg(current,i,rt1[i]);
1507         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1508           // DADD used as move, or zeroing
1509           // If we have a 64-bit source, then make the target 64 bits too
1510           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1511             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1512             alloc_reg64(current,i,rt1[i]);
1513           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1514             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1515             alloc_reg64(current,i,rt1[i]);
1516           }
1517           if(opcode2[i]>=0x2e&&rs2[i]) {
1518             // DSUB used as negation - 64-bit result
1519             // If we have a 32-bit register, extend it to 64 bits
1520             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1521             alloc_reg64(current,i,rt1[i]);
1522           }
1523         }
1524       }
           // Update the target's is32 flag: cleared with two sources, copied
           // from the single non-zero source, set when both sources are zero
1525       if(rs1[i]&&rs2[i]) {
1526         current->is32&=~(1LL<<rt1[i]);
1527       } else if(rs1[i]) {
1528         current->is32&=~(1LL<<rt1[i]);
1529         if((current->is32>>rs1[i])&1)
1530           current->is32|=1LL<<rt1[i];
1531       } else if(rs2[i]) {
1532         current->is32&=~(1LL<<rt1[i]);
1533         if((current->is32>>rs2[i])&1)
1534           current->is32|=1LL<<rt1[i];
1535       } else {
1536         current->is32|=1LL<<rt1[i];
1537       }
1538     }
1539   }
1540   clear_const(current,rs1[i]);
1541   clear_const(current,rs2[i]);
1542   clear_const(current,rt1[i]);
1543   dirty_reg(current,rt1[i]);
1544 }
1545
1546 void imm16_alloc(struct regstat *current,int i)
1547 {
1548   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1549   else lt1[i]=rs1[i];
1550   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1551   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1552     current->is32&=~(1LL<<rt1[i]);
1553     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1554       // TODO: Could preserve the 32-bit flag if the immediate is zero
1555       alloc_reg64(current,i,rt1[i]);
1556       alloc_reg64(current,i,rs1[i]);
1557     }
1558     clear_const(current,rs1[i]);
1559     clear_const(current,rt1[i]);
1560   }
1561   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1562     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1563     current->is32|=1LL<<rt1[i];
1564     clear_const(current,rs1[i]);
1565     clear_const(current,rt1[i]);
1566   }
1567   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1568     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1569       if(rs1[i]!=rt1[i]) {
1570         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1571         alloc_reg64(current,i,rt1[i]);
1572         current->is32&=~(1LL<<rt1[i]);
1573       }
1574     }
1575     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1576     if(is_const(current,rs1[i])) {
1577       int v=get_const(current,rs1[i]);
1578       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1579       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1580       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1581     }
1582     else clear_const(current,rt1[i]);
1583   }
1584   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1585     if(is_const(current,rs1[i])) {
1586       int v=get_const(current,rs1[i]);
1587       set_const(current,rt1[i],v+imm[i]);
1588     }
1589     else clear_const(current,rt1[i]);
1590     current->is32|=1LL<<rt1[i];
1591   }
1592   else {
1593     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1594     current->is32|=1LL<<rt1[i];
1595   }
1596   dirty_reg(current,rt1[i]);
1597 }
1598
1599 void load_alloc(struct regstat *current,int i)
1600 {
1601   clear_const(current,rt1[i]);
1602   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1603   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1604   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1605   if(rt1[i]) {
1606     alloc_reg(current,i,rt1[i]);
1607     if(get_reg(current->regmap,rt1[i])<0) {
1608       // dummy load, but we still need a register to calculate the address
1609       alloc_reg_temp(current,i,-1);
1610       minimum_free_regs[i]=1;
1611     }
1612     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1613     {
1614       current->is32&=~(1LL<<rt1[i]);
1615       alloc_reg64(current,i,rt1[i]);
1616     }
1617     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1618     {
1619       current->is32&=~(1LL<<rt1[i]);
1620       alloc_reg64(current,i,rt1[i]);
1621       alloc_all(current,i);
1622       alloc_reg64(current,i,FTEMP);
1623       minimum_free_regs[i]=HOST_REGS;
1624     }
1625     else current->is32|=1LL<<rt1[i];
1626     dirty_reg(current,rt1[i]);
1627     // If using TLB, need a register for pointer to the mapping table
1628     if(using_tlb) alloc_reg(current,i,TLREG);
1629     // LWL/LWR need a temporary register for the old value
1630     if(opcode[i]==0x22||opcode[i]==0x26)
1631     {
1632       alloc_reg(current,i,FTEMP);
1633       alloc_reg_temp(current,i,-1);
1634       minimum_free_regs[i]=1;
1635     }
1636   }
1637   else
1638   {
1639     // Load to r0 (dummy load)
1640     // but we still need a register to calculate the address
1641     if(opcode[i]==0x22||opcode[i]==0x26)
1642     {
1643       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1644     }
1645     alloc_reg_temp(current,i,-1);
1646     minimum_free_regs[i]=1;
1647     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1648     {
1649       alloc_all(current,i);
1650       alloc_reg64(current,i,FTEMP);
1651       minimum_free_regs[i]=HOST_REGS;
1652     }
1653   }
1654 }
1655
1656 void store_alloc(struct regstat *current,int i)
1657 {
1658   clear_const(current,rs2[i]);
1659   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1660   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1661   alloc_reg(current,i,rs2[i]);
1662   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1663     alloc_reg64(current,i,rs2[i]);
1664     if(rs2[i]) alloc_reg(current,i,FTEMP);
1665   }
1666   // If using TLB, need a register for pointer to the mapping table
1667   if(using_tlb) alloc_reg(current,i,TLREG);
1668   #if defined(HOST_IMM8)
1669   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1670   else alloc_reg(current,i,INVCP);
1671   #endif
1672   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWL/SDL/SDR
1673     alloc_reg(current,i,FTEMP);
1674   }
1675   // We need a temporary register for address generation
1676   alloc_reg_temp(current,i,-1);
1677   minimum_free_regs[i]=1;
1678 }
1679
1680 void c1ls_alloc(struct regstat *current,int i)
1681 {
1682   //clear_const(current,rs1[i]); // FIXME
1683   clear_const(current,rt1[i]);
1684   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1685   alloc_reg(current,i,CSREG); // Status
1686   alloc_reg(current,i,FTEMP);
1687   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1688     alloc_reg64(current,i,FTEMP);
1689   }
1690   // If using TLB, need a register for pointer to the mapping table
1691   if(using_tlb) alloc_reg(current,i,TLREG);
1692   #if defined(HOST_IMM8)
1693   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1694   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1695     alloc_reg(current,i,INVCP);
1696   #endif
1697   // We need a temporary register for address generation
1698   alloc_reg_temp(current,i,-1);
1699 }
1700
1701 void c2ls_alloc(struct regstat *current,int i)
1702 {
1703   clear_const(current,rt1[i]);
1704   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1705   alloc_reg(current,i,FTEMP);
1706   // If using TLB, need a register for pointer to the mapping table
1707   if(using_tlb) alloc_reg(current,i,TLREG);
1708   #if defined(HOST_IMM8)
1709   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1710   else if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1711     alloc_reg(current,i,INVCP);
1712   #endif
1713   // We need a temporary register for address generation
1714   alloc_reg_temp(current,i,-1);
1715   minimum_free_regs[i]=1;
1716 }
1717
1718 #ifndef multdiv_alloc
1719 void multdiv_alloc(struct regstat *current,int i)
1720 {
1721   //  case 0x18: MULT
1722   //  case 0x19: MULTU
1723   //  case 0x1A: DIV
1724   //  case 0x1B: DIVU
1725   //  case 0x1C: DMULT
1726   //  case 0x1D: DMULTU
1727   //  case 0x1E: DDIV
1728   //  case 0x1F: DDIVU
1729   clear_const(current,rs1[i]);
1730   clear_const(current,rs2[i]);
1731   if(rs1[i]&&rs2[i])
1732   {
1733     if((opcode2[i]&4)==0) // 32-bit
1734     {
1735       current->u&=~(1LL<<HIREG);
1736       current->u&=~(1LL<<LOREG);
1737       alloc_reg(current,i,HIREG);
1738       alloc_reg(current,i,LOREG);
1739       alloc_reg(current,i,rs1[i]);
1740       alloc_reg(current,i,rs2[i]);
1741       current->is32|=1LL<<HIREG;
1742       current->is32|=1LL<<LOREG;
1743       dirty_reg(current,HIREG);
1744       dirty_reg(current,LOREG);
1745     }
1746     else // 64-bit
1747     {
1748       current->u&=~(1LL<<HIREG);
1749       current->u&=~(1LL<<LOREG);
1750       current->uu&=~(1LL<<HIREG);
1751       current->uu&=~(1LL<<LOREG);
1752       alloc_reg64(current,i,HIREG);
1753       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1754       alloc_reg64(current,i,rs1[i]);
1755       alloc_reg64(current,i,rs2[i]);
1756       alloc_all(current,i);
1757       current->is32&=~(1LL<<HIREG);
1758       current->is32&=~(1LL<<LOREG);
1759       dirty_reg(current,HIREG);
1760       dirty_reg(current,LOREG);
1761       minimum_free_regs[i]=HOST_REGS;
1762     }
1763   }
1764   else
1765   {
1766     // Multiply by zero is zero.
1767     // MIPS does not have a divide by zero exception.
1768     // The result is undefined, we return zero.
1769     alloc_reg(current,i,HIREG);
1770     alloc_reg(current,i,LOREG);
1771     current->is32|=1LL<<HIREG;
1772     current->is32|=1LL<<LOREG;
1773     dirty_reg(current,HIREG);
1774     dirty_reg(current,LOREG);
1775   }
1776 }
1777 #endif
1778
1779 void cop0_alloc(struct regstat *current,int i)
1780 {
1781   if(opcode2[i]==0) // MFC0
1782   {
1783     if(rt1[i]) {
1784       clear_const(current,rt1[i]);
1785       alloc_all(current,i);
1786       alloc_reg(current,i,rt1[i]);
1787       current->is32|=1LL<<rt1[i];
1788       dirty_reg(current,rt1[i]);
1789     }
1790   }
1791   else if(opcode2[i]==4) // MTC0
1792   {
1793     if(rs1[i]){
1794       clear_const(current,rs1[i]);
1795       alloc_reg(current,i,rs1[i]);
1796       alloc_all(current,i);
1797     }
1798     else {
1799       alloc_all(current,i); // FIXME: Keep r0
1800       current->u&=~1LL;
1801       alloc_reg(current,i,0);
1802     }
1803   }
1804   else
1805   {
1806     // TLBR/TLBWI/TLBWR/TLBP/ERET
1807     assert(opcode2[i]==0x10);
1808     alloc_all(current,i);
1809   }
1810   minimum_free_regs[i]=HOST_REGS;
1811 }
1812
1813 void cop1_alloc(struct regstat *current,int i)
1814 {
1815   alloc_reg(current,i,CSREG); // Load status
1816   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1817   {
1818     if(rt1[i]){
1819       clear_const(current,rt1[i]);
1820       if(opcode2[i]==1) {
1821         alloc_reg64(current,i,rt1[i]); // DMFC1
1822         current->is32&=~(1LL<<rt1[i]);
1823       }else{
1824         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1825         current->is32|=1LL<<rt1[i];
1826       }
1827       dirty_reg(current,rt1[i]);
1828     }
1829     alloc_reg_temp(current,i,-1);
1830   }
1831   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1832   {
1833     if(rs1[i]){
1834       clear_const(current,rs1[i]);
1835       if(opcode2[i]==5)
1836         alloc_reg64(current,i,rs1[i]); // DMTC1
1837       else
1838         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1839       alloc_reg_temp(current,i,-1);
1840     }
1841     else {
1842       current->u&=~1LL;
1843       alloc_reg(current,i,0);
1844       alloc_reg_temp(current,i,-1);
1845     }
1846   }
1847   minimum_free_regs[i]=1;
1848 }
1849 void fconv_alloc(struct regstat *current,int i)
1850 {
1851   alloc_reg(current,i,CSREG); // Load status
1852   alloc_reg_temp(current,i,-1);
1853   minimum_free_regs[i]=1;
1854 }
1855 void float_alloc(struct regstat *current,int i)
1856 {
1857   alloc_reg(current,i,CSREG); // Load status
1858   alloc_reg_temp(current,i,-1);
1859   minimum_free_regs[i]=1;
1860 }
// Register allocation for C2OP instructions: a single temporary.
void c2op_alloc(struct regstat *current, int i)
{
  alloc_reg_temp(current, i, -1);
}
1865 void fcomp_alloc(struct regstat *current,int i)
1866 {
1867   alloc_reg(current,i,CSREG); // Load status
1868   alloc_reg(current,i,FSREG); // Load flags
1869   dirty_reg(current,FSREG); // Flag will be modified
1870   alloc_reg_temp(current,i,-1);
1871   minimum_free_regs[i]=1;
1872 }
1873
1874 void syscall_alloc(struct regstat *current,int i)
1875 {
1876   alloc_cc(current,i);
1877   dirty_reg(current,CCREG);
1878   alloc_all(current,i);
1879   minimum_free_regs[i]=HOST_REGS;
1880   current->isconst=0;
1881 }
1882
1883 void delayslot_alloc(struct regstat *current,int i)
1884 {
1885   switch(itype[i]) {
1886     case UJUMP:
1887     case CJUMP:
1888     case SJUMP:
1889     case RJUMP:
1890     case FJUMP:
1891     case SYSCALL:
1892     case HLECALL:
1893     case SPAN:
1894       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1895       printf("Disabled speculative precompilation\n");
1896       stop_after_jal=1;
1897       break;
1898     case IMM16:
1899       imm16_alloc(current,i);
1900       break;
1901     case LOAD:
1902     case LOADLR:
1903       load_alloc(current,i);
1904       break;
1905     case STORE:
1906     case STORELR:
1907       store_alloc(current,i);
1908       break;
1909     case ALU:
1910       alu_alloc(current,i);
1911       break;
1912     case SHIFT:
1913       shift_alloc(current,i);
1914       break;
1915     case MULTDIV:
1916       multdiv_alloc(current,i);
1917       break;
1918     case SHIFTIMM:
1919       shiftimm_alloc(current,i);
1920       break;
1921     case MOV:
1922       mov_alloc(current,i);
1923       break;
1924     case COP0:
1925       cop0_alloc(current,i);
1926       break;
1927     case COP1:
1928     case COP2:
1929       cop1_alloc(current,i);
1930       break;
1931     case C1LS:
1932       c1ls_alloc(current,i);
1933       break;
1934     case C2LS:
1935       c2ls_alloc(current,i);
1936       break;
1937     case FCONV:
1938       fconv_alloc(current,i);
1939       break;
1940     case FLOAT:
1941       float_alloc(current,i);
1942       break;
1943     case FCOMP:
1944       fcomp_alloc(current,i);
1945       break;
1946     case C2OP:
1947       c2op_alloc(current,i);
1948       break;
1949   }
1950 }
1951
1952 // Special case where a branch and delay slot span two pages in virtual memory
1953 static void pagespan_alloc(struct regstat *current,int i)
1954 {
1955   current->isconst=0;
1956   current->wasconst=0;
1957   regs[i].wasconst=0;
1958   minimum_free_regs[i]=HOST_REGS;
1959   alloc_all(current,i);
1960   alloc_cc(current,i);
1961   dirty_reg(current,CCREG);
1962   if(opcode[i]==3) // JAL
1963   {
1964     alloc_reg(current,i,31);
1965     dirty_reg(current,31);
1966   }
1967   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1968   {
1969     alloc_reg(current,i,rs1[i]);
1970     if (rt1[i]!=0) {
1971       alloc_reg(current,i,rt1[i]);
1972       dirty_reg(current,rt1[i]);
1973     }
1974   }
1975   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1976   {
1977     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1978     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1979     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1980     {
1981       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1982       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1983     }
1984   }
1985   else
1986   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1987   {
1988     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1989     if(!((current->is32>>rs1[i])&1))
1990     {
1991       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1992     }
1993   }
1994   else
1995   if(opcode[i]==0x11) // BC1
1996   {
1997     alloc_reg(current,i,FSREG);
1998     alloc_reg(current,i,CSREG);
1999   }
2000   //else ...
2001 }
2002
2003 add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
2004 {
2005   stubs[stubcount][0]=type;
2006   stubs[stubcount][1]=addr;
2007   stubs[stubcount][2]=retaddr;
2008   stubs[stubcount][3]=a;
2009   stubs[stubcount][4]=b;
2010   stubs[stubcount][5]=c;
2011   stubs[stubcount][6]=d;
2012   stubs[stubcount][7]=e;
2013   stubcount++;
2014 }
2015
2016 // Write out a single register
2017 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
2018 {
2019   int hr;
2020   for(hr=0;hr<HOST_REGS;hr++) {
2021     if(hr!=EXCLUDE_REG) {
2022       if((regmap[hr]&63)==r) {
2023         if((dirty>>hr)&1) {
2024           if(regmap[hr]<64) {
2025             emit_storereg(r,hr);
2026 #ifndef FORCE32
2027             if((is32>>regmap[hr])&1) {
2028               emit_sarimm(hr,31,hr);
2029               emit_storereg(r|64,hr);
2030             }
2031 #endif
2032           }else{
2033             emit_storereg(r|64,hr);
2034           }
2035         }
2036       }
2037     }
2038   }
2039 }
2040
2041 int mchecksum()
2042 {
2043   //if(!tracedebug) return 0;
2044   int i;
2045   int sum=0;
2046   for(i=0;i<2097152;i++) {
2047     unsigned int temp=sum;
2048     sum<<=1;
2049     sum|=(~temp)>>31;
2050     sum^=((u_int *)rdram)[i];
2051   }
2052   return sum;
2053 }
2054 int rchecksum()
2055 {
2056   int i;
2057   int sum=0;
2058   for(i=0;i<64;i++)
2059     sum^=((u_int *)reg)[i];
2060   return sum;
2061 }
2062 void rlist()
2063 {
2064   int i;
2065   printf("TRACE: ");
2066   for(i=0;i<32;i++)
2067     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
2068   printf("\n");
2069 #ifndef DISABLE_COP1
2070   printf("TRACE: ");
2071   for(i=0;i<32;i++)
2072     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
2073   printf("\n");
2074 #endif
2075 }
2076
2077 void enabletrace()
2078 {
2079   tracedebug=1;
2080 }
2081
// Debug trace hook.
// While Count, read as a signed int, lies in [-2084597794, 0) this prints
// Count, next_interupt and mchecksum(), dumps the registers with rlist(),
// and then prints raw words located next to a local variable on the stack
// (architecture-specific, see below).  Outside that window nothing is printed.
// The commented-out lines are alternative trace conditions/outputs kept for
// debugging sessions.
// NOTE(review): (&i)[-1] and (&j)[n] index past a single object, so what they
// show depends entirely on the compiler's stack layout -- presumably the
// caller's return address / saved registers; confirm before relying on it.
2082 void memdebug(int i)
2083 {
2084   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
2085   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
2086   //rlist();
2087   //if(tracedebug) {
2088   //if(Count>=-2084597794) {
       // Active condition: only trace inside a fixed window of Count values
2089   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
2090   //if(0) {
2091     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
2092     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2093     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2094     rlist();
         // x86: print the word just below the argument on the stack
2095     #ifdef __i386__
2096     printf("TRACE: %x\n",(&i)[-1]);
2097     #endif
         // ARM: print 20 words following the local j (j itself is never assigned)
2098     #ifdef __arm__
2099     int j;
2100     printf("TRACE: %x \n",(&j)[10]);
2101     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2102     #endif
2103     //fflush(stdout);
2104   }
2105   //printf("TRACE: %x\n",(&i)[-1]);
2106 }
2107
2108 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2109 {
2110   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2111 }
2112
// Emit host code for the register-register ALU instruction at index i.
// The operation is selected by opcode2[i]:
//   0x20-0x23  ADD/ADDU/SUB/SUBU       (32-bit)
//   0x2c-0x2f  DADD/DADDU/DSUB/DSUBU   (64-bit, as low/high halves)
//   0x2a,0x2b  SLT/SLTU
//   0x24-0x27  AND/OR/XOR/NOR
// Parameters:
//   i       index of the instruction in the per-block arrays
//           (opcode2[], rs1[], rs2[], rt1[])
//   i_regs  register state for this instruction; i_regs->regmap is searched
//           with get_reg() for the host register of each guest register,
//           the upper half of guest register r being looked up as r|64.
// A negative get_reg() result is treated as "no host register": sources are
// then either asserted present or loaded with emit_loadreg().
// Nothing is emitted when the destination is r0 (rt1[i]==0) or when the
// destination's (lower-half) host register is missing.
// Source register number 0 is special-cased throughout as the constant zero.
2113 void alu_assemble(int i,struct regstat *i_regs)
2114 {
2115   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2116     if(rt1[i]) {
2117       signed char s1,s2,t;
2118       t=get_reg(i_regs->regmap,rt1[i]);
2119       if(t>=0) {
2120         s1=get_reg(i_regs->regmap,rs1[i]);
2121         s2=get_reg(i_regs->regmap,rs2[i]);
             // Bit value 2 of opcode2 distinguishes SUB/SUBU from ADD/ADDU
2122         if(rs1[i]&&rs2[i]) {
2123           assert(s1>=0);
2124           assert(s2>=0);
2125           if(opcode2[i]&2) emit_sub(s1,s2,t);
2126           else emit_add(s1,s2,t);
2127         }
             // rs2 is r0: x+0 and x-0 are both a plain copy of rs1
2128         else if(rs1[i]) {
2129           if(s1>=0) emit_mov(s1,t);
2130           else emit_loadreg(rs1[i],t);
2131         }
             // rs1 is r0: 0+x is a copy of rs2, 0-x is its negation
2132         else if(rs2[i]) {
2133           if(s2>=0) {
2134             if(opcode2[i]&2) emit_neg(s2,t);
2135             else emit_mov(s2,t);
2136           }
2137           else {
2138             emit_loadreg(rs2[i],t);
2139             if(opcode2[i]&2) emit_neg(t,t);
2140           }
2141         }
             // Both sources are r0
2142         else emit_zeroreg(t);
2143       }
2144     }
2145   }
2146   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2147     if(rt1[i]) {
2148       signed char s1l,s2l,s1h,s2h,tl,th;
2149       tl=get_reg(i_regs->regmap,rt1[i]);
2150       th=get_reg(i_regs->regmap,rt1[i]|64);
2151       if(tl>=0) {
2152         s1l=get_reg(i_regs->regmap,rs1[i]);
2153         s2l=get_reg(i_regs->regmap,rs2[i]);
2154         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2155         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2156         if(rs1[i]&&rs2[i]) {
2157           assert(s1l>=0);
2158           assert(s2l>=0);
               // Low word first; the high word is only produced when the
               // destination has an upper-half host register (th>=0)
2159           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2160           else emit_adds(s1l,s2l,tl);
2161           if(th>=0) {
                 // The preprocessor picks which 'if' precedes the shared 'else'
2162             #ifdef INVERTED_CARRY
2163             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2164             #else
2165             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2166             #endif
                 // NOTE(review): the add path uses emit_add for the high word
                 // after emit_adds on the low word -- verify whether a
                 // carry-propagating add was intended here.
2167             else emit_add(s1h,s2h,th);
2168           }
2169         }
             // rs2 is r0: copy rs1 (both halves)
2170         else if(rs1[i]) {
2171           if(s1l>=0) emit_mov(s1l,tl);
2172           else emit_loadreg(rs1[i],tl);
2173           if(th>=0) {
2174             if(s1h>=0) emit_mov(s1h,th);
2175             else emit_loadreg(rs1[i]|64,th);
2176           }
2177         }
             // rs1 is r0: copy rs2, or negate it for DSUB/DSUBU
2178         else if(rs2[i]) {
2179           if(s2l>=0) {
2180             if(opcode2[i]&2) emit_negs(s2l,tl);
2181             else emit_mov(s2l,tl);
2182           }
2183           else {
2184             emit_loadreg(rs2[i],tl);
2185             if(opcode2[i]&2) emit_negs(tl,tl);
2186           }
2187           if(th>=0) {
2188             #ifdef INVERTED_CARRY
2189             if(s2h>=0) emit_mov(s2h,th);
2190             else emit_loadreg(rs2[i]|64,th);
2191             if(opcode2[i]&2) {
2192               emit_adcimm(-1,th); // x86 has inverted carry flag
2193               emit_not(th,th);
2194             }
2195             #else
2196             if(opcode2[i]&2) {
2197               if(s2h>=0) emit_rscimm(s2h,0,th);
2198               else {
2199                 emit_loadreg(rs2[i]|64,th);
2200                 emit_rscimm(th,0,th);
2201               }
2202             }else{
2203               if(s2h>=0) emit_mov(s2h,th);
2204               else emit_loadreg(rs2[i]|64,th);
2205             }
2206             #endif
2207           }
2208         }
             // Both sources are r0
2209         else {
2210           emit_zeroreg(tl);
2211           if(th>=0) emit_zeroreg(th);
2212         }
2213       }
2214     }
2215   }
2216   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2217     if(rt1[i]) {
2218       signed char s1l,s1h,s2l,s2h,t;
           // 64-bit comparison unless both sources are flagged in was32
2219       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2220       {
2221         t=get_reg(i_regs->regmap,rt1[i]);
2222         //assert(t>=0);
2223         if(t>=0) {
2224           s1l=get_reg(i_regs->regmap,rs1[i]);
2225           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2226           s2l=get_reg(i_regs->regmap,rs2[i]);
2227           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2228           if(rs2[i]==0) // rx<r0
2229           {
2230             assert(s1h>=0);
                 // Signed "x<0" is the top bit of the upper word
2231             if(opcode2[i]==0x2a) // SLT
2232               emit_shrimm(s1h,31,t);
2233             else // SLTU (unsigned can not be less than zero)
2234               emit_zeroreg(t);
2235           }
2236           else if(rs1[i]==0) // r0<rx
2237           {
2238             assert(s2h>=0);
2239             if(opcode2[i]==0x2a) // SLT
2240               emit_set_gz64_32(s2h,s2l,t);
2241             else // SLTU (set if not zero)
2242               emit_set_nz64_32(s2h,s2l,t);
2243           }
2244           else {
2245             assert(s1l>=0);assert(s1h>=0);
2246             assert(s2l>=0);assert(s2h>=0);
2247             if(opcode2[i]==0x2a) // SLT
2248               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2249             else // SLTU
2250               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2251           }
2252         }
2253       } else {
             // Both sources flagged 32-bit: compare lower halves only
2254         t=get_reg(i_regs->regmap,rt1[i]);
2255         //assert(t>=0);
2256         if(t>=0) {
2257           s1l=get_reg(i_regs->regmap,rs1[i]);
2258           s2l=get_reg(i_regs->regmap,rs2[i]);
2259           if(rs2[i]==0) // rx<r0
2260           {
2261             assert(s1l>=0);
2262             if(opcode2[i]==0x2a) // SLT
2263               emit_shrimm(s1l,31,t);
2264             else // SLTU (unsigned can not be less than zero)
2265               emit_zeroreg(t);
2266           }
2267           else if(rs1[i]==0) // r0<rx
2268           {
2269             assert(s2l>=0);
2270             if(opcode2[i]==0x2a) // SLT
2271               emit_set_gz32(s2l,t);
2272             else // SLTU (set if not zero)
2273               emit_set_nz32(s2l,t);
2274           }
2275           else{
2276             assert(s1l>=0);assert(s2l>=0);
2277             if(opcode2[i]==0x2a) // SLT
2278               emit_set_if_less32(s1l,s2l,t);
2279             else // SLTU
2280               emit_set_if_carry32(s1l,s2l,t);
2281           }
2282         }
2283       }
2284     }
2285   }
2286   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2287     if(rt1[i]) {
2288       signed char s1l,s1h,s2l,s2h,th,tl;
2289       tl=get_reg(i_regs->regmap,rt1[i]);
2290       th=get_reg(i_regs->regmap,rt1[i]|64);
           // 64-bit path: a source is not flagged in was32 AND the destination
           // has an upper-half host register; otherwise only tl is computed
2291       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2292       {
2293         assert(tl>=0);
2294         if(tl>=0) {
2295           s1l=get_reg(i_regs->regmap,rs1[i]);
2296           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2297           s2l=get_reg(i_regs->regmap,rs2[i]);
2298           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2299           if(rs1[i]&&rs2[i]) {
2300             assert(s1l>=0);assert(s1h>=0);
2301             assert(s2l>=0);assert(s2h>=0);
2302             if(opcode2[i]==0x24) { // AND
2303               emit_and(s1l,s2l,tl);
2304               emit_and(s1h,s2h,th);
2305             } else
2306             if(opcode2[i]==0x25) { // OR
2307               emit_or(s1l,s2l,tl);
2308               emit_or(s1h,s2h,th);
2309             } else
2310             if(opcode2[i]==0x26) { // XOR
2311               emit_xor(s1l,s2l,tl);
2312               emit_xor(s1h,s2h,th);
2313             } else
2314             if(opcode2[i]==0x27) { // NOR
2315               emit_or(s1l,s2l,tl);
2316               emit_or(s1h,s2h,th);
2317               emit_not(tl,tl);
2318               emit_not(th,th);
2319             }
2320           }
2321           else
2322           {
                 // At least one source is r0:
                 //   AND -> 0, OR/XOR -> copy of the other source,
                 //   NOR -> bitwise NOT of the other source (-1 if both r0)
2323             if(opcode2[i]==0x24) { // AND
2324               emit_zeroreg(tl);
2325               emit_zeroreg(th);
2326             } else
2327             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2328               if(rs1[i]){
2329                 if(s1l>=0) emit_mov(s1l,tl);
2330                 else emit_loadreg(rs1[i],tl);
2331                 if(s1h>=0) emit_mov(s1h,th);
2332                 else emit_loadreg(rs1[i]|64,th);
2333               }
2334               else
2335               if(rs2[i]){
2336                 if(s2l>=0) emit_mov(s2l,tl);
2337                 else emit_loadreg(rs2[i],tl);
2338                 if(s2h>=0) emit_mov(s2h,th);
2339                 else emit_loadreg(rs2[i]|64,th);
2340               }
2341               else{
2342                 emit_zeroreg(tl);
2343                 emit_zeroreg(th);
2344               }
2345             } else
2346             if(opcode2[i]==0x27) { // NOR
2347               if(rs1[i]){
2348                 if(s1l>=0) emit_not(s1l,tl);
2349                 else{
2350                   emit_loadreg(rs1[i],tl);
2351                   emit_not(tl,tl);
2352                 }
2353                 if(s1h>=0) emit_not(s1h,th);
2354                 else{
2355                   emit_loadreg(rs1[i]|64,th);
2356                   emit_not(th,th);
2357                 }
2358               }
2359               else
2360               if(rs2[i]){
2361                 if(s2l>=0) emit_not(s2l,tl);
2362                 else{
2363                   emit_loadreg(rs2[i],tl);
2364                   emit_not(tl,tl);
2365                 }
2366                 if(s2h>=0) emit_not(s2h,th);
2367                 else{
2368                   emit_loadreg(rs2[i]|64,th);
2369                   emit_not(th,th);
2370                 }
2371               }
2372               else {
2373                 emit_movimm(-1,tl);
2374                 emit_movimm(-1,th);
2375               }
2376             }
2377           }
2378         }
2379       }
2380       else
2381       {
2382         // 32 bit
2383         if(tl>=0) {
2384           s1l=get_reg(i_regs->regmap,rs1[i]);
2385           s2l=get_reg(i_regs->regmap,rs2[i]);
2386           if(rs1[i]&&rs2[i]) {
2387             assert(s1l>=0);
2388             assert(s2l>=0);
2389             if(opcode2[i]==0x24) { // AND
2390               emit_and(s1l,s2l,tl);
2391             } else
2392             if(opcode2[i]==0x25) { // OR
2393               emit_or(s1l,s2l,tl);
2394             } else
2395             if(opcode2[i]==0x26) { // XOR
2396               emit_xor(s1l,s2l,tl);
2397             } else
2398             if(opcode2[i]==0x27) { // NOR
2399               emit_or(s1l,s2l,tl);
2400               emit_not(tl,tl);
2401             }
2402           }
2403           else
2404           {
                 // At least one source is r0 (same simplifications as the
                 // 64-bit path, lower half only)
2405             if(opcode2[i]==0x24) { // AND
2406               emit_zeroreg(tl);
2407             } else
2408             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2409               if(rs1[i]){
2410                 if(s1l>=0) emit_mov(s1l,tl);
2411                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2412               }
2413               else
2414               if(rs2[i]){
2415                 if(s2l>=0) emit_mov(s2l,tl);
2416                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2417               }
2418               else emit_zeroreg(tl);
2419             } else
2420             if(opcode2[i]==0x27) { // NOR
2421               if(rs1[i]){
2422                 if(s1l>=0) emit_not(s1l,tl);
2423                 else {
2424                   emit_loadreg(rs1[i],tl);
2425                   emit_not(tl,tl);
2426                 }
2427               }
2428               else
2429               if(rs2[i]){
2430                 if(s2l>=0) emit_not(s2l,tl);
2431                 else {
2432                   emit_loadreg(rs2[i],tl);
2433                   emit_not(tl,tl);
2434                 }
2435               }
2436               else emit_movimm(-1,tl);
2437             }
2438           }
2439         }
2440       }
2441     }
2442   }
2443 }
2444
// Emit host code for the 16-bit-immediate instruction at index i.
// The operation is selected by opcode[i]:
//   0x0f       LUI
//   0x08,0x09  ADDI/ADDIU
//   0x18,0x19  DADDI/DADDIU
//   0x0a,0x0b  SLTI/SLTIU
//   0x0c-0x0e  ANDI/ORI/XORI
// Parameters:
//   i       index of the instruction in the per-block arrays
//           (opcode[], rs1[], rt1[], imm[], constmap[])
//   i_regs  register state for this instruction (regmap, regmap_entry,
//           isconst, wasconst, was32 are read)
// Common patterns in the body:
//   - Nothing is emitted when the destination is r0 or has no host register.
//   - For LUI, ADDI/ADDIU and ANDI/ORI/XORI, emission is skipped when the
//     destination host register's bit is set in i_regs->isconst
//     (presumably the constant is materialised elsewhere -- confirm).
//   - When the source has no host register (get_reg() < 0) it is loaded into
//     the destination first, unless regmap_entry[] says the destination host
//     register already held that guest register on entry.
//   - When the source host register's bit is set in i_regs->wasconst, the
//     result is computed here from constmap[i][...] and emitted as a single
//     immediate move.
2445 void imm16_assemble(int i,struct regstat *i_regs)
2446 {
2447   if (opcode[i]==0x0f) { // LUI
2448     if(rt1[i]) {
2449       signed char t;
2450       t=get_reg(i_regs->regmap,rt1[i]);
2451       //assert(t>=0);
2452       if(t>=0) {
2453         if(!((i_regs->isconst>>t)&1))
2454           emit_movimm(imm[i]<<16,t);
2455       }
2456     }
2457   }
2458   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2459     if(rt1[i]) {
2460       signed char s,t;
2461       t=get_reg(i_regs->regmap,rt1[i]);
2462       s=get_reg(i_regs->regmap,rs1[i]);
2463       if(rs1[i]) {
2464         //assert(t>=0);
2465         //assert(s>=0);
2466         if(t>=0) {
2467           if(!((i_regs->isconst>>t)&1)) {
2468             if(s<0) {
                   // Source not in a host register: bring it into t, then add in place
2469               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2470               emit_addimm(t,imm[i],t);
2471             }else{
2472               if(!((i_regs->wasconst>>s)&1))
2473                 emit_addimm(s,imm[i],t);
2474               else
2475                 emit_movimm(constmap[i][s]+imm[i],t);
2476             }
2477           }
2478         }
2479       } else {
             // Source is r0: the result is just the immediate
2480         if(t>=0) {
2481           if(!((i_regs->isconst>>t)&1))
2482             emit_movimm(imm[i],t);
2483         }
2484       }
2485     }
2486   }
       // Note: this 'if' starts a new if/else-if chain covering the remaining opcodes
2487   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2488     if(rt1[i]) {
2489       signed char sh,sl,th,tl;
2490       th=get_reg(i_regs->regmap,rt1[i]|64);
2491       tl=get_reg(i_regs->regmap,rt1[i]);
2492       sh=get_reg(i_regs->regmap,rs1[i]|64);
2493       sl=get_reg(i_regs->regmap,rs1[i]);
2494       if(tl>=0) {
2495         if(rs1[i]) {
2496           assert(sh>=0);
2497           assert(sl>=0);
               // Full 64-bit add only when the destination has an upper-half register
2498           if(th>=0) {
2499             emit_addimm64_32(sh,sl,imm[i],th,tl);
2500           }
2501           else {
2502             emit_addimm(sl,imm[i],tl);
2503           }
2504         } else {
               // Source is r0: immediate in the low word, its sign in the high word
2505           emit_movimm(imm[i],tl);
2506           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2507         }
2508       }
2509     }
2510   }
2511   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2512     if(rt1[i]) {
2513       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2514       signed char sh,sl,t;
2515       t=get_reg(i_regs->regmap,rt1[i]);
2516       sh=get_reg(i_regs->regmap,rs1[i]|64);
2517       sl=get_reg(i_regs->regmap,rs1[i]);
2518       //assert(t>=0);
2519       if(t>=0) {
2520         if(rs1[i]>0) {
               // A missing upper half is only acceptable if the source is flagged 32-bit
2521           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2522           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2523             if(opcode[i]==0x0a) { // SLTI
2524               if(sl<0) {
2525                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2526                 emit_slti32(t,imm[i],t);
2527               }else{
2528                 emit_slti32(sl,imm[i],t);
2529               }
2530             }
2531             else { // SLTIU
2532               if(sl<0) {
2533                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2534                 emit_sltiu32(t,imm[i],t);
2535               }else{
2536                 emit_sltiu32(sl,imm[i],t);
2537               }
2538             }
2539           }else{ // 64-bit
2540             assert(sl>=0);
2541             if(opcode[i]==0x0a) // SLTI
2542               emit_slti64_32(sh,sl,imm[i],t);
2543             else // SLTIU
2544               emit_sltiu64_32(sh,sl,imm[i],t);
2545           }
2546         }else{
2547           // SLTI(U) with r0 is just stupid,
2548           // nonetheless examples can be found
               // The result is known at translation time: 0<imm for SLTI,
               // imm!=0 for SLTIU.  The first 'else' below pairs with the
               // inner 'if(0<imm[i])', the second with 'if(opcode[i]==0x0a)'.
2549           if(opcode[i]==0x0a) // SLTI
2550             if(0<imm[i]) emit_movimm(1,t);
2551             else emit_zeroreg(t);
2552           else // SLTIU
2553           {
2554             if(imm[i]) emit_movimm(1,t);
2555             else emit_zeroreg(t);
2556           }
2557         }
2558       }
2559     }
2560   }
2561   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2562     if(rt1[i]) {
2563       signed char sh,sl,th,tl;
2564       th=get_reg(i_regs->regmap,rt1[i]|64);
2565       tl=get_reg(i_regs->regmap,rt1[i]);
2566       sh=get_reg(i_regs->regmap,rs1[i]|64);
2567       sl=get_reg(i_regs->regmap,rs1[i]);
2568       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2569         if(opcode[i]==0x0c) //ANDI
2570         {
2571           if(rs1[i]) {
2572             if(sl<0) {
2573               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2574               emit_andimm(tl,imm[i],tl);
2575             }else{
2576               if(!((i_regs->wasconst>>sl)&1))
2577                 emit_andimm(sl,imm[i],tl);
2578               else
2579                 emit_movimm(constmap[i][sl]&imm[i],tl);
2580             }
2581           }
2582           else
2583             emit_zeroreg(tl);
               // The upper half of an ANDI result is always cleared here
2584           if(th>=0) emit_zeroreg(th);
2585         }
2586         else
2587         {
2588           if(rs1[i]) {
2589             if(sl<0) {
2590               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2591             }
                 // ORI/XORI: the upper half is copied from the source unchanged
2592             if(th>=0) {
2593               if(sh<0) {
2594                 emit_loadreg(rs1[i]|64,th);
2595               }else{
2596                 emit_mov(sh,th);
2597               }
2598             }
                 // Each opcode test below guards the whole if/else that follows it
2599             if(opcode[i]==0x0d) //ORI
2600             if(sl<0) {
2601               emit_orimm(tl,imm[i],tl);
2602             }else{
2603               if(!((i_regs->wasconst>>sl)&1))
2604                 emit_orimm(sl,imm[i],tl);
2605               else
2606                 emit_movimm(constmap[i][sl]|imm[i],tl);
2607             }
2608             if(opcode[i]==0x0e) //XORI
2609             if(sl<0) {
2610               emit_xorimm(tl,imm[i],tl);
2611             }else{
2612               if(!((i_regs->wasconst>>sl)&1))
2613                 emit_xorimm(sl,imm[i],tl);
2614               else
2615                 emit_movimm(constmap[i][sl]^imm[i],tl);
2616             }
2617           }
2618           else {
                 // Source is r0: OR/XOR with zero yields the immediate
2619             emit_movimm(imm[i],tl);
2620             if(th>=0) emit_zeroreg(th);
2621           }
2622         }
2623       }
2624     }
2625   }
2626 }
2627
// Emit host code for the shift-by-immediate instruction at index i.
// The operation is selected by opcode2[i]:
//   0..3       SLL/SRL/SRA          (32-bit; 0=SLL, 2=SRL, 3=SRA)
//   0x38-0x3b  DSLL/DSRL/DSRA       (64-bit; 0x38=DSLL, 0x3a=DSRL, 0x3b=DSRA)
//   0x3c       DSLL32
//   0x3e       DSRL32
//   0x3f       DSRA32
// Parameters:
//   i       index of the instruction in the per-block arrays
//           (opcode2[], rs1[], rt1[], imm[])
//   i_regs  register state for this instruction (regmap, regmap_entry)
// Nothing is emitted when the destination is r0 or has no host register.
// The upper half of guest register r is looked up as r|64.
// NOTE(review): the *32 forms test imm[i]>32 and shift by imm[i]&31, which
// presumably means imm[i] already includes the +32 bias -- confirm against
// the decoder.
2628 void shiftimm_assemble(int i,struct regstat *i_regs)
2629 {
2630   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2631   {
2632     if(rt1[i]) {
2633       signed char s,t;
2634       t=get_reg(i_regs->regmap,rt1[i]);
2635       s=get_reg(i_regs->regmap,rs1[i]);
2636       //assert(t>=0);
2637       if(t>=0){
2638         if(rs1[i]==0)
2639         {
               // Shifting r0 always gives zero
2640           emit_zeroreg(t);
2641         }
2642         else
2643         {
               // Source not in a host register: load it into t (unless t already
               // held it on entry) and shift in place; otherwise shift from s into t
2644           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2645           if(imm[i]) {
2646             if(opcode2[i]==0) // SLL
2647             {
2648               emit_shlimm(s<0?t:s,imm[i],t);
2649             }
2650             if(opcode2[i]==2) // SRL
2651             {
2652               emit_shrimm(s<0?t:s,imm[i],t);
2653             }
2654             if(opcode2[i]==3) // SRA
2655             {
2656               emit_sarimm(s<0?t:s,imm[i],t);
2657             }
2658           }else{
2659             // Shift by zero
2660             if(s>=0 && s!=t) emit_mov(s,t);
2661           }
2662         }
2663       }
2664       //emit_storereg(rt1[i],t); //DEBUG
2665     }
2666   }
2667   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2668   {
2669     if(rt1[i]) {
2670       signed char sh,sl,th,tl;
2671       th=get_reg(i_regs->regmap,rt1[i]|64);
2672       tl=get_reg(i_regs->regmap,rt1[i]);
2673       sh=get_reg(i_regs->regmap,rs1[i]|64);
2674       sl=get_reg(i_regs->regmap,rs1[i]);
2675       if(tl>=0) {
2676         if(rs1[i]==0)
2677         {
2678           emit_zeroreg(tl);
2679           if(th>=0) emit_zeroreg(th);
2680         }
2681         else
2682         {
2683           assert(sl>=0);
2684           assert(sh>=0);
2685           if(imm[i]) {
                 // In each case the half that receives bits from the other half
                 // is emitted first, before the plain single-word shift
2686             if(opcode2[i]==0x38) // DSLL
2687             {
2688               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2689               emit_shlimm(sl,imm[i],tl);
2690             }
2691             if(opcode2[i]==0x3a) // DSRL
2692             {
2693               emit_shrdimm(sl,sh,imm[i],tl);
2694               if(th>=0) emit_shrimm(sh,imm[i],th);
2695             }
2696             if(opcode2[i]==0x3b) // DSRA
2697             {
2698               emit_shrdimm(sl,sh,imm[i],tl);
2699               if(th>=0) emit_sarimm(sh,imm[i],th);
2700             }
2701           }else{
2702             // Shift by zero
2703             if(sl!=tl) emit_mov(sl,tl);
2704             if(th>=0&&sh!=th) emit_mov(sh,th);
2705           }
2706         }
2707       }
2708     }
2709   }
2710   if(opcode2[i]==0x3c) // DSLL32
2711   {
2712     if(rt1[i]) {
2713       signed char sl,tl,th;
2714       tl=get_reg(i_regs->regmap,rt1[i]);
2715       th=get_reg(i_regs->regmap,rt1[i]|64);
2716       sl=get_reg(i_regs->regmap,rs1[i]);
2717       if(th>=0||tl>=0){
2718         assert(tl>=0);
2719         assert(th>=0);
2720         assert(sl>=0);
             // Low source word becomes the high result word; low result word is zero
2721         emit_mov(sl,th);
2722         emit_zeroreg(tl);
2723         if(imm[i]>32)
2724         {
2725           emit_shlimm(th,imm[i]&31,th);
2726         }
2727       }
2728     }
2729   }
2730   if(opcode2[i]==0x3e) // DSRL32
2731   {
2732     if(rt1[i]) {
2733       signed char sh,tl,th;
2734       tl=get_reg(i_regs->regmap,rt1[i]);
2735       th=get_reg(i_regs->regmap,rt1[i]|64);
2736       sh=get_reg(i_regs->regmap,rs1[i]|64);
2737       if(tl>=0){
2738         assert(sh>=0);
             // High source word becomes the low result word; high result word is zero
2739         emit_mov(sh,tl);
2740         if(th>=0) emit_zeroreg(th);
2741         if(imm[i]>32)
2742         {
2743           emit_shrimm(tl,imm[i]&31,tl);
2744         }
2745       }
2746     }
2747   }
2748   if(opcode2[i]==0x3f) // DSRA32
2749   {
2750     if(rt1[i]) {
2751       signed char sh,tl;
2752       tl=get_reg(i_regs->regmap,rt1[i]);
2753       sh=get_reg(i_regs->regmap,rs1[i]|64);
2754       if(tl>=0){
2755         assert(sh>=0);
             // Only the low result word is produced here; no upper-half
             // destination register is looked up for DSRA32
2756         emit_mov(sh,tl);
2757         if(imm[i]>32)
2758         {
2759           emit_sarimm(tl,imm[i]&31,tl);
2760         }
2761       }
2762     }
2763   }
2764 }
2765
#ifndef shift_assemble
// Fallback used only when no shift_assemble macro is defined.
// It emits nothing: it reports that the implementation is missing and
// terminates the process with exit status 1.  Both arguments are ignored.
void shift_assemble(int i, struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;

  printf("Need shift_assemble for this architecture.\n");
  exit(1);
}
#endif
2773
2774 void load_assemble(int i,struct regstat *i_regs)
2775 {
2776   int s,th,tl,addr,map=-1;
2777   int offset;
2778   int jaddr=0;
2779   int memtarget=0,c=0;
2780   u_int hr,reglist=0;
2781   th=get_reg(i_regs->regmap,rt1[i]|64);
2782   tl=get_reg(i_regs->regmap,rt1[i]);
2783   s=get_reg(i_regs->regmap,rs1[i]);
2784   offset=imm[i];
2785   for(hr=0;hr<HOST_REGS;hr++) {
2786     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2787   }
2788   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2789   if(s>=0) {
2790     c=(i_regs->wasconst>>s)&1;
2791     if (c) {
2792       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2793       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2794     }
2795   }
2796   //printf("load_assemble: c=%d\n",c);
2797   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2798   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2799 #ifdef PCSX
2800   if(tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80)
2801     ||rt1[i]==0) {
2802       // could be FIFO, must perform the read
2803       // ||dummy read
2804       assem_debug("(forced read)\n");
2805       tl=get_reg(i_regs->regmap,-1);
2806       assert(tl>=0);
2807   }
2808 #endif
2809   if(offset||s<0||c) addr=tl;
2810   else addr=s;
2811   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2812  if(tl>=0) {
2813   //printf("load_assemble: c=%d\n",c);
2814   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2815   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2816   reglist&=~(1<<tl);
2817   if(th>=0) reglist&=~(1<<th);
2818   if(!using_tlb) {
2819     if(!c) {
2820       #ifdef RAM_OFFSET
2821       map=get_reg(i_regs->regmap,ROREG);
2822       if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2823       #endif
2824 //#define R29_HACK 1
2825       #ifdef R29_HACK
2826       // Strmnnrmn's speed hack
2827       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2828       #endif
2829       {
2830         #ifdef PCSX
2831         if(sp_in_mirror&&rs1[i]==29) {
2832           emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
2833           emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
2834         }
2835         else
2836         #endif
2837         emit_cmpimm(addr,RAM_SIZE);
2838         jaddr=(int)out;
2839         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2840         // Hint to branch predictor that the branch is unlikely to be taken
2841         if(rs1[i]>=28)
2842           emit_jno_unlikely(0);
2843         else
2844         #endif
2845         emit_jno(0);
2846       }
2847     }
2848   }else{ // using tlb
2849     int x=0;
2850     if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2851     if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2852     map=get_reg(i_regs->regmap,TLREG);
2853     assert(map>=0);
2854     map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2855     do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2856   }
2857   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2858   if (opcode[i]==0x20) { // LB
2859     if(!c||memtarget) {
2860       if(!dummy) {
2861         #ifdef HOST_IMM_ADDR32
2862         if(c)
2863           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2864         else
2865         #endif
2866         {
2867           //emit_xorimm(addr,3,tl);
2868           //gen_tlb_addr_r(tl,map);
2869           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2870           int x=0,a=tl;
2871 #ifdef BIG_ENDIAN_MIPS
2872           if(!c) emit_xorimm(addr,3,tl);
2873           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2874 #else
2875           if(!c) a=addr;
2876 #endif
2877 #ifdef PCSX
2878           if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2879 #endif
2880           emit_movsbl_indexed_tlb(x,a,map,tl);
2881         }
2882       }
2883       if(jaddr)
2884         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2885     }
2886     else
2887       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2888   }
2889   if (opcode[i]==0x21) { // LH
2890     if(!c||memtarget) {
2891       if(!dummy) {
2892         #ifdef HOST_IMM_ADDR32
2893         if(c)
2894           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2895         else
2896         #endif
2897         {
2898           int x=0,a=tl;
2899 #ifdef BIG_ENDIAN_MIPS
2900           if(!c) emit_xorimm(addr,2,tl);
2901           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2902 #else
2903           if(!c) a=addr;
2904 #endif
2905 #ifdef PCSX
2906           if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2907 #endif
2908           //#ifdef
2909           //emit_movswl_indexed_tlb(x,tl,map,tl);
2910           //else
2911           if(map>=0) {
2912             gen_tlb_addr_r(a,map);
2913             emit_movswl_indexed(x,a,tl);
2914           }else{
2915             #ifdef RAM_OFFSET
2916             emit_movswl_indexed(x,a,tl);
2917             #else
2918             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2919             #endif
2920           }
2921         }
2922       }
2923       if(jaddr)
2924         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2925     }
2926     else
2927       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2928   }
2929   if (opcode[i]==0x23) { // LW
2930     if(!c||memtarget) {
2931       if(!dummy) {
2932         int a=addr;
2933 #ifdef PCSX
2934         if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2935 #endif
2936         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2937         #ifdef HOST_IMM_ADDR32
2938         if(c)
2939           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2940         else
2941         #endif
2942         emit_readword_indexed_tlb(0,a,map,tl);
2943       }
2944       if(jaddr)
2945         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2946     }
2947     else
2948       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2949   }
2950   if (opcode[i]==0x24) { // LBU
2951     if(!c||memtarget) {
2952       if(!dummy) {
2953         #ifdef HOST_IMM_ADDR32
2954         if(c)
2955           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2956         else
2957         #endif
2958         {
2959           //emit_xorimm(addr,3,tl);
2960           //gen_tlb_addr_r(tl,map);
2961           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2962           int x=0,a=tl;
2963 #ifdef BIG_ENDIAN_MIPS
2964           if(!c) emit_xorimm(addr,3,tl);
2965           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2966 #else
2967           if(!c) a=addr;
2968 #endif
2969 #ifdef PCSX
2970           if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2971 #endif
2972           emit_movzbl_indexed_tlb(x,a,map,tl);
2973         }
2974       }
2975       if(jaddr)
2976         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2977     }
2978     else
2979       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2980   }
2981   if (opcode[i]==0x25) { // LHU
2982     if(!c||memtarget) {
2983       if(!dummy) {
2984         #ifdef HOST_IMM_ADDR32
2985         if(c)
2986           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2987         else
2988         #endif
2989         {
2990           int x=0,a=tl;
2991 #ifdef BIG_ENDIAN_MIPS
2992           if(!c) emit_xorimm(addr,2,tl);
2993           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2994 #else
2995           if(!c) a=addr;
2996 #endif
2997 #ifdef PCSX
2998           if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2999 #endif
3000           //#ifdef
3001           //emit_movzwl_indexed_tlb(x,tl,map,tl);
3002           //#else
3003           if(map>=0) {
3004             gen_tlb_addr_r(a,map);
3005             emit_movzwl_indexed(x,a,tl);
3006           }else{
3007             #ifdef RAM_OFFSET
3008             emit_movzwl_indexed(x,a,tl);
3009             #else
3010             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
3011             #endif
3012           }
3013         }
3014       }
3015       if(jaddr)
3016         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3017     }
3018     else
3019       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3020   }
3021   if (opcode[i]==0x27) { // LWU
3022     assert(th>=0);
3023     if(!c||memtarget) {
3024       if(!dummy) {
3025         int a=addr;
3026 #ifdef PCSX
3027         if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3028 #endif
3029         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
3030         #ifdef HOST_IMM_ADDR32
3031         if(c)
3032           emit_readword_tlb(constmap[i][s]+offset,map,tl);
3033         else
3034         #endif
3035         emit_readword_indexed_tlb(0,a,map,tl);
3036       }
3037       if(jaddr)
3038         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3039     }
3040     else {
3041       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3042     }
3043     emit_zeroreg(th);
3044   }
3045   if (opcode[i]==0x37) { // LD
3046     if(!c||memtarget) {
3047       if(!dummy) {
3048         int a=addr;
3049 #ifdef PCSX
3050         if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3051 #endif
3052         //gen_tlb_addr_r(tl,map);
3053         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
3054         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
3055         #ifdef HOST_IMM_ADDR32
3056         if(c)
3057           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3058         else
3059         #endif
3060         emit_readdword_indexed_tlb(0,a,map,th,tl);
3061       }
3062       if(jaddr)
3063         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3064     }
3065     else
3066       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3067   }
3068  }
3069   //emit_storereg(rt1[i],tl); // DEBUG
3070   //if(opcode[i]==0x23)
3071   //if(opcode[i]==0x24)
3072   //if(opcode[i]==0x23||opcode[i]==0x24)
3073   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
3074   {
3075     //emit_pusha();
3076     save_regs(0x100f);
3077         emit_readword((int)&last_count,ECX);
3078         #ifdef __i386__
3079         if(get_reg(i_regs->regmap,CCREG)<0)
3080           emit_loadreg(CCREG,HOST_CCREG);
3081         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3082         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3083         emit_writeword(HOST_CCREG,(int)&Count);
3084         #endif
3085         #ifdef __arm__
3086         if(get_reg(i_regs->regmap,CCREG)<0)
3087           emit_loadreg(CCREG,0);
3088         else
3089           emit_mov(HOST_CCREG,0);
3090         emit_add(0,ECX,0);
3091         emit_addimm(0,2*ccadj[i],0);
3092         emit_writeword(0,(int)&Count);
3093         #endif
3094     emit_call((int)memdebug);
3095     //emit_popa();
3096     restore_regs(0x100f);
3097   }/**/
3098 }
3099
#ifndef loadlr_assemble
/*
 * Fallback definition, compiled only when no macro named loadlr_assemble
 * has been defined before this point.  It emits nothing: it prints a
 * diagnostic and terminates the process with exit status 1.
 */
void loadlr_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  printf("Need loadlr_assemble for this architecture.\n");
  exit(1);
}
#endif
3107
// store_assemble: emit host code for the MIPS store at instruction index i
// (SB 0x28, SH 0x29, SW 0x2B, SD 0x3F, as tested on opcode[i] below), using
// the guest->host register mapping in i_regs->regmap.
//
// Outline of what is emitted:
//   1. an address check (RAM_SIZE compare + branch, or the TLB helpers),
//   2. the store itself on the fast path,
//   3. without TLB: a check against invalid_code for the written address,
//   4. registration of the slow-path stub (add_stub) or, for a constant
//      address that is not a RAM target, an inline write stub.
3108 void store_assemble(int i,struct regstat *i_regs)
3109 {
3110   int s,th,tl,map=-1;
3111   int addr,temp;
3112   int offset;
3113   int jaddr=0,jaddr2,type;
3114   int memtarget=0,c=0;
3115   int agr=AGEN1+(i&1);
3116   u_int hr,reglist=0;
     // th/tl: host registers mapped to rs2[i]|64 and rs2[i] (the value being
     // stored; th is only required for SD, see the assert in that case).
     // s: host register mapped to the base register rs1[i], or <0 if unmapped.
3117   th=get_reg(i_regs->regmap,rs2[i]|64);
3118   tl=get_reg(i_regs->regmap,rs2[i]);
3119   s=get_reg(i_regs->regmap,rs1[i]);
     // temp: the address-generation register for this instruction, falling
     // back to whatever host register is mapped to -1.
3120   temp=get_reg(i_regs->regmap,agr);
3121   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3122   offset=imm[i];
3123   if(s>=0) {
       // c: the base register is flagged in wasconst, so the effective address
       // constmap[i][s]+offset is known at translation time.
3124     c=(i_regs->wasconst>>s)&1;
3125     if(c) {
         // memtarget: the constant address lies below 0x80000000+RAM_SIZE
         // (signed comparison); with TLB, addresses >=0xC0000000 also count.
3126       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3127       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3128     }
3129   }
3130   assert(tl>=0);
3131   assert(temp>=0);
     // reglist: bitmask of host registers that currently hold a mapping; it is
     // passed to the stubs below.  HOST_CCREG is dropped when it holds CCREG.
3132   for(hr=0;hr<HOST_REGS;hr++) {
3133     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3134   }
3135   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
     // addr: register holding the effective address.  The base register is
     // used directly only when it is mapped, non-constant and offset is 0.
3136   if(offset||s<0||c) addr=temp;
3137   else addr=s;
3138   if(!using_tlb) {
3139     if(!c) {
3140       #ifdef PCSX
         // Base is r29 with sp_in_mirror set: mask bits 0x00e00000 out of the
         // address into HOST_TEMPREG and range-check the masked value.  The
         // store cases below then use HOST_TEMPREG as the address register.
3141       if(sp_in_mirror&&rs1[i]==29) {
3142         emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
3143         emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
3144       }
3145       else
3146       #endif
3147       #ifdef R29_HACK
3148       // Strmnnrmn's speed hack
         // (the compare is skipped for r29-based stores when the block start
         // lies in 0x80001000..0x80000000+RAM_SIZE)
3149       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3150       #endif
3151       emit_cmpimm(addr,RAM_SIZE);
3152       #ifdef DESTRUCTIVE_SHIFT
3153       if(s==addr) emit_mov(s,temp);
3154       #endif
3155       #ifdef R29_HACK
3156       memtarget=1;
3157       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3158       #endif
3159       {
           // Remember where the branch is emitted; its target (0) is a
           // placeholder and jaddr is handed to add_stub at the end.
3160         jaddr=(int)out;
3161         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
3162         // Hint to branch predictor that the branch is unlikely to be taken
3163         if(rs1[i]>=28)
3164           emit_jno_unlikely(0);
3165         else
3166         #endif
3167         emit_jno(0);
3168       }
3169     }
3170   }else{ // using tlb
3171     int x=0;
3172     if (opcode[i]==0x28) x=3; // SB
3173     if (opcode[i]==0x29) x=2; // SH
3174     map=get_reg(i_regs->regmap,TLREG);
3175     assert(map>=0);
       // NOTE(review): s may be negative here (see the s>=0 test above), in
       // which case constmap[i][s] is read out of bounds; presumably the value
       // is ignored by the helpers when c==0 - confirm.
3176     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
3177     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3178   }
3179
     // Fast-path store, one case per opcode.  Each case is skipped for a
     // constant address that is not a memory target (c&&!memtarget); 'type'
     // selects the stub kind used at the end.
     // NOTE(review): 'type' stays uninitialized if opcode[i] matches none of
     // the four cases - presumably callers never pass anything else.
3180   if (opcode[i]==0x28) { // SB
3181     if(!c||memtarget) {
3182       int x=0,a=temp;
3183 #ifdef BIG_ENDIAN_MIPS
         // Byte lane swap: address^3, either emitted at run time or folded
         // into the displacement x for a constant address.
3184       if(!c) emit_xorimm(addr,3,temp);
3185       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3186 #else
3187       if(!c) a=addr;
3188 #endif
3189 #ifdef PCSX
3190       if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3191 #endif
3192       //gen_tlb_addr_w(temp,map);
3193       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3194       emit_writebyte_indexed_tlb(tl,x,a,map,a);
3195     }
3196     type=STOREB_STUB;
3197   }
3198   if (opcode[i]==0x29) { // SH
3199     if(!c||memtarget) {
3200       int x=0,a=temp;
3201 #ifdef BIG_ENDIAN_MIPS
         // Halfword lane swap: address^2, same scheme as SB above.
3202       if(!c) emit_xorimm(addr,2,temp);
3203       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3204 #else
3205       if(!c) a=addr;
3206 #endif
3207 #ifdef PCSX
3208       if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3209 #endif
3210       //#ifdef
3211       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3212       //#else
         // With a map register the address is translated in place first;
         // otherwise the store is offset by (rdram-0x80000000).
3213       if(map>=0) {
3214         gen_tlb_addr_w(a,map);
3215         emit_writehword_indexed(tl,x,a);
3216       }else
3217         emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
3218     }
3219     type=STOREH_STUB;
3220   }
3221   if (opcode[i]==0x2B) { // SW
3222     if(!c||memtarget) {
3223       int a=addr;
3224 #ifdef PCSX
3225       if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3226 #endif
3227       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3228       emit_writeword_indexed_tlb(tl,0,a,map,temp);
3229     }
3230     type=STOREW_STUB;
3231   }
3232   if (opcode[i]==0x3F) { // SD
3233     if(!c||memtarget) {
3234       int a=addr;
3235 #ifdef PCSX
3236       if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3237 #endif
3238       if(rs2[i]) {
3239         assert(th>=0);
3240         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3241         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3242         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
3243       }else{
3244         // Store zero
           // (rs2 is register 0, so tl is passed for both halves)
3245         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3246         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3247         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
3248       }
3249     }
3250     type=STORED_STUB;
3251   }
     // Without TLB, follow the store with a check against invalid_code for
     // the written address.  On a mismatch either a conditional call to
     // invalidate_addr_reg[addr] is emitted or a branch to an INVCODE_STUB.
     // NOTE(review): the helper name suggests the table is indexed by
     // addr>>12; its exact semantics are not visible here - confirm.
3252   if(!using_tlb) {
3253     if(!c||memtarget) {
3254       #ifdef DESTRUCTIVE_SHIFT
3255       // The x86 shift operation is 'destructive'; it overwrites the
3256       // source register, so we need to make a copy first and use that.
3257       addr=temp;
3258       #endif
3259       #if defined(HOST_IMM8)
3260       int ir=get_reg(i_regs->regmap,INVCP);
3261       assert(ir>=0);
3262       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3263       #else
3264       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3265       #endif
3266       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3267       emit_callne(invalidate_addr_reg[addr]);
3268       #else
3269       jaddr2=(int)out;
3270       emit_jne(0);
3271       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3272       #endif
3273     }
3274   }
     // Slow path: if a range/TLB branch was emitted, register its stub;
     // otherwise, for a constant address that is not a memory target, emit
     // the write stub inline.
3275   if(jaddr) {
3276     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3277   } else if(c&&!memtarget) {
3278     inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
3279   }
3280   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3281   //if(opcode[i]==0x2B || opcode[i]==0x28)
3282   //if(opcode[i]==0x2B || opcode[i]==0x29)
3283   //if(opcode[i]==0x2B)
3284   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3285   {
3286     //emit_pusha();
3287     save_regs(0x100f);
3288         emit_readword((int)&last_count,ECX);
3289         #ifdef __i386__
3290         if(get_reg(i_regs->regmap,CCREG)<0)
3291           emit_loadreg(CCREG,HOST_CCREG);
3292         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3293         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3294         emit_writeword(HOST_CCREG,(int)&Count);
3295         #endif
3296         #ifdef __arm__
3297         if(get_reg(i_regs->regmap,CCREG)<0)
3298           emit_loadreg(CCREG,0);
3299         else
3300           emit_mov(HOST_CCREG,0);
3301         emit_add(0,ECX,0);
3302         emit_addimm(0,2*ccadj[i],0);
3303         emit_writeword(0,(int)&Count);
3304         #endif
3305     emit_call((int)memdebug);
3306     //emit_popa();
3307     restore_regs(0x100f);
3308   }/**/
3309 }
3310
3311 void storelr_assemble(int i,struct regstat *i_regs)
3312 {
3313   int s,th,tl;
3314   int temp;
3315   int temp2;
3316   int offset;
3317   int jaddr=0,jaddr2;
3318   int case1,case2,case3;
3319   int done0,done1,done2;
3320   int memtarget=0,c=0;
3321   int agr=AGEN1+(i&1);
3322   u_int hr,reglist=0;
3323   th=get_reg(i_regs->regmap,rs2[i]|64);
3324   tl=get_reg(i_regs->regmap,rs2[i]);
3325   s=get_reg(i_regs->regmap,rs1[i]);
3326   temp=get_reg(i_regs->regmap,agr);
3327   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3328   offset=imm[i];
3329   if(s>=0) {
3330     c=(i_regs->isconst>>s)&1;
3331     if(c) {
3332       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3333       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3334     }
3335   }
3336   assert(tl>=0);
3337   for(hr=0;hr<HOST_REGS;hr++) {
3338     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3339   }
3340   assert(temp>=0);
3341   if(!using_tlb) {
3342     if(!c) {
3343       emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3344       if(!offset&&s!=temp) emit_mov(s,temp);
3345       jaddr=(int)out;
3346       emit_jno(0);
3347     }
3348     else
3349     {
3350       if(!memtarget||!rs1[i]) {
3351         jaddr=(int)out;
3352         emit_jmp(0);
3353       }
3354     }
3355     #ifdef RAM_OFFSET
3356     int map=get_reg(i_regs->regmap,ROREG);
3357     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3358     gen_tlb_addr_w(temp,map);
3359     #else
3360     if((u_int)rdram!=0x80000000) 
3361       emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3362     #endif
3363   }else{ // using tlb
3364     int map=get_reg(i_regs->regmap,TLREG);
3365     assert(map>=0);
3366     map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3367     if(!c&&!offset&&s>=0) emit_mov(s,temp);
3368     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3369     if(!jaddr&&!memtarget) {
3370       jaddr=(int)out;
3371       emit_jmp(0);
3372     }
3373     gen_tlb_addr_w(temp,map);
3374   }
3375
3376   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3377     temp2=get_reg(i_regs->regmap,FTEMP);
3378     if(!rs2[i]) temp2=th=tl;
3379   }
3380
3381 #ifndef BIG_ENDIAN_MIPS
3382     emit_xorimm(temp,3,temp);
3383 #endif
3384   emit_testimm(temp,2);
3385   case2=(int)out;
3386   emit_jne(0);
3387   emit_testimm(temp,1);
3388   case1=(int)out;
3389   emit_jne(0);
3390   // 0
3391   if (opcode[i]==0x2A) { // SWL
3392     emit_writeword_indexed(tl,0,temp);
3393   }
3394   if (opcode[i]==0x2E) { // SWR
3395     emit_writebyte_indexed(tl,3,temp);
3396   }
3397   if (opcode[i]==0x2C) { // SDL
3398     emit_writeword_indexed(th,0,temp);
3399     if(rs2[i]) emit_mov(tl,temp2);
3400   }
3401   if (opcode[i]==0x2D) { // SDR
3402     emit_writebyte_indexed(tl,3,temp);
3403     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3404   }
3405   done0=(int)out;
3406   emit_jmp(0);
3407   // 1
3408   set_jump_target(case1,(int)out);
3409   if (opcode[i]==0x2A) { // SWL
3410     // Write 3 msb into three least significant bytes
3411     if(rs2[i]) emit_rorimm(tl,8,tl);
3412     emit_writehword_indexed(tl,-1,temp);
3413     if(rs2[i]) emit_rorimm(tl,16,tl);
3414     emit_writebyte_indexed(tl,1,temp);
3415     if(rs2[i]) emit_rorimm(tl,8,tl);
3416   }
3417   if (opcode[i]==0x2E) { // SWR
3418     // Write two lsb into two most significant bytes
3419     emit_writehword_indexed(tl,1,temp);
3420   }
3421   if (opcode[i]==0x2C) { // SDL
3422     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3423     // Write 3 msb into three least significant bytes
3424     if(rs2[i]) emit_rorimm(th,8,th);
3425     emit_writehword_indexed(th,-1,temp);
3426     if(rs2[i]) emit_rorimm(th,16,th);
3427     emit_writebyte_indexed(th,1,temp);
3428     if(rs2[i]) emit_rorimm(th,8,th);
3429   }
3430   if (opcode[i]==0x2D) { // SDR
3431     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3432     // Write two lsb into two most significant bytes
3433     emit_writehword_indexed(tl,1,temp);
3434   }
3435   done1=(int)out;
3436   emit_jmp(0);
3437   // 2
3438   set_jump_target(case2,(int)out);
3439   emit_testimm(temp,1);
3440   case3=(int)out;
3441   emit_jne(0);
3442   if (opcode[i]==0x2A) { // SWL
3443     // Write two msb into two least significant bytes
3444     if(rs2[i]) emit_rorimm(tl,16,tl);
3445     emit_writehword_indexed(tl,-2,temp);
3446     if(rs2[i]) emit_rorimm(tl,16,tl);
3447   }
3448   if (opcode[i]==0x2E) { // SWR
3449     // Write 3 lsb into three most significant bytes
3450     emit_writebyte_indexed(tl,-1,temp);
3451     if(rs2[i]) emit_rorimm(tl,8,tl);
3452     emit_writehword_indexed(tl,0,temp);
3453     if(rs2[i]) emit_rorimm(tl,24,tl);
3454   }
3455   if (opcode[i]==0x2C) { // SDL
3456     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3457     // Write two msb into two least significant bytes
3458     if(rs2[i]) emit_rorimm(th,16,th);
3459     emit_writehword_indexed(th,-2,temp);
3460     if(rs2[i]) emit_rorimm(th,16,th);
3461   }
3462   if (opcode[i]==0x2D) { // SDR
3463     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3464     // Write 3 lsb into three most significant bytes
3465     emit_writebyte_indexed(tl,-1,temp);
3466     if(rs2[i]) emit_rorimm(tl,8,tl);
3467     emit_writehword_indexed(tl,0,temp);
3468     if(rs2[i]) emit_rorimm(tl,24,tl);
3469   }
3470   done2=(int)out;
3471   emit_jmp(0);
3472   // 3
3473   set_jump_target(case3,(int)out);
3474   if (opcode[i]==0x2A) { // SWL
3475     // Write msb into least significant byte
3476     if(rs2[i]) emit_rorimm(tl,24,tl);
3477     emit_writebyte_indexed(tl,-3,temp);
3478     if(rs2[i]) emit_rorimm(tl,8,tl);
3479   }
3480   if (opcode[i]==0x2E) { // SWR
3481     // Write entire word
3482     emit_writeword_indexed(tl,-3,temp);
3483   }
3484   if (opcode[i]==0x2C) { // SDL
3485     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3486     // Write msb into least significant byte
3487     if(rs2[i]) emit_rorimm(th,24,th);
3488     emit_writebyte_indexed(th,-3,temp);
3489     if(rs2[i]) emit_rorimm(th,8,th);
3490   }
3491   if (opcode[i]==0x2D) { // SDR
3492     if(rs2[i]) emit_mov(th,temp2);
3493     // Write entire word
3494     emit_writeword_indexed(tl,-3,temp);
3495   }
3496   set_jump_target(done0,(int)out);
3497   set_jump_target(done1,(int)out);
3498   set_jump_target(done2,(int)out);
3499   if (opcode[i]==0x2C) { // SDL
3500     emit_testimm(temp,4);
3501     done0=(int)out;
3502     emit_jne(0);
3503     emit_andimm(temp,~3,temp);
3504     emit_writeword_indexed(temp2,4,temp);
3505     set_jump_target(done0,(int)out);
3506   }
3507   if (opcode[i]==0x2D) { // SDR
3508     emit_testimm(temp,4);
3509     done0=(int)out;
3510     emit_jeq(0);
3511     emit_andimm(temp,~3,temp);
3512     emit_writeword_indexed(temp2,-4,temp);
3513     set_jump_target(done0,(int)out);
3514   }
3515   if(!c||!memtarget)
3516     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3517   if(!using_tlb) {
3518     #ifdef RAM_OFFSET
3519     int map=get_reg(i_regs->regmap,ROREG);
3520     if(map<0) map=HOST_TEMPREG;
3521     gen_orig_addr_w(temp,map);
3522     #else
3523     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3524     #endif
3525     #if defined(HOST_IMM8)
3526     int ir=get_reg(i_regs->regmap,INVCP);
3527     assert(ir>=0);
3528     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3529     #else
3530     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3531     #endif
3532     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3533     emit_callne(invalidate_addr_reg[temp]);
3534     #else
3535     jaddr2=(int)out;
3536     emit_jne(0);
3537     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3538     #endif
3539   }
3540   /*
3541     emit_pusha();
3542     //save_regs(0x100f);
3543         emit_readword((int)&last_count,ECX);
3544         if(get_reg(i_regs->regmap,CCREG)<0)
3545           emit_loadreg(CCREG,HOST_CCREG);
3546         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3547         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3548         emit_writeword(HOST_CCREG,(int)&Count);
3549     emit_call((int)memdebug);
3550     emit_popa();
3551     //restore_regs(0x100f);
3552   /**/
3553 }
3554
// c1ls_assemble: emit host code for the coprocessor-1 load/store at
// instruction index i: LWC1 (0x31), LDC1 (0x35), SWC1 (0x39), SDC1 (0x3D).
// When DISABLE_COP1 is defined the whole body is replaced by a call to
// cop1_unusable(i, i_regs).
//
// The FP register is reached through a pointer read from reg_cop1_simple[]
// (single) or reg_cop1_double[] (double), indexed by bits 16..20 of
// source[i].  Stores first read the value through that pointer into
// tl (and th for doubles); loads write tl/th back through it at the end.
3555 void c1ls_assemble(int i,struct regstat *i_regs)
3556 {
3557 #ifndef DISABLE_COP1
3558   int s,th,tl;
3559   int temp,ar;
3560   int map=-1;
3561   int offset;
3562   int c=0;
3563   int jaddr,jaddr2=0,jaddr3,type;
3564   int agr=AGEN1+(i&1);
3565   u_int hr,reglist=0;
     // th/tl: host registers mapped to FTEMP|64 and FTEMP (data in transit)
     // s: host register of the base rs1[i], <0 when unmapped
     // temp: address-generation register, or the one mapped to -1
3566   th=get_reg(i_regs->regmap,FTEMP|64);
3567   tl=get_reg(i_regs->regmap,FTEMP);
3568   s=get_reg(i_regs->regmap,rs1[i]);
3569   temp=get_reg(i_regs->regmap,agr);
3570   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3571   offset=imm[i];
3572   assert(tl>=0);
3573   assert(rs1[i]>0);
3574   assert(temp>=0);
     // reglist: bitmask of mapped host registers, passed to the stubs;
     // HOST_CCREG is dropped when it holds CCREG.
3575   for(hr=0;hr<HOST_REGS;hr++) {
3576     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3577   }
3578   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3579   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3580   {
3581     // Loads use a temporary register which we need to save
3582     reglist|=1<<temp;
3583   }
     // ar: register holding the effective address when the base register
     // cannot be used directly - temp for stores, tl for loads.
3584   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3585     ar=temp;
3586   else // LWC1/LDC1
3587     ar=tl;
3588   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3589   //else c=(i_regs->wasconst>>s)&1;
3590   if(s>=0) c=(i_regs->wasconst>>s)&1;
3591   // Check cop1 unusable
     // Emitted only once per cop1_usable reset: test bit 0x20000000 of the
     // register mapped to CSREG and branch to an FP_STUB when it is clear.
3592   if(!cop1_usable) {
3593     signed char rs=get_reg(i_regs->regmap,CSREG);
3594     assert(rs>=0);
3595     emit_testimm(rs,0x20000000);
3596     jaddr=(int)out;
3597     emit_jeq(0);
3598     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3599     cop1_usable=1;
3600   }
3601   if (opcode[i]==0x39) { // SWC1 (get float address)
3602     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3603   }
3604   if (opcode[i]==0x3D) { // SDC1 (get double address)
3605     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3606   }
3607   // Generate address + offset
     // Without TLB only the compare is emitted here; the matching branch is
     // emitted further down, after the FP data/pointer reads.
     // NOTE(review): this presumably relies on those emit_readword* calls
     // leaving the condition flags untouched - confirm per backend.
3608   if(!using_tlb) {
3609     if(!c)
3610       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3611   }
3612   else
3613   {
3614     map=get_reg(i_regs->regmap,TLREG);
3615     assert(map>=0);
3616     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3617       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3618     }
3619     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3620       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3621     }
3622   }
3623   if (opcode[i]==0x39) { // SWC1 (read float)
3624     emit_readword_indexed(0,tl,tl);
3625   }
3626   if (opcode[i]==0x3D) { // SDC1 (read double)
       // th is loaded first because the second read overwrites the pointer in tl
3627     emit_readword_indexed(4,tl,th);
3628     emit_readword_indexed(0,tl,tl);
3629   }
3630   if (opcode[i]==0x31) { // LWC1 (get target address)
3631     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3632   }
3633   if (opcode[i]==0x35) { // LDC1 (get target address)
3634     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3635   }
     // Branch to the slow path; jaddr2 records where the branch was emitted
     // so add_stub can be given it below.
3636   if(!using_tlb) {
3637     if(!c) {
3638       jaddr2=(int)out;
3639       emit_jno(0);
3640     }
3641     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80000000+RAM_SIZE) {
3642       jaddr2=(int)out;
3643       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3644     }
3645     #ifdef DESTRUCTIVE_SHIFT
3646     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3647       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3648     }
3649     #endif
3650   }else{
3651     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3652       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3653     }
3654     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3655       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3656     }
3657   }
     // Fast-path memory access, one case per opcode; 'type' selects the stub
     // kind used for the slow path.
3658   if (opcode[i]==0x31) { // LWC1
3659     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3660     //gen_tlb_addr_r(ar,map);
3661     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3662     #ifdef HOST_IMM_ADDR32
3663     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3664     else
3665     #endif
3666     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3667     type=LOADW_STUB;
3668   }
3669   if (opcode[i]==0x35) { // LDC1
3670     assert(th>=0);
3671     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3672     //gen_tlb_addr_r(ar,map);
3673     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3674     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3675     #ifdef HOST_IMM_ADDR32
3676     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3677     else
3678     #endif
3679     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3680     type=LOADD_STUB;
3681   }
3682   if (opcode[i]==0x39) { // SWC1
3683     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3684     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3685     type=STOREW_STUB;
3686   }
3687   if (opcode[i]==0x3D) { // SDC1
3688     assert(th>=0);
3689     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3690     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3691     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3692     type=STORED_STUB;
3693   }
     // Stores without TLB: check invalid_code for the written address and
     // call/branch to the invalidation path on a mismatch.
3694   if(!using_tlb) {
3695     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3696       #ifndef DESTRUCTIVE_SHIFT
         // From here on temp names the register that holds the address
3697       temp=offset||c||s<0?ar:s;
3698       #endif
3699       #if defined(HOST_IMM8)
3700       int ir=get_reg(i_regs->regmap,INVCP);
3701       assert(ir>=0);
3702       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3703       #else
3704       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3705       #endif
3706       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3707       emit_callne(invalidate_addr_reg[temp]);
3708       #else
3709       jaddr3=(int)out;
3710       emit_jne(0);
3711       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3712       #endif
3713     }
3714   }
     // The stub is registered before the load write-back below, so the
     // output position passed to it precedes those writes.
3715   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3716   if (opcode[i]==0x31) { // LWC1 (write float)
3717     emit_writeword_indexed(tl,0,temp);
3718   }
3719   if (opcode[i]==0x35) { // LDC1 (write double)
3720     emit_writeword_indexed(th,4,temp);
3721     emit_writeword_indexed(tl,0,temp);
3722   }
3723   //if(opcode[i]==0x39)
3724   /*if(opcode[i]==0x39||opcode[i]==0x31)
3725   {
3726     emit_pusha();
3727         emit_readword((int)&last_count,ECX);
3728         if(get_reg(i_regs->regmap,CCREG)<0)
3729           emit_loadreg(CCREG,HOST_CCREG);
3730         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3731         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3732         emit_writeword(HOST_CCREG,(int)&Count);
3733     emit_call((int)memdebug);
3734     emit_popa();
3735   }/**/
3736 #else
3737   cop1_unusable(i, i_regs);
3738 #endif
3739 }
3740
// c2ls_assemble: emit host code for the coprocessor-2 load/store at
// instruction index i: LWC2 (0x32) and SWC2 (0x3a).  TLB mode is not
// supported here (asserted below).
//
// The coprocessor data register number is bits 16..20 of source[i].  SWC2
// fetches it into tl with cop2_get_dreg before the store; LWC2 loads into tl
// and hands it to cop2_put_dreg at the end.
3741 void c2ls_assemble(int i,struct regstat *i_regs)
3742 {
3743   int s,tl;
3744   int ar;
3745   int offset;
3746   int memtarget=0,c=0;
3747   int jaddr,jaddr2=0,jaddr3,type;
3748   int agr=AGEN1+(i&1);
3749   u_int hr,reglist=0;
3750   u_int copr=(source[i]>>16)&0x1f;
     // s: host register of the base rs1[i] (<0 if unmapped)
     // tl: host register mapped to FTEMP, carries the data word
3751   s=get_reg(i_regs->regmap,rs1[i]);
3752   tl=get_reg(i_regs->regmap,FTEMP);
3753   offset=imm[i];
3754   assert(rs1[i]>0);
3755   assert(tl>=0);
3756   assert(!using_tlb);
3757
     // reglist: bitmask of mapped host registers, passed to the stubs;
     // HOST_CCREG is dropped when it holds CCREG.
3758   for(hr=0;hr<HOST_REGS;hr++) {
3759     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3760   }
3761   if(i_regs->regmap[HOST_CCREG]==CCREG)
3762     reglist&=~(1<<HOST_CCREG);
3763
3764   // get the address
     // ar: register holding the effective address.  Stores use the
     // address-generation register (or the one mapped to -1) and add it to
     // reglist; loads reuse tl.
     // NOTE(review): reglist|=1<<ar runs before assert(ar>=0) below, so a
     // negative ar would already have been used as a shift count - confirm
     // that ar can never be negative at this point.
3765   if (opcode[i]==0x3a) { // SWC2
3766     ar=get_reg(i_regs->regmap,agr);
3767     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3768     reglist|=1<<ar;
3769   } else { // LWC2
3770     ar=tl;
3771   }
     // c: base flagged in wasconst; memtarget: constant address lies below
     // 0x80000000+RAM_SIZE (signed comparison).  constmap is only read when
     // c is set, i.e. when s>=0.
3772   if(s>=0) c=(i_regs->wasconst>>s)&1;
3773   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
     // A mapped, non-constant base with zero offset is used directly
3774   if (!offset&&!c&&s>=0) ar=s;
3775   assert(ar>=0);
3776
3777   if (opcode[i]==0x3a) { // SWC2
3778     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3779     type=STOREW_STUB;
3780   }
3781   else
3782     type=LOADW_STUB;
3783
     // Constant address that is not a RAM target: jump unconditionally to the
     // stub.  Otherwise emit the range check (non-constant case only) followed
     // by the direct access.
3784   if(c&&!memtarget) {
3785     jaddr2=(int)out;
3786     emit_jmp(0); // inline_readstub/inline_writestub?
3787   }
3788   else {
3789     if(!c) {
3790       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3791       jaddr2=(int)out;
3792       emit_jno(0);
3793     }
3794     if (opcode[i]==0x32) { // LWC2
3795       #ifdef HOST_IMM_ADDR32
3796       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3797       else
3798       #endif
3799       emit_readword_indexed(0,ar,tl);
3800     }
3801     if (opcode[i]==0x3a) { // SWC2
3802       #ifdef DESTRUCTIVE_SHIFT
3803       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3804       #endif
3805       emit_writeword_indexed(tl,0,ar);
3806     }
3807   }
     // Register the slow-path stub for the branch recorded in jaddr2
3808   if(jaddr2)
3809     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
     // Stores: check invalid_code for the written address and call/branch to
     // the invalidation path on a mismatch.  This is emitted for every SWC2,
     // including the constant non-RAM case above.
3810   if (opcode[i]==0x3a) { // SWC2
3811 #if defined(HOST_IMM8)
3812     int ir=get_reg(i_regs->regmap,INVCP);
3813     assert(ir>=0);
3814     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3815 #else
3816     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3817 #endif
3818     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3819     emit_callne(invalidate_addr_reg[ar]);
3820     #else
3821     jaddr3=(int)out;
3822     emit_jne(0);
3823     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3824     #endif
3825   }
3826   if (opcode[i]==0x32) { // LWC2
3827     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3828   }
3829 }
3830
3831 #ifndef multdiv_assemble
/*
 * Placeholder compiled only when no multdiv_assemble macro is defined.
 * Reports that the multiply/divide assembler is missing and terminates
 * the process with exit status 1.  Both arguments are ignored.
 */
void multdiv_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  printf("Need multdiv_assemble for this architecture.\n");
  exit(1);
}
3837 #endif
3838
3839 void mov_assemble(int i,struct regstat *i_regs)
3840 {
3841   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3842   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3843   if(rt1[i]) {
3844     signed char sh,sl,th,tl;
3845     th=get_reg(i_regs->regmap,rt1[i]|64);
3846     tl=get_reg(i_regs->regmap,rt1[i]);
3847     //assert(tl>=0);
3848     if(tl>=0) {
3849       sh=get_reg(i_regs->regmap,rs1[i]|64);
3850       sl=get_reg(i_regs->regmap,rs1[i]);
3851       if(sl>=0) emit_mov(sl,tl);
3852       else emit_loadreg(rs1[i],tl);
3853       if(th>=0) {
3854         if(sh>=0) emit_mov(sh,th);
3855         else emit_loadreg(rs1[i]|64,th);
3856       }
3857     }
3858   }
3859 }
3860
3861 #ifndef fconv_assemble
/*
 * Placeholder compiled only when no fconv_assemble macro is defined.
 * Reports that the float-conversion assembler is missing and terminates
 * the process with exit status 1.  Both arguments are ignored.
 */
void fconv_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  printf("Need fconv_assemble for this architecture.\n");
  exit(1);
}
3867 #endif
3868
3869 #if 0
/*
 * Placeholder float assembler (currently disabled by the surrounding
 * "#if 0").  Prints a diagnostic and terminates the process with exit
 * status 1.  Both arguments are ignored.
 */
void float_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  printf("Need float_assemble for this architecture.\n");
  exit(1);
}
3875 #endif
3876
3877 void syscall_assemble(int i,struct regstat *i_regs)
3878 {
3879   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3880   assert(ccreg==HOST_CCREG);
3881   assert(!is_delayslot);
3882   emit_movimm(start+i*4,EAX); // Get PC
3883   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3884   emit_jmp((int)jump_syscall_hle); // XXX
3885 }
3886
3887 void hlecall_assemble(int i,struct regstat *i_regs)
3888 {
3889   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3890   assert(ccreg==HOST_CCREG);
3891   assert(!is_delayslot);
3892   emit_movimm(start+i*4+4,0); // Get PC
3893   emit_movimm((int)psxHLEt[source[i]&7],1);
3894   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // XXX
3895   emit_jmp((int)jump_hlecall);
3896 }
3897
3898 void intcall_assemble(int i,struct regstat *i_regs)
3899 {
3900   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3901   assert(ccreg==HOST_CCREG);
3902   assert(!is_delayslot);
3903   emit_movimm(start+i*4,0); // Get PC
3904   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG);
3905   emit_jmp((int)jump_intcall);
3906 }
3907
3908 void ds_assemble(int i,struct regstat *i_regs)
3909 {
3910   is_delayslot=1;
3911   switch(itype[i]) {
3912     case ALU:
3913       alu_assemble(i,i_regs);break;
3914     case IMM16:
3915       imm16_assemble(i,i_regs);break;
3916     case SHIFT:
3917       shift_assemble(i,i_regs);break;
3918     case SHIFTIMM:
3919       shiftimm_assemble(i,i_regs);break;
3920     case LOAD:
3921       load_assemble(i,i_regs);break;
3922     case LOADLR:
3923       loadlr_assemble(i,i_regs);break;
3924     case STORE:
3925       store_assemble(i,i_regs);break;
3926     case STORELR:
3927       storelr_assemble(i,i_regs);break;
3928     case COP0:
3929       cop0_assemble(i,i_regs);break;
3930     case COP1:
3931       cop1_assemble(i,i_regs);break;
3932     case C1LS:
3933       c1ls_assemble(i,i_regs);break;
3934     case COP2:
3935       cop2_assemble(i,i_regs);break;
3936     case C2LS:
3937       c2ls_assemble(i,i_regs);break;
3938     case C2OP:
3939       c2op_assemble(i,i_regs);break;
3940     case FCONV:
3941       fconv_assemble(i,i_regs);break;
3942     case FLOAT:
3943       float_assemble(i,i_regs);break;
3944     case FCOMP:
3945       fcomp_assemble(i,i_regs);break;
3946     case MULTDIV:
3947       multdiv_assemble(i,i_regs);break;
3948     case MOV:
3949       mov_assemble(i,i_regs);break;
3950     case SYSCALL:
3951     case HLECALL:
3952     case INTCALL:
3953     case SPAN:
3954     case UJUMP:
3955     case RJUMP:
3956     case CJUMP:
3957     case SJUMP:
3958     case FJUMP:
3959       printf("Jump in the delay slot.  This is probably a bug.\n");
3960   }
3961   is_delayslot=0;
3962 }
3963
3964 // Is the branch target a valid internal jump?
3965 int internal_branch(uint64_t i_is32,int addr)
3966 {
3967   if(addr&1) return 0; // Indirect (register) jump
3968   if(addr>=start && addr<start+slen*4-4)
3969   {
3970     int t=(addr-start)>>2;
3971     // Delay slots are not valid branch targets
3972     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3973     // 64 -> 32 bit transition requires a recompile
3974     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3975     {
3976       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3977       else printf("optimizable: yes\n");
3978     }*/
3979     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3980 #ifndef FORCE32
3981     if(requires_32bit[t]&~i_is32) return 0;
3982     else
3983 #endif
3984       return 1;
3985   }
3986   return 0;
3987 }
3988
3989 #ifndef wb_invalidate
3990 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3991   uint64_t u,uint64_t uu)
3992 {
3993   int hr;
3994   for(hr=0;hr<HOST_REGS;hr++) {
3995     if(hr!=EXCLUDE_REG) {
3996       if(pre[hr]!=entry[hr]) {
3997         if(pre[hr]>=0) {
3998           if((dirty>>hr)&1) {
3999             if(get_reg(entry,pre[hr])<0) {
4000               if(pre[hr]<64) {
4001                 if(!((u>>pre[hr])&1)) {
4002                   emit_storereg(pre[hr],hr);
4003                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
4004                     emit_sarimm(hr,31,hr);
4005                     emit_storereg(pre[hr]|64,hr);
4006                   }
4007                 }
4008               }else{
4009                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
4010                   emit_storereg(pre[hr],hr);
4011                 }
4012               }
4013             }
4014           }
4015         }
4016       }
4017     }
4018   }
4019   // Move from one register to another (no writeback)
4020   for(hr=0;hr<HOST_REGS;hr++) {
4021     if(hr!=EXCLUDE_REG) {
4022       if(pre[hr]!=entry[hr]) {
4023         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
4024           int nr;
4025           if((nr=get_reg(entry,pre[hr]))>=0) {
4026             emit_mov(hr,nr);
4027           }
4028         }
4029       }
4030     }
4031   }
4032 }
4033 #endif
4034
4035 // Load the specified registers
4036 // This only loads the registers given as arguments because
4037 // we don't want to load things that will be overwritten
4038 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
4039 {
4040   int hr;
4041   // Load 32-bit regs
4042   for(hr=0;hr<HOST_REGS;hr++) {
4043     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4044       if(entry[hr]!=regmap[hr]) {
4045         if(regmap[hr]==rs1||regmap[hr]==rs2)
4046         {
4047           if(regmap[hr]==0) {
4048             emit_zeroreg(hr);
4049           }
4050           else
4051           {
4052             emit_loadreg(regmap[hr],hr);
4053           }
4054         }
4055       }
4056     }
4057   }
4058   //Load 64-bit regs
4059   for(hr=0;hr<HOST_REGS;hr++) {
4060     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4061       if(entry[hr]!=regmap[hr]) {
4062         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
4063         {
4064           assert(regmap[hr]!=64);
4065           if((is32>>(regmap[hr]&63))&1) {
4066             int lr=get_reg(regmap,regmap[hr]-64);
4067             if(lr>=0)
4068               emit_sarimm(lr,31,hr);
4069             else
4070               emit_loadreg(regmap[hr],hr);
4071           }
4072           else
4073           {
4074             emit_loadreg(regmap[hr],hr);
4075           }
4076         }
4077       }
4078     }
4079   }
4080 }
4081
4082 // Load registers prior to the start of a loop
4083 // so that they are not loaded within the loop
4084 static void loop_preload(signed char pre[],signed char entry[])
4085 {
4086   int hr;
4087   for(hr=0;hr<HOST_REGS;hr++) {
4088     if(hr!=EXCLUDE_REG) {
4089       if(pre[hr]!=entry[hr]) {
4090         if(entry[hr]>=0) {
4091           if(get_reg(pre,entry[hr])<0) {
4092             assem_debug("loop preload:\n");
4093             //printf("loop preload: %d\n",hr);
4094             if(entry[hr]==0) {
4095               emit_zeroreg(hr);
4096             }
4097             else if(entry[hr]<TEMPREG)
4098             {
4099               emit_loadreg(entry[hr],hr);
4100             }
4101             else if(entry[hr]-64<TEMPREG)
4102             {
4103               emit_loadreg(entry[hr],hr);
4104             }
4105           }
4106         }
4107       }
4108     }
4109   }
4110 }
4111
4112 // Generate address for load/store instruction
4113 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
4114 void address_generation(int i,struct regstat *i_regs,signed char entry[])
4115 {
4116   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
4117     int ra;
4118     int agr=AGEN1+(i&1);
4119     int mgr=MGEN1+(i&1);
4120     if(itype[i]==LOAD) {
4121       ra=get_reg(i_regs->regmap,rt1[i]);
4122       if(ra<0) ra=get_reg(i_regs->regmap,-1); 
4123       assert(ra>=0);
4124     }
4125     if(itype[i]==LOADLR) {
4126       ra=get_reg(i_regs->regmap,FTEMP);
4127     }
4128     if(itype[i]==STORE||itype[i]==STORELR) {
4129       ra=get_reg(i_regs->regmap,agr);
4130       if(ra<0) ra=get_reg(i_regs->regmap,-1);
4131     }
4132     if(itype[i]==C1LS||itype[i]==C2LS) {
4133       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
4134         ra=get_reg(i_regs->regmap,FTEMP);
4135       else { // SWC1/SDC1/SWC2/SDC2
4136         ra=get_reg(i_regs->regmap,agr);
4137         if(ra<0) ra=get_reg(i_regs->regmap,-1);
4138       }
4139     }
4140     int rs=get_reg(i_regs->regmap,rs1[i]);
4141     int rm=get_reg(i_regs->regmap,TLREG);
4142     if(ra>=0) {
4143       int offset=imm[i];
4144       int c=(i_regs->wasconst>>rs)&1;
4145       if(rs1[i]==0) {
4146         // Using r0 as a base address
4147         /*if(rm>=0) {
4148           if(!entry||entry[rm]!=mgr) {
4149             generate_map_const(offset,rm);
4150           } // else did it in the previous cycle
4151         }*/
4152         if(!entry||entry[ra]!=agr) {
4153           if (opcode[i]==0x22||opcode[i]==0x26) {
4154             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4155           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4156             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4157           }else{
4158             emit_movimm(offset,ra);
4159           }
4160         } // else did it in the previous cycle
4161       }
4162       else if(rs<0) {
4163         if(!entry||entry[ra]!=rs1[i])
4164           emit_loadreg(rs1[i],ra);
4165         //if(!entry||entry[ra]!=rs1[i])
4166         //  printf("poor load scheduling!\n");
4167       }
4168       else if(c) {
4169         if(rm>=0) {
4170           if(!entry||entry[rm]!=mgr) {
4171             if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a) {
4172               // Stores to memory go thru the mapper to detect self-modifying
4173               // code, loads don't.
4174               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
4175                  (unsigned int)(constmap[i][rs]+offset)<0x80000000+RAM_SIZE )
4176                 generate_map_const(constmap[i][rs]+offset,rm);
4177             }else{
4178               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
4179                 generate_map_const(constmap[i][rs]+offset,rm);
4180             }
4181           }
4182         }
4183         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
4184           if(!entry||entry[ra]!=agr) {
4185             if (opcode[i]==0x22||opcode[i]==0x26) {
4186               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4187             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4188               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4189             }else{
4190               #ifdef HOST_IMM_ADDR32
4191               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4192                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
4193               #endif
4194               emit_movimm(constmap[i][rs]+offset,ra);
4195             }
4196           } // else did it in the previous cycle
4197         } // else load_consts already did it
4198       }
4199       if(offset&&!c&&rs1[i]) {
4200         if(rs>=0) {
4201           emit_addimm(rs,offset,ra);
4202         }else{
4203           emit_addimm(ra,offset,ra);
4204         }
4205       }
4206     }
4207   }
4208   // Preload constants for next instruction
4209   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
4210     int agr,ra;
4211     #ifndef HOST_IMM_ADDR32
4212     // Mapper entry
4213     agr=MGEN1+((i+1)&1);
4214     ra=get_reg(i_regs->regmap,agr);
4215     if(ra>=0) {
4216       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4217       int offset=imm[i+1];
4218       int c=(regs[i+1].wasconst>>rs)&1;
4219       if(c) {
4220         if(itype[i+1]==STORE||itype[i+1]==STORELR
4221            ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1, SWC2/SDC2
4222           // Stores to memory go thru the mapper to detect self-modifying
4223           // code, loads don't.
4224           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
4225              (unsigned int)(constmap[i+1][rs]+offset)<0x80000000+RAM_SIZE )
4226             generate_map_const(constmap[i+1][rs]+offset,ra);
4227         }else{
4228           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
4229             generate_map_const(constmap[i+1][rs]+offset,ra);
4230         }
4231       }
4232       /*else if(rs1[i]==0) {
4233         generate_map_const(offset,ra);
4234       }*/
4235     }
4236     #endif
4237     // Actual address
4238     agr=AGEN1+((i+1)&1);
4239     ra=get_reg(i_regs->regmap,agr);
4240     if(ra>=0) {
4241       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4242       int offset=imm[i+1];
4243       int c=(regs[i+1].wasconst>>rs)&1;
4244       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
4245         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4246           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4247         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4248           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4249         }else{
4250           #ifdef HOST_IMM_ADDR32
4251           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4252              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
4253           #endif
4254           emit_movimm(constmap[i+1][rs]+offset,ra);
4255         }
4256       }
4257       else if(rs1[i+1]==0) {
4258         // Using r0 as a base address
4259         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4260           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4261         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4262           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4263         }else{
4264           emit_movimm(offset,ra);
4265         }
4266       }
4267     }
4268   }
4269 }
4270
4271 int get_final_value(int hr, int i, int *value)
4272 {
4273   int reg=regs[i].regmap[hr];
4274   while(i<slen-1) {
4275     if(regs[i+1].regmap[hr]!=reg) break;
4276     if(!((regs[i+1].isconst>>hr)&1)) break;
4277     if(bt[i+1]) break;
4278     i++;
4279   }
4280   if(i<slen-1) {
4281     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
4282       *value=constmap[i][hr];
4283       return 1;
4284     }
4285     if(!bt[i+1]) {
4286       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
4287         // Load in delay slot, out-of-order execution
4288         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
4289         {
4290           #ifdef HOST_IMM_ADDR32
4291           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
4292           #endif
4293           // Precompute load address
4294           *value=constmap[i][hr]+imm[i+2];
4295           return 1;
4296         }
4297       }
4298       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
4299       {
4300         #ifdef HOST_IMM_ADDR32
4301         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
4302         #endif
4303         // Precompute load address
4304         *value=constmap[i][hr]+imm[i+1];
4305         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
4306         return 1;
4307       }
4308     }
4309   }
4310   *value=constmap[i][hr];
4311   //printf("c=%x\n",(int)constmap[i][hr]);
4312   if(i==slen-1) return 1;
4313   if(reg<64) {
4314     return !((unneeded_reg[i+1]>>reg)&1);
4315   }else{
4316     return !((unneeded_reg_upper[i+1]>>reg)&1);
4317   }
4318 }
4319
4320 // Load registers with known constants
4321 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
4322 {
4323   int hr;
4324   // Load 32-bit regs
4325   for(hr=0;hr<HOST_REGS;hr++) {
4326     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4327       //if(entry[hr]!=regmap[hr]) {
4328       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4329         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4330           int value;
4331           if(get_final_value(hr,i,&value)) {
4332             if(value==0) {
4333               emit_zeroreg(hr);
4334             }
4335             else {
4336               emit_movimm(value,hr);
4337             }
4338           }
4339         }
4340       }
4341     }
4342   }
4343   // Load 64-bit regs
4344   for(hr=0;hr<HOST_REGS;hr++) {
4345     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4346       //if(entry[hr]!=regmap[hr]) {
4347       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4348         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4349           if((is32>>(regmap[hr]&63))&1) {
4350             int lr=get_reg(regmap,regmap[hr]-64);
4351             assert(lr>=0);
4352             emit_sarimm(lr,31,hr);
4353           }
4354           else
4355           {
4356             int value;
4357             if(get_final_value(hr,i,&value)) {
4358               if(value==0) {
4359                 emit_zeroreg(hr);
4360               }
4361               else {
4362                 emit_movimm(value,hr);
4363               }
4364             }
4365           }
4366         }
4367       }
4368     }
4369   }
4370 }
4371 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4372 {
4373   int hr;
4374   // Load 32-bit regs
4375   for(hr=0;hr<HOST_REGS;hr++) {
4376     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4377       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4378         int value=constmap[i][hr];
4379         if(value==0) {
4380           emit_zeroreg(hr);
4381         }
4382         else {
4383           emit_movimm(value,hr);
4384         }
4385       }
4386     }
4387   }
4388   // Load 64-bit regs
4389   for(hr=0;hr<HOST_REGS;hr++) {
4390     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4391       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4392         if((is32>>(regmap[hr]&63))&1) {
4393           int lr=get_reg(regmap,regmap[hr]-64);
4394           assert(lr>=0);
4395           emit_sarimm(lr,31,hr);
4396         }
4397         else
4398         {
4399           int value=constmap[i][hr];
4400           if(value==0) {
4401             emit_zeroreg(hr);
4402           }
4403           else {
4404             emit_movimm(value,hr);
4405           }
4406         }
4407       }
4408     }
4409   }
4410 }
4411
4412 // Write out all dirty registers (except cycle count)
4413 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4414 {
4415   int hr;
4416   for(hr=0;hr<HOST_REGS;hr++) {
4417     if(hr!=EXCLUDE_REG) {
4418       if(i_regmap[hr]>0) {
4419         if(i_regmap[hr]!=CCREG) {
4420           if((i_dirty>>hr)&1) {
4421             if(i_regmap[hr]<64) {
4422               emit_storereg(i_regmap[hr],hr);
4423 #ifndef FORCE32
4424               if( ((i_is32>>i_regmap[hr])&1) ) {
4425                 #ifdef DESTRUCTIVE_WRITEBACK
4426                 emit_sarimm(hr,31,hr);
4427                 emit_storereg(i_regmap[hr]|64,hr);
4428                 #else
4429                 emit_sarimm(hr,31,HOST_TEMPREG);
4430                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4431                 #endif
4432               }
4433 #endif
4434             }else{
4435               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4436                 emit_storereg(i_regmap[hr],hr);
4437               }
4438             }
4439           }
4440         }
4441       }
4442     }
4443   }
4444 }
4445 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4446 // This writes the registers not written by store_regs_bt
4447 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4448 {
4449   int hr;
4450   int t=(addr-start)>>2;
4451   for(hr=0;hr<HOST_REGS;hr++) {
4452     if(hr!=EXCLUDE_REG) {
4453       if(i_regmap[hr]>0) {
4454         if(i_regmap[hr]!=CCREG) {
4455           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4456             if((i_dirty>>hr)&1) {
4457               if(i_regmap[hr]<64) {
4458                 emit_storereg(i_regmap[hr],hr);
4459 #ifndef FORCE32
4460                 if( ((i_is32>>i_regmap[hr])&1) ) {
4461                   #ifdef DESTRUCTIVE_WRITEBACK
4462                   emit_sarimm(hr,31,hr);
4463                   emit_storereg(i_regmap[hr]|64,hr);
4464                   #else
4465                   emit_sarimm(hr,31,HOST_TEMPREG);
4466                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4467                   #endif
4468                 }
4469 #endif
4470               }else{
4471                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4472                   emit_storereg(i_regmap[hr],hr);
4473                 }
4474               }
4475             }
4476           }
4477         }
4478       }
4479     }
4480   }
4481 }
4482
4483 // Load all registers (except cycle count)
4484 void load_all_regs(signed char i_regmap[])
4485 {
4486   int hr;
4487   for(hr=0;hr<HOST_REGS;hr++) {
4488     if(hr!=EXCLUDE_REG) {
4489       if(i_regmap[hr]==0) {
4490         emit_zeroreg(hr);
4491       }
4492       else
4493       if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4494       {
4495         emit_loadreg(i_regmap[hr],hr);
4496       }
4497     }
4498   }
4499 }
4500
4501 // Load all current registers also needed by next instruction
4502 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4503 {
4504   int hr;
4505   for(hr=0;hr<HOST_REGS;hr++) {
4506     if(hr!=EXCLUDE_REG) {
4507       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4508         if(i_regmap[hr]==0) {
4509           emit_zeroreg(hr);
4510         }
4511         else
4512         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4513         {
4514           emit_loadreg(i_regmap[hr],hr);
4515         }
4516       }
4517     }
4518   }
4519 }
4520
4521 // Load all regs, storing cycle count if necessary
4522 void load_regs_entry(int t)
4523 {
4524   int hr;
4525   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG);
4526   else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG);
4527   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4528     emit_storereg(CCREG,HOST_CCREG);
4529   }
4530   // Load 32-bit regs
4531   for(hr=0;hr<HOST_REGS;hr++) {
4532     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4533       if(regs[t].regmap_entry[hr]==0) {
4534         emit_zeroreg(hr);
4535       }
4536       else if(regs[t].regmap_entry[hr]!=CCREG)
4537       {
4538         emit_loadreg(regs[t].regmap_entry[hr],hr);
4539       }
4540     }
4541   }
4542   // Load 64-bit regs
4543   for(hr=0;hr<HOST_REGS;hr++) {
4544     if(regs[t].regmap_entry[hr]>=64) {
4545       assert(regs[t].regmap_entry[hr]!=64);
4546       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4547         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4548         if(lr<0) {
4549           emit_loadreg(regs[t].regmap_entry[hr],hr);
4550         }
4551         else
4552         {
4553           emit_sarimm(lr,31,hr);
4554         }
4555       }
4556       else
4557       {
4558         emit_loadreg(regs[t].regmap_entry[hr],hr);
4559       }
4560     }
4561   }
4562 }
4563
4564 // Store dirty registers prior to branch
4565 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4566 {
4567   if(internal_branch(i_is32,addr))
4568   {
4569     int t=(addr-start)>>2;
4570     int hr;
4571     for(hr=0;hr<HOST_REGS;hr++) {
4572       if(hr!=EXCLUDE_REG) {
4573         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4574           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4575             if((i_dirty>>hr)&1) {
4576               if(i_regmap[hr]<64) {
4577                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4578                   emit_storereg(i_regmap[hr],hr);
4579                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4580                     #ifdef DESTRUCTIVE_WRITEBACK
4581                     emit_sarimm(hr,31,hr);
4582                     emit_storereg(i_regmap[hr]|64,hr);
4583                     #else
4584                     emit_sarimm(hr,31,HOST_TEMPREG);
4585                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4586                     #endif
4587                   }
4588                 }
4589               }else{
4590                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4591                   emit_storereg(i_regmap[hr],hr);
4592                 }
4593               }
4594             }
4595           }
4596         }
4597       }
4598     }
4599   }
4600   else
4601   {
4602     // Branch out of this block, write out all dirty regs
4603     wb_dirtys(i_regmap,i_is32,i_dirty);
4604   }
4605 }
4606
4607 // Load all needed registers for branch target
4608 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4609 {
4610   //if(addr>=start && addr<(start+slen*4))
4611   if(internal_branch(i_is32,addr))
4612   {
4613     int t=(addr-start)>>2;
4614     int hr;
4615     // Store the cycle count before loading something else
4616     if(i_regmap[HOST_CCREG]!=CCREG) {
4617       assert(i_regmap[HOST_CCREG]==-1);
4618     }
4619     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4620       emit_storereg(CCREG,HOST_CCREG);
4621     }
4622     // Load 32-bit regs
4623     for(hr=0;hr<HOST_REGS;hr++) {
4624       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4625         #ifdef DESTRUCTIVE_WRITEBACK
4626         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4627         #else
4628         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4629         #endif
4630           if(regs[t].regmap_entry[hr]==0) {
4631             emit_zeroreg(hr);
4632           }
4633           else if(regs[t].regmap_entry[hr]!=CCREG)
4634           {
4635             emit_loadreg(regs[t].regmap_entry[hr],hr);
4636           }
4637         }
4638       }
4639     }
4640     //Load 64-bit regs
4641     for(hr=0;hr<HOST_REGS;hr++) {
4642       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64) {
4643         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4644           assert(regs[t].regmap_entry[hr]!=64);
4645           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4646             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4647             if(lr<0) {
4648               emit_loadreg(regs[t].regmap_entry[hr],hr);
4649             }
4650             else
4651             {
4652               emit_sarimm(lr,31,hr);
4653             }
4654           }
4655           else
4656           {
4657             emit_loadreg(regs[t].regmap_entry[hr],hr);
4658           }
4659         }
4660         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4661           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4662           assert(lr>=0);
4663           emit_sarimm(lr,31,hr);
4664         }
4665       }
4666     }
4667   }
4668 }
4669
4670 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4671 {
4672   if(addr>=start && addr<start+slen*4-4)
4673   {
4674     int t=(addr-start)>>2;
4675     int hr;
4676     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4677     for(hr=0;hr<HOST_REGS;hr++)
4678     {
4679       if(hr!=EXCLUDE_REG)
4680       {
4681         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4682         {
4683           if(regs[t].regmap_entry[hr]!=-1)
4684           {
4685             return 0;
4686           }
4687           else 
4688           if((i_dirty>>hr)&1)
4689           {
4690             if(i_regmap[hr]<64)
4691             {
4692               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4693                 return 0;
4694             }
4695             else
4696             {
4697               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4698                 return 0;
4699             }
4700           }
4701         }
4702         else // Same register but is it 32-bit or dirty?
4703         if(i_regmap[hr]>=0)
4704         {
4705           if(!((regs[t].dirty>>hr)&1))
4706           {
4707             if((i_dirty>>hr)&1)
4708             {
4709               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4710               {
4711                 //printf("%x: dirty no match\n",addr);
4712                 return 0;
4713               }
4714             }
4715           }
4716           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4717           {
4718             //printf("%x: is32 no match\n",addr);
4719             return 0;
4720           }
4721         }
4722       }
4723     }
4724     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4725 #ifndef FORCE32
4726     if(requires_32bit[t]&~i_is32) return 0;
4727 #endif
4728     // Delay slots are not valid branch targets
4729     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4730     // Delay slots require additional processing, so do not match
4731     if(is_ds[t]) return 0;
4732   }
4733   else
4734   {
4735     int hr;
4736     for(hr=0;hr<HOST_REGS;hr++)
4737     {
4738       if(hr!=EXCLUDE_REG)
4739       {
4740         if(i_regmap[hr]>=0)
4741         {
4742           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4743           {
4744             if((i_dirty>>hr)&1)
4745             {
4746               return 0;
4747             }
4748           }
4749         }
4750       }
4751     }
4752   }
4753   return 1;
4754 }
4755
4756 // Used when a branch jumps into the delay slot of another branch
     //
     // i is the index of the branching instruction; its target ba[i] is the
     // instruction at index t below.  This emits a standalone copy of
     // instruction t and then a jump to the instruction that follows it
     // (ba[i]+4), which is asserted to lie inside the current block.
4757 void ds_assemble_entry(int i)
4758 {
     // Index of the target instruction (instructions are 4 bytes apart).
4759   int t=(ba[i]-start)>>2;
     // Record the current output position for t unless one is already known.
4760   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4761   assem_debug("Assemble delay slot at %x\n",ba[i]);
4762   assem_debug("<->\n");
     // CCREG is mapped to HOST_CCREG on entry to t but not in t's own mapping:
     // write it back before the register is reused.
4763   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4764     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
     // Load the source registers of t, then do its address generation.
4765   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4766   address_generation(t,&regs[t],regs[t].regmap_entry);
     // Stores (and the opcodes matched by the 0x3b mask) additionally need INVCP.
4767   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4768     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4769   cop1_usable=0;
4770   is_delayslot=0;
     // Dispatch to the assembler for t's instruction type.  There is no
     // default case: types not listed here emit nothing.
4771   switch(itype[t]) {
4772     case ALU:
4773       alu_assemble(t,&regs[t]);break;
4774     case IMM16:
4775       imm16_assemble(t,&regs[t]);break;
4776     case SHIFT:
4777       shift_assemble(t,&regs[t]);break;
4778     case SHIFTIMM:
4779       shiftimm_assemble(t,&regs[t]);break;
4780     case LOAD:
4781       load_assemble(t,&regs[t]);break;
4782     case LOADLR:
4783       loadlr_assemble(t,&regs[t]);break;
4784     case STORE:
4785       store_assemble(t,&regs[t]);break;
4786     case STORELR:
4787       storelr_assemble(t,&regs[t]);break;
4788     case COP0:
4789       cop0_assemble(t,&regs[t]);break;
4790     case COP1:
4791       cop1_assemble(t,&regs[t]);break;
4792     case C1LS:
4793       c1ls_assemble(t,&regs[t]);break;
4794     case COP2:
4795       cop2_assemble(t,&regs[t]);break;
4796     case C2LS:
4797       c2ls_assemble(t,&regs[t]);break;
4798     case C2OP:
4799       c2op_assemble(t,&regs[t]);break;
4800     case FCONV:
4801       fconv_assemble(t,&regs[t]);break;
4802     case FLOAT:
4803       float_assemble(t,&regs[t]);break;
4804     case FCOMP:
4805       fcomp_assemble(t,&regs[t]);break;
4806     case MULTDIV:
4807       multdiv_assemble(t,&regs[t]);break;
4808     case MOV:
4809       mov_assemble(t,&regs[t]);break;
     // Calls and jumps are not assembled here; only a diagnostic is printed.
4810     case SYSCALL:
4811     case HLECALL:
4812     case INTCALL:
4813     case SPAN:
4814     case UJUMP:
4815     case RJUMP:
4816     case CJUMP:
4817     case SJUMP:
4818     case FJUMP:
4819       printf("Jump in the delay slot.  This is probably a bug.\n");
4820   }
     // Prepare the register state for the continuation at ba[i]+4
     // (presumably write-back followed by reload to match that entry point --
     // see store_regs_bt/load_regs_bt).
4821   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4822   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4823   if(internal_branch(regs[t].is32,ba[i]+4))
4824     assem_debug("branch: internal\n");
4825   else
4826     assem_debug("branch: external\n");
     // The continuation must be inside this block.
4827   assert(internal_branch(regs[t].is32,ba[i]+4));
     // Register the jump emitted next as a link to ba[i]+4; the 0 target is a
     // placeholder (presumably patched when the link table is resolved).
4828   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4829   emit_jmp(0);
4830 }
4831
// Emit the cycle-count check for the branch at instruction index i and
// register a CC_STUB for the out-of-line path.
//
//   i        - index of the branch instruction in the current block
//   i_regmap - not referenced by this function
//   adj      - out: 0 when the target is outside the block, -1 when the
//              internal target is a delay slot, otherwise ccadj[] of the
//              target instruction
//   addr     - forwarded unchanged to add_stub
//   taken    - forwarded unchanged to add_stub; TAKEN also enables the
//              idle-loop form
//   invert   - nonzero selects the "add cycles now" form even when *adj!=0
//
// Three forms are emitted:
//   idle loop     - branch to itself with a zero word in the delay slot
//   add and test  - (*adj==0 or invert) add CLOCK_DIVIDER*(count+2) to
//                   HOST_CCREG and go to the stub when the result is not
//                   negative
//   compare only  - leave HOST_CCREG unchanged and compare it against
//                   -CLOCK_DIVIDER*(count+2); the stub receives count+2 so it
//                   can apply the cycles itself
4832 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4833 {
4834   int count;
4835   int jaddr;
4836   int idle=0;
     // Note: this assignment is always overwritten by the if/else below.
4837   if(itype[i]==RJUMP)
4838   {
4839     *adj=0;
4840   }
4841   //if(ba[i]>=start && ba[i]<(start+slen*4))
4842   if(internal_branch(branch_regs[i].is32,ba[i]))
4843   {
4844     int t=(ba[i]-start)>>2;
4845     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4846     else *adj=ccadj[t];
4847   }
4848   else
4849   {
4850     *adj=0;
4851   }
4852   count=ccadj[i];
4853   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4854     // Idle loop
4855     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
     // The stub returns to this point rather than to the end of the check.
4856     idle=(int)out;
4857     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4858     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4859     jaddr=(int)out;
4860     emit_jmp(0);
4861   }
4862   else if(*adj==0||invert) {
4863     emit_addimm_and_set_flags(CLOCK_DIVIDER*(count+2),HOST_CCREG);
4864     jaddr=(int)out;
4865     emit_jns(0);
4866   }
4867   else
4868   {
     // Scale by CLOCK_DIVIDER, matching the add form above and the amount the
     // stub applies (CLOCK_DIVIDER*stubs[n][3] in do_ccstub).  A literal 2
     // here only agreed with those when CLOCK_DIVIDER==2.
4869     emit_cmpimm(HOST_CCREG,-CLOCK_DIVIDER*(count+2));
4870     jaddr=(int)out;
4871     emit_jns(0);
4872   }
     // jaddr is the jump to be pointed at the stub; the stub returns to `idle`
     // when set, otherwise to the code emitted after this point.  count+2 is
     // handed to the stub only for the compare-only form.
4873   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4874 }
4875
// Emit the out-of-line cycle-count stub number n.
//
// Fields of stubs[n] read here (they line up with the add_stub(CC_STUB,...)
// argument order used by the callers in this file):
//   [1] location of the jump that must be pointed at this stub
//   [2] address the stub jumps back to when it is done
//   [3] cycle amount to apply around the cc_interrupt call (0 = none)
//   [4] index i of the branch instruction
//   [5] PC to store in pcaddr, or -1 when it depends on the branch outcome
//   [6] TAKEN / NOTTAKEN / NULLDS selector (any other value reloads all regs)
//
// Sequence: write back dirty registers, store the resume PC to pcaddr,
// call cc_interrupt, reload the registers the continuation needs, jump back.
4876 void do_ccstub(int n)
4877 {
4878   literal_pool(256);
4879   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
     // Point the jump recorded in the stub entry at the code emitted from here.
4880   set_jump_target(stubs[n][1],(int)out);
4881   int i=stubs[n][4];
     // Write back dirty registers; which register state applies depends on
     // the branch outcome this stub was created for.
4882   if(stubs[n][6]==NULLDS) {
4883     // Delay slot instruction is nullified ("likely" branch)
4884     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4885   }
4886   else if(stubs[n][6]!=TAKEN) {
4887     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4888   }
4889   else {
     // Taken branch: nothing is written back here when the target is external.
4890     if(internal_branch(branch_regs[i].is32,ba[i]))
4891       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4892   }
4893   if(stubs[n][5]!=-1)
4894   {
4895     // Save PC as return address
4896     emit_movimm(stubs[n][5],EAX);
4897     emit_writeword(EAX,(int)&pcaddr);
4898   }
4899   else
4900   {
4901     // Return address depends on which way the branch goes
4902     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4903     {
     // Host registers holding the low/high halves of the two compared
     // registers (the |64 lookups are the upper halves).
4904       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4905       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4906       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4907       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
     // Comparisons against register 0 become single-operand tests.
4908       if(rs1[i]==0)
4909       {
4910         s1l=s2l;s1h=s2h;
4911         s2l=s2h=-1;
4912       }
4913       else if(rs2[i]==0)
4914       {
4915         s2l=s2h=-1;
4916       }
     // Both operands are 32-bit: the upper halves are not compared.
4917       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4918         s1h=s2h=-1;
4919       }
4920       assert(s1l>=0);
4921       #ifdef DESTRUCTIVE_WRITEBACK
     // Reload operands that are both dirty and 32-bit (presumably because the
     // write-back above altered the host register -- confirm).
4922       if(rs1[i]) {
4923         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4924           emit_loadreg(rs1[i],s1l);
4925       } 
4926       else {
4927         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4928           emit_loadreg(rs2[i],s1l);
4929       }
4930       if(s2l>=0)
4931         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4932           emit_loadreg(rs2[i],s2l);
4933       #endif
     // Pick scratch host registers: not EXCLUDE_REG, not HOST_CCREG and not
     // mapped to either branch operand.  The first hit becomes addr, the next
     // alt, and (BLEZ/BGTZ only) a third becomes ntaddr.
     // NOTE(review): only the ntaddr search is followed by an assert; addr and
     // alt stay uninitialized if their loops run out of registers.
4934       int hr=0;
4935       int addr,alt,ntaddr;
4936       while(hr<HOST_REGS)
4937       {
4938         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4939            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4940            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4941         {
4942           addr=hr++;break;
4943         }
4944         hr++;
4945       }
4946       while(hr<HOST_REGS)
4947       {
4948         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4949            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4950            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4951         {
4952           alt=hr++;break;
4953         }
4954         hr++;
4955       }
4956       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4957       {
4958         while(hr<HOST_REGS)
4959         {
4960           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4961              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4962              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4963           {
4964             ntaddr=hr;break;
4965           }
4966           hr++;
4967         }
4968         assert(hr<HOST_REGS);
4969       }
     // Each case below leaves the resume PC in host register addr: either the
     // branch target ba[i] or start+i*4+8, the instruction after the delay
     // slot.
4970       if((opcode[i]&0x2f)==4) // BEQ
4971       {
4972         #ifdef HAVE_CMOV_IMM
4973         if(s1h<0) {
4974           if(s2l>=0) emit_cmp(s1l,s2l);
4975           else emit_test(s1l,s1l);
4976           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4977         }
4978         else
4979         #endif
4980         {
4981           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4982           if(s1h>=0) {
4983             if(s2h>=0) emit_cmp(s1h,s2h);
4984             else emit_test(s1h,s1h);
4985             emit_cmovne_reg(alt,addr);
4986           }
4987           if(s2l>=0) emit_cmp(s1l,s2l);
4988           else emit_test(s1l,s1l);
4989           emit_cmovne_reg(alt,addr);
4990         }
4991       }
4992       if((opcode[i]&0x2f)==5) // BNE
4993       {
4994         #ifdef HAVE_CMOV_IMM
4995         if(s1h<0) {
4996           if(s2l>=0) emit_cmp(s1l,s2l);
4997           else emit_test(s1l,s1l);
4998           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4999         }
5000         else
5001         #endif
5002         {
5003           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5004           if(s1h>=0) {
5005             if(s2h>=0) emit_cmp(s1h,s2h);
5006             else emit_test(s1h,s1h);
5007             emit_cmovne_reg(alt,addr);
5008           }
5009           if(s2l>=0) emit_cmp(s1l,s2l);
5010           else emit_test(s1l,s1l);
5011           emit_cmovne_reg(alt,addr);
5012         }
5013       }
5014       if((opcode[i]&0x2f)==6) // BLEZ
5015       {
5016         //emit_movimm(ba[i],alt);
5017         //emit_movimm(start+i*4+8,addr);
5018         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5019         emit_cmpimm(s1l,1);
5020         if(s1h>=0) emit_mov(addr,ntaddr);
5021         emit_cmovl_reg(alt,addr);
5022         if(s1h>=0) {
5023           emit_test(s1h,s1h);
5024           emit_cmovne_reg(ntaddr,addr);
5025           emit_cmovs_reg(alt,addr);
5026         }
5027       }
5028       if((opcode[i]&0x2f)==7) // BGTZ
5029       {
5030         //emit_movimm(ba[i],addr);
5031         //emit_movimm(start+i*4+8,ntaddr);
5032         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5033         emit_cmpimm(s1l,1);
5034         if(s1h>=0) emit_mov(addr,alt);
5035         emit_cmovl_reg(ntaddr,addr);
5036         if(s1h>=0) {
5037           emit_test(s1h,s1h);
5038           emit_cmovne_reg(alt,addr);
5039           emit_cmovs_reg(ntaddr,addr);
5040         }
5041       }
5042       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
5043       {
5044         //emit_movimm(ba[i],alt);
5045         //emit_movimm(start+i*4+8,addr);
5046         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5047         if(s1h>=0) emit_test(s1h,s1h);
5048         else emit_test(s1l,s1l);
5049         emit_cmovs_reg(alt,addr);
5050       }
5051       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
5052       {
5053         //emit_movimm(ba[i],addr);
5054         //emit_movimm(start+i*4+8,alt);
5055         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5056         if(s1h>=0) emit_test(s1h,s1h);
5057         else emit_test(s1l,s1l);
5058         emit_cmovs_reg(alt,addr);
5059       }
5060       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
5061         if(source[i]&0x10000) // BC1T
5062         {
5063           //emit_movimm(ba[i],alt);
5064           //emit_movimm(start+i*4+8,addr);
5065           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5066           emit_testimm(s1l,0x800000);
5067           emit_cmovne_reg(alt,addr);
5068         }
5069         else // BC1F
5070         {
5071           //emit_movimm(ba[i],addr);
5072           //emit_movimm(start+i*4+8,alt);
5073           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5074           emit_testimm(s1l,0x800000);
5075           emit_cmovne_reg(alt,addr);
5076         }
5077       }
5078       emit_writeword(addr,(int)&pcaddr);
5079     }
5080     else
5081     if(itype[i]==RJUMP)
5082     {
     // Register jump: the resume PC is the jump register itself, or its RTEMP
     // copy when the delay slot overwrites that register.
5083       int r=get_reg(branch_regs[i].regmap,rs1[i]);
5084       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5085         r=get_reg(branch_regs[i].regmap,RTEMP);
5086       }
5087       emit_writeword(r,(int)&pcaddr);
5088     }
5089     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
5090   }
5091   // Update cycle count
5092   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
     // Apply the pending cycles for the duration of the call and remove them
     // again afterwards, so the inline code sees the counter it expects.
5093   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
5094   emit_call((int)cc_interrupt);
5095   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
     // Reload whatever the continuation expects to find in host registers.
5096   if(stubs[n][6]==TAKEN) {
5097     if(internal_branch(branch_regs[i].is32,ba[i]))
5098       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
5099     else if(itype[i]==RJUMP) {
     // Restore the jump target: from pcaddr into RTEMP when that copy is
     // allocated, otherwise by reloading rs1 itself.
5100       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
5101         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
5102       else
5103         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
5104     }
5105   }else if(stubs[n][6]==NOTTAKEN) {
5106     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
5107     else load_all_regs(branch_regs[i].regmap);
5108   }else if(stubs[n][6]==NULLDS) {
5109     // Delay slot instruction is nullified ("likely" branch)
5110     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
5111     else load_all_regs(regs[i].regmap);
5112   }else{
5113     load_all_regs(branch_regs[i].regmap);
5114   }
5115   emit_jmp(stubs[n][2]); // return address
5116   
5117   /* This works but uses a lot of memory...
5118   emit_readword((int)&last_count,ECX);
5119   emit_add(HOST_CCREG,ECX,EAX);
5120   emit_writeword(EAX,(int)&Count);
5121   emit_call((int)gen_interupt);
5122   emit_readword((int)&Count,HOST_CCREG);
5123   emit_readword((int)&next_interupt,EAX);
5124   emit_readword((int)&pending_exception,EBX);
5125   emit_writeword(EAX,(int)&last_count);
5126   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
5127   emit_test(EBX,EBX);
5128   int jne_instr=(int)out;
5129   emit_jne(0);
5130   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
5131   load_all_regs(branch_regs[i].regmap);
5132   emit_jmp(stubs[n][2]); // return address
5133   set_jump_target(jne_instr,(int)out);
5134   emit_readword((int)&pcaddr,EAX);
5135   // Call get_addr_ht instead of doing the hash table here.
5136   // This code is executed infrequently and takes up a lot of space
5137   // so smaller is better.
5138   emit_storereg(CCREG,HOST_CCREG);
5139   emit_pushreg(EAX);
5140   emit_call((int)get_addr_ht);
5141   emit_loadreg(CCREG,HOST_CCREG);
5142   emit_addimm(ESP,4,ESP);
5143   emit_jmpreg(EAX);*/
5144 }
5145
// Append one entry to the link table: the location of an emitted jump
// (addr), the address it should reach (target), and the caller's
// internal/external flag (ext).  No bounds check is made on linkcount.
//
// The return type is spelled out as int: the definition previously relied on
// implicit int, which C99 removed.  It stays int rather than void so that it
// still agrees with the implicit declaration created by call sites that
// precede this definition.  The returned value is always 0 and carries no
// meaning.
5146 int add_to_linker(int addr,int target,int ext)
5147 {
5148   link_addr[linkcount][0]=addr;
5149   link_addr[linkcount][1]=target;
5150   link_addr[linkcount][2]=ext;  
5151   linkcount++;
       return 0;
5152 }
5153
// Assemble the unconditional jump at instruction index i together with its
// delay slot (i+1).  When rt1[i]==31 the return address start+i*4+8 (the
// instruction after the delay slot) is placed in the link register first.
//
//   i      - index of the jump instruction in the current block
//   i_regs - register state passed on to the delay-slot assembly; its regmap
//            is only consulted directly in REG_PREFETCH builds
5154 void ujump_assemble(int i,struct regstat *i_regs)
5155 {
5156   signed char *i_regmap=i_regs->regmap;
5157   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5158   address_generation(i+1,i_regs,regs[i].regmap_entry);
5159   #ifdef REG_PREFETCH
5160   int temp=get_reg(branch_regs[i].regmap,PTEMP);
5161   if(rt1[i]==31&&temp>=0) 
5162   {
5163     int return_address=start+i*4+8;
     // NOTE(review): this test is `>0` while the link-register check further
     // down uses `rt>=0`; host register 0 is treated differently -- confirm
     // that this is intended.
5164     if(get_reg(branch_regs[i].regmap,31)>0) 
5165     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5166   }
5167   #endif
5168   if(rt1[i]==31) {
5169     int rt;
5170     unsigned int return_address;
5171     rt=get_reg(branch_regs[i].regmap,31);
5172     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5173     //assert(rt>=0);
5174     return_address=start+i*4+8;
     // Nothing is emitted for the link when register 31 has no host register.
5175     if(rt>=0) {
5176       #ifdef USE_MINI_HT
     // Return address is inside this block and the delay slot does not write
     // register 31: take the mini hash table path when a temp is available.
5177       if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
     // This `temp` shadows the REG_PREFETCH `temp` declared above.
5178         int temp=-1; // note: must be ds-safe
5179         #ifdef HOST_TEMPREG
5180         temp=HOST_TEMPREG;
5181         #endif
5182         if(temp>=0) do_miniht_insert(return_address,rt,temp);
5183         else emit_movimm(return_address,rt);
5184       }
5185       else
5186       #endif
5187       {
5188         #ifdef REG_PREFETCH
5189         if(temp>=0) 
5190         {
5191           if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5192         }
5193         #endif
5194         emit_movimm(return_address,rt); // PC into link register
5195         #ifdef IMM_PREFETCH
5196         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5197         #endif
5198       }
5199     }
5200   }
     // The delay slot is assembled before the jump itself.
5201   ds_assemble(i+1,i_regs);
     // Register 0 and the link register are added to the "unneeded" masks
     // handed to wb_invalidate.
5202   uint64_t bc_unneeded=branch_regs[i].u;
5203   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5204   bc_unneeded|=1|(1LL<<rt1[i]);
5205   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5206   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5207                 bc_unneeded,bc_unneeded_upper);
5208   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5209   int cc,adj;
     // The cycle counter must be in its dedicated host register here.
5210   cc=get_reg(branch_regs[i].regmap,CCREG);
5211   assert(cc==HOST_CCREG);
5212   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5213   #ifdef REG_PREFETCH
5214   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5215   #endif
     // Emit the cycle check.  When do_cc reports a nonzero adj it did not add
     // the cycles itself (compare-only form), so they are added here less adj.
5216   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5217   if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5218   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5219   if(internal_branch(branch_regs[i].is32,ba[i]))
5220     assem_debug("branch: internal\n");
5221   else
5222     assem_debug("branch: external\n");
     // A target that is itself a delay slot gets its own copy of that
     // instruction; otherwise emit a placeholder jump and register it for
     // linking.
5223   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
5224     ds_assemble_entry(i);
5225   }
5226   else {
5227     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
5228     emit_jmp(0);
5229   }
5230 }
5231
// Assemble the register jump at instruction index i together with its delay
// slot (i+1).  The target is read from the host register mapped to rs1[i];
// when rt1[i] is nonzero the return address start+i*4+8 is written to it.
//
//   i      - index of the jump instruction in the current block
//   i_regs - register state passed on to the delay-slot assembly; its regmap
//            is only consulted directly in REG_PREFETCH builds
//
// NOTE(review): `temp` is assigned only when the delay slot overwrites rs1 or
// (REG_PREFETCH) when rt1[i]==31, yet the REG_PREFETCH code inside the
// rt1[i]!=0 block reads it unconditionally -- it looks like it can be read
// uninitialized for other link registers; confirm.
5232 void rjump_assemble(int i,struct regstat *i_regs)
5233 {
5234   signed char *i_regmap=i_regs->regmap;
5235   int temp;
5236   int rs,cc,adj;
     // Host register holding the jump target; it must be allocated.
5237   rs=get_reg(branch_regs[i].regmap,rs1[i]);
5238   assert(rs>=0);
5239   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5240     // Delay slot abuse, make a copy of the branch address register
5241     temp=get_reg(branch_regs[i].regmap,RTEMP);
5242     assert(temp>=0);
5243     assert(regs[i].regmap[temp]==RTEMP);
5244     emit_mov(rs,temp);
5245     rs=temp;
5246   }
5247   address_generation(i+1,i_regs,regs[i].regmap_entry);
5248   #ifdef REG_PREFETCH
5249   if(rt1[i]==31) 
5250   {
5251     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
5252       int return_address=start+i*4+8;
5253       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5254     }
5255   }
5256   #endif
5257   #ifdef USE_MINI_HT
     // Jump through register 31: preload RHASH before the delay slot when it
     // already has a host register in regs[i].
5258   if(rs1[i]==31) {
5259     int rh=get_reg(regs[i].regmap,RHASH);
5260     if(rh>=0) do_preload_rhash(rh);
5261   }
5262   #endif
     // The delay slot is assembled before the jump itself.
5263   ds_assemble(i+1,i_regs);
     // Register 0 and the link register are added to the "unneeded" masks; the
     // jump register is removed from the lower mask so it stays live.
5264   uint64_t bc_unneeded=branch_regs[i].u;
5265   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5266   bc_unneeded|=1|(1LL<<rt1[i]);
5267   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5268   bc_unneeded&=~(1LL<<rs1[i]);
5269   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5270                 bc_unneeded,bc_unneeded_upper);
5271   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
5272   if(rt1[i]!=0) {
     // Linking jump: the delay slot must not write the link register, and the
     // link register must have a host register.
5273     int rt,return_address;
5274     assert(rt1[i+1]!=rt1[i]);
5275     assert(rt2[i+1]!=rt1[i]);
5276     rt=get_reg(branch_regs[i].regmap,rt1[i]);
5277     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5278     assert(rt>=0);
5279     return_address=start+i*4+8;
5280     #ifdef REG_PREFETCH
5281     if(temp>=0) 
5282     {
5283       if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5284     }
5285     #endif
5286     emit_movimm(return_address,rt); // PC into link register
5287     #ifdef IMM_PREFETCH
5288     emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5289     #endif
5290   }
     // The cycle counter must be in its dedicated host register here.
5291   cc=get_reg(branch_regs[i].regmap,CCREG);
5292   assert(cc==HOST_CCREG);
5293   #ifdef USE_MINI_HT
5294   int rh=get_reg(branch_regs[i].regmap,RHASH);
5295   int ht=get_reg(branch_regs[i].regmap,RHTBL);
5296   if(rs1[i]==31) {
     // Preload RHASH now unless it was already mapped in regs[i].
5297     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
5298     do_preload_rhtbl(ht);
5299     do_rhash(rs,rh);
5300   }
5301   #endif
     // -1 is passed as the branch target address to store_regs_bt.
5302   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5303   #ifdef DESTRUCTIVE_WRITEBACK
     // Reload the jump register when it is dirty and 32-bit, unless the RTEMP
     // copy is being used instead.
5304   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
5305     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
5306       emit_loadreg(rs1[i],rs);
5307     }
5308   }
5309   #endif
5310   #ifdef REG_PREFETCH
5311   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5312   #endif
5313   #ifdef USE_MINI_HT
5314   if(rs1[i]==31) {
5315     do_miniht_load(ht,rh);
5316   }
5317   #endif
5318   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
5319   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
5320   //assert(adj==0);
     // Add this branch's cycles and register a CC_STUB for the jns emitted
     // next; the stub returns to jump_vaddr_reg[rs] rather than to inline code.
5321   emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5322   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
5323   emit_jns(0);
5324   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5325   #ifdef USE_MINI_HT
5326   if(rs1[i]==31) {
5327     do_miniht_jump(rs,rh,ht);
5328   }
5329   else
5330   #endif
5331   {
5332     //if(rs!=EAX) emit_mov(rs,EAX);
5333     //emit_jmp((int)jump_vaddr_eax);
     // Jump to the per-host-register entry point for the target held in rs.
5334     emit_jmp(jump_vaddr_reg[rs]);
5335   }
5336   /* Check hash table
5337   temp=!rs;
5338   emit_mov(rs,temp);
5339   emit_shrimm(rs,16,rs);
5340   emit_xor(temp,rs,rs);
5341   emit_movzwl_reg(rs,rs);
5342   emit_shlimm(rs,4,rs);
5343   emit_cmpmem_indexed((int)hash_table,rs,temp);
5344   emit_jne((int)out+14);
5345   emit_readword_indexed((int)hash_table+4,rs,rs);
5346   emit_jmpreg(rs);
5347   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
5348   emit_addimm_no_flags(8,rs);
5349   emit_jeq((int)out-17);
5350   // No hit on hash table, call compiler
5351   emit_pushreg(temp);
5352 //DEBUG >
5353 #ifdef DEBUG_CYCLE_COUNT
5354   emit_readword((int)&last_count,ECX);
5355   emit_add(HOST_CCREG,ECX,HOST_CCREG);
5356   emit_readword((int)&next_interupt,ECX);
5357   emit_writeword(HOST_CCREG,(int)&Count);
5358   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
5359   emit_writeword(ECX,(int)&last_count);
5360 #endif
5361 //DEBUG <
5362   emit_storereg(CCREG,HOST_CCREG);
5363   emit_call((int)get_addr);
5364   emit_loadreg(CCREG,HOST_CCREG);
5365   emit_addimm(ESP,4,ESP);
5366   emit_jmpreg(EAX);*/
5367   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
     // Pads with a register-to-itself move when the output pointer is not
     // 8-byte aligned; skipped when rt1[i]==31 and for the last two
     // instructions of the block.
5368   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5369   #endif
5370 }
5371
5372 void cjump_assemble(int i,struct regstat *i_regs)
5373 {
5374   signed char *i_regmap=i_regs->regmap;
5375   int cc;
5376   int match;
5377   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5378   assem_debug("match=%d\n",match);
5379   int s1h,s1l,s2h,s2l;
5380   int prev_cop1_usable=cop1_usable;
5381   int unconditional=0,nop=0;
5382   int only32=0;
5383   int invert=0;
5384   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5385   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5386   if(!match) invert=1;
5387   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5388   if(i>(ba[i]-start)>>2) invert=1;
5389   #endif
5390   
5391   if(ooo[i]) {
5392     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5393     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5394     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5395     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5396   }
5397   else {
5398     s1l=get_reg(i_regmap,rs1[i]);
5399     s1h=get_reg(i_regmap,rs1[i]|64);
5400     s2l=get_reg(i_regmap,rs2[i]);
5401     s2h=get_reg(i_regmap,rs2[i]|64);
5402   }
5403   if(rs1[i]==0&&rs2[i]==0)
5404   {
5405     if(opcode[i]&1) nop=1;
5406     else unconditional=1;
5407     //assert(opcode[i]!=5);
5408     //assert(opcode[i]!=7);
5409     //assert(opcode[i]!=0x15);
5410     //assert(opcode[i]!=0x17);
5411   }
5412   else if(rs1[i]==0)
5413   {
5414     s1l=s2l;s1h=s2h;
5415     s2l=s2h=-1;
5416     only32=(regs[i].was32>>rs2[i])&1;
5417   }
5418   else if(rs2[i]==0)
5419   {
5420     s2l=s2h=-1;
5421     only32=(regs[i].was32>>rs1[i])&1;
5422   }
5423   else {
5424     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5425   }
5426
5427   if(ooo[i]) {
5428     // Out of order execution (delay slot first)
5429     //printf("OOOE\n");
5430     address_generation(i+1,i_regs,regs[i].regmap_entry);
5431     ds_assemble(i+1,i_regs);
5432     int adj;
5433     uint64_t bc_unneeded=branch_regs[i].u;
5434     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5435     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5436     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5437     bc_unneeded|=1;
5438     bc_unneeded_upper|=1;
5439     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5440                   bc_unneeded,bc_unneeded_upper);
5441     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5442     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5443     cc=get_reg(branch_regs[i].regmap,CCREG);
5444     assert(cc==HOST_CCREG);
5445     if(unconditional) 
5446       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5447     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5448     //assem_debug("cycle count (adj)\n");
5449     if(unconditional) {
5450       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5451       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5452         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5453         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5454         if(internal)
5455           assem_debug("branch: internal\n");
5456         else
5457           assem_debug("branch: external\n");
5458         if(internal&&is_ds[(ba[i]-start)>>2]) {
5459           ds_assemble_entry(i);
5460         }
5461         else {
5462           add_to_linker((int)out,ba[i],internal);
5463           emit_jmp(0);
5464         }
5465         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5466         if(((u_int)out)&7) emit_addnop(0);
5467         #endif
5468       }
5469     }
5470     else if(nop) {
5471       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5472       int jaddr=(int)out;
5473       emit_jns(0);
5474       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5475     }
5476     else {
5477       int taken=0,nottaken=0,nottaken1=0;
5478       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5479       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5480       if(!only32)
5481       {
5482         assert(s1h>=0);
5483         if(opcode[i]==4) // BEQ
5484         {
5485           if(s2h>=0) emit_cmp(s1h,s2h);
5486           else emit_test(s1h,s1h);
5487           nottaken1=(int)out;
5488           emit_jne(1);
5489         }
5490         if(opcode[i]==5) // BNE
5491         {
5492           if(s2h>=0) emit_cmp(s1h,s2h);
5493           else emit_test(s1h,s1h);
5494           if(invert) taken=(int)out;
5495           else add_to_linker((int)out,ba[i],internal);
5496           emit_jne(0);
5497         }
5498         if(opcode[i]==6) // BLEZ
5499         {
5500           emit_test(s1h,s1h);
5501           if(invert) taken=(int)out;
5502           else add_to_linker((int)out,ba[i],internal);
5503           emit_js(0);
5504           nottaken1=(int)out;
5505           emit_jne(1);
5506         }
5507         if(opcode[i]==7) // BGTZ
5508         {
5509           emit_test(s1h,s1h);
5510           nottaken1=(int)out;
5511           emit_js(1);
5512           if(invert) taken=(int)out;
5513           else add_to_linker((int)out,ba[i],internal);
5514           emit_jne(0);
5515         }
5516       } // if(!only32)
5517           
5518       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5519       assert(s1l>=0);
5520       if(opcode[i]==4) // BEQ
5521       {
5522         if(s2l>=0) emit_cmp(s1l,s2l);
5523         else emit_test(s1l,s1l);
5524         if(invert){
5525           nottaken=(int)out;
5526           emit_jne(1);
5527         }else{
5528           add_to_linker((int)out,ba[i],internal);
5529           emit_jeq(0);
5530         }
5531       }
5532       if(opcode[i]==5) // BNE
5533       {
5534         if(s2l>=0) emit_cmp(s1l,s2l);
5535         else emit_test(s1l,s1l);
5536         if(invert){
5537           nottaken=(int)out;
5538           emit_jeq(1);
5539         }else{
5540           add_to_linker((int)out,ba[i],internal);
5541           emit_jne(0);
5542         }
5543       }
5544       if(opcode[i]==6) // BLEZ
5545       {
5546         emit_cmpimm(s1l,1);
5547         if(invert){
5548           nottaken=(int)out;
5549           emit_jge(1);
5550         }else{
5551           add_to_linker((int)out,ba[i],internal);
5552           emit_jl(0);
5553         }
5554       }
5555       if(opcode[i]==7) // BGTZ
5556       {
5557         emit_cmpimm(s1l,1);
5558         if(invert){
5559           nottaken=(int)out;
5560           emit_jl(1);
5561         }else{
5562           add_to_linker((int)out,ba[i],internal);
5563           emit_jge(0);
5564         }
5565       }
5566       if(invert) {
5567         if(taken) set_jump_target(taken,(int)out);
5568         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5569         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5570           if(adj) {
5571             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5572             add_to_linker((int)out,ba[i],internal);
5573           }else{
5574             emit_addnop(13);
5575             add_to_linker((int)out,ba[i],internal*2);
5576           }
5577           emit_jmp(0);
5578         }else
5579         #endif
5580         {
5581           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5582           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5583           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5584           if(internal)
5585             assem_debug("branch: internal\n");
5586           else
5587             assem_debug("branch: external\n");
5588           if(internal&&is_ds[(ba[i]-start)>>2]) {
5589             ds_assemble_entry(i);
5590           }
5591           else {
5592             add_to_linker((int)out,ba[i],internal);
5593             emit_jmp(0);
5594           }
5595         }
5596         set_jump_target(nottaken,(int)out);
5597       }
5598
5599       if(nottaken1) set_jump_target(nottaken1,(int)out);
5600       if(adj) {
5601         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5602       }
5603     } // (!unconditional)
5604   } // if(ooo)
5605   else
5606   {
5607     // In-order execution (branch first)
5608     //if(likely[i]) printf("IOL\n");
5609     //else
5610     //printf("IOE\n");
5611     int taken=0,nottaken=0,nottaken1=0;
5612     if(!unconditional&&!nop) {
5613       if(!only32)
5614       {
5615         assert(s1h>=0);
5616         if((opcode[i]&0x2f)==4) // BEQ
5617         {
5618           if(s2h>=0) emit_cmp(s1h,s2h);
5619           else emit_test(s1h,s1h);
5620           nottaken1=(int)out;
5621           emit_jne(2);
5622         }
5623         if((opcode[i]&0x2f)==5) // BNE
5624         {
5625           if(s2h>=0) emit_cmp(s1h,s2h);
5626           else emit_test(s1h,s1h);
5627           taken=(int)out;
5628           emit_jne(1);
5629         }
5630         if((opcode[i]&0x2f)==6) // BLEZ
5631         {
5632           emit_test(s1h,s1h);
5633           taken=(int)out;
5634           emit_js(1);
5635           nottaken1=(int)out;
5636           emit_jne(2);
5637         }
5638         if((opcode[i]&0x2f)==7) // BGTZ
5639         {
5640           emit_test(s1h,s1h);
5641           nottaken1=(int)out;
5642           emit_js(2);
5643           taken=(int)out;
5644           emit_jne(1);
5645         }
5646       } // if(!only32)
5647           
5648       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5649       assert(s1l>=0);
5650       if((opcode[i]&0x2f)==4) // BEQ
5651       {
5652         if(s2l>=0) emit_cmp(s1l,s2l);
5653         else emit_test(s1l,s1l);
5654         nottaken=(int)out;
5655         emit_jne(2);
5656       }
5657       if((opcode[i]&0x2f)==5) // BNE
5658       {
5659         if(s2l>=0) emit_cmp(s1l,s2l);
5660         else emit_test(s1l,s1l);
5661         nottaken=(int)out;
5662         emit_jeq(2);
5663       }
5664       if((opcode[i]&0x2f)==6) // BLEZ
5665       {
5666         emit_cmpimm(s1l,1);
5667         nottaken=(int)out;
5668         emit_jge(2);
5669       }
5670       if((opcode[i]&0x2f)==7) // BGTZ
5671       {
5672         emit_cmpimm(s1l,1);
5673         nottaken=(int)out;
5674         emit_jl(2);
5675       }
5676     } // if(!unconditional)
5677     int adj;
5678     uint64_t ds_unneeded=branch_regs[i].u;
5679     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5680     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5681     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5682     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5683     ds_unneeded|=1;
5684     ds_unneeded_upper|=1;
5685     // branch taken
5686     if(!nop) {
5687       if(taken) set_jump_target(taken,(int)out);
5688       assem_debug("1:\n");
5689       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5690                     ds_unneeded,ds_unneeded_upper);
5691       // load regs
5692       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5693       address_generation(i+1,&branch_regs[i],0);
5694       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5695       ds_assemble(i+1,&branch_regs[i]);
5696       cc=get_reg(branch_regs[i].regmap,CCREG);
5697       if(cc==-1) {
5698         emit_loadreg(CCREG,cc=HOST_CCREG);
5699         // CHECK: Is the following instruction (fall thru) allocated ok?
5700       }
5701       assert(cc==HOST_CCREG);
5702       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5703       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5704       assem_debug("cycle count (adj)\n");
5705       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5706       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5707       if(internal)
5708         assem_debug("branch: internal\n");
5709       else
5710         assem_debug("branch: external\n");
5711       if(internal&&is_ds[(ba[i]-start)>>2]) {
5712         ds_assemble_entry(i);
5713       }
5714       else {
5715         add_to_linker((int)out,ba[i],internal);
5716         emit_jmp(0);
5717       }
5718     }
5719     // branch not taken
5720     cop1_usable=prev_cop1_usable;
5721     if(!unconditional) {
5722       if(nottaken1) set_jump_target(nottaken1,(int)out);
5723       set_jump_target(nottaken,(int)out);
5724       assem_debug("2:\n");
5725       if(!likely[i]) {
5726         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5727                       ds_unneeded,ds_unneeded_upper);
5728         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5729         address_generation(i+1,&branch_regs[i],0);
5730         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5731         ds_assemble(i+1,&branch_regs[i]);
5732       }
5733       cc=get_reg(branch_regs[i].regmap,CCREG);
5734       if(cc==-1&&!likely[i]) {
5735         // Cycle count isn't in a register, temporarily load it then write it out
5736         emit_loadreg(CCREG,HOST_CCREG);
5737         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5738         int jaddr=(int)out;
5739         emit_jns(0);
5740         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5741         emit_storereg(CCREG,HOST_CCREG);
5742       }
5743       else{
5744         cc=get_reg(i_regmap,CCREG);
5745         assert(cc==HOST_CCREG);
5746         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5747         int jaddr=(int)out;
5748         emit_jns(0);
5749         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5750       }
5751     }
5752   }
5753 }
5754
// Emit host code for guest branch instruction i, a branch that compares the
// single source register rs1[i] against zero (BLTZ/BGEZ and their linking and
// "likely" variants, distinguished by opcode2[i]).
//
//   i      - index of the branch in the block being compiled; instruction i+1
//            is its delay slot and start+i*4+8 is the address after the slot.
//   i_regs - register state at the branch; its regmap is used to locate host
//            registers on the in-order path.
//
// Two code layouts are produced:
//   * ooo[i] set: the delay slot is assembled first, then the branch test.
//   * otherwise:  the test is emitted first and the delay slot is assembled
//                 separately on the taken and the not-taken path (the
//                 not-taken copy is omitted when likely[i] is set).
// With rs1[i]==0 the outcome is decided at compile time: opcode2[i]&1 means
// "always taken", otherwise "never taken".
// When rt1[i]==31 the return address start+i*4+8 is written to the host
// register mapped to guest register 31, whether or not the branch is taken.
5755 void sjump_assemble(int i,struct regstat *i_regs)
5756 {
5757   signed char *i_regmap=i_regs->regmap;
5758   int cc;
5759   int match;
       // match_bt() result for the branch target; when it is zero the inverted
       // layout (see 'invert' below) is used.
5760   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5761   assem_debug("smatch=%d\n",match);
5762   int s1h,s1l;
       // Saved so it can be restored before the not-taken path is assembled.
5763   int prev_cop1_usable=cop1_usable;
5764   int unconditional=0,nevertaken=0;
5765   int only32=0;
       // invert=1: emit the opposite condition to hop over an inline taken path
       // (register fix-up + jump) instead of a direct conditional jump to ba[i].
5766   int invert=0;
5767   int internal=internal_branch(branch_regs[i].is32,ba[i]);
       // The branch targets itself.
5768   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5769   if(!match) invert=1;
5770   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
       // Branches whose target index lies before i always use the inverted layout.
5771   if(i>(ba[i]-start)>>2) invert=1;
5772   #endif
5773
5774   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5775   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5776
       // Host registers holding the tested operand: s1l for rs1 itself, s1h for
       // rs1|64 (presumably the upper 32 bits of a 64-bit guest register - confirm).
       // The out-of-order layout tests after the delay slot, so it looks the
       // registers up in the post-branch mapping.
5777   if(ooo[i]) {
5778     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5779     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5780   }
5781   else {
5782     s1l=get_reg(i_regmap,rs1[i]);
5783     s1h=get_reg(i_regmap,rs1[i]|64);
5784   }
5785   if(rs1[i]==0)
5786   {
5787     if(opcode2[i]&1) unconditional=1;
5788     else nevertaken=1;
5789     // These are never taken (r0 is never less than zero)
5790     //assert(opcode2[i]!=0);
5791     //assert(opcode2[i]!=2);
5792     //assert(opcode2[i]!=0x10);
5793     //assert(opcode2[i]!=0x12);
5794   }
5795   else {
         // Bit rs1 of was32 set: only s1l is tested below, s1h is not needed.
5796     only32=(regs[i].was32>>rs1[i])&1;
5797   }
5798
5799   if(ooo[i]) {
5800     // Out of order execution (delay slot first)
5801     //printf("OOOE\n");
5802     address_generation(i+1,i_regs,regs[i].regmap_entry);
5803     ds_assemble(i+1,i_regs);
5804     int adj;
         // Start from the branch's unneeded masks, clear the bits of the registers
         // the branch itself reads, and always set bit 0.
5805     uint64_t bc_unneeded=branch_regs[i].u;
5806     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5807     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5808     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5809     bc_unneeded|=1;
5810     bc_unneeded_upper|=1;
5811     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5812                   bc_unneeded,bc_unneeded_upper);
5813     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5814     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
         // Linking variant: store the return address if guest register 31 is mapped.
5815     if(rt1[i]==31) {
5816       int rt,return_address;
5817       rt=get_reg(branch_regs[i].regmap,31);
5818       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5819       if(rt>=0) {
5820         // Save the PC even if the branch is not taken
5821         return_address=start+i*4+8;
5822         emit_movimm(return_address,rt); // PC into link register
5823         #ifdef IMM_PREFETCH
5824         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5825         #endif
5826       }
5827     }
         // This layout requires the cycle count to live in HOST_CCREG.
5828     cc=get_reg(branch_regs[i].regmap,CCREG);
5829     assert(cc==HOST_CCREG);
5830     if(unconditional) 
5831       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5832     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5833     assem_debug("cycle count (adj)\n");
5834     if(unconditional) {
5835       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
           // Nothing further is emitted for a self-branch whose delay slot word is 0
           // (the "idle loop" case above).
           // NOTE(review): presumably do_cc() emits everything needed for that case - confirm.
5836       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5837         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5838         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5839         if(internal)
5840           assem_debug("branch: internal\n");
5841         else
5842           assem_debug("branch: external\n");
             // If the target is itself marked in is_ds[], go through
             // ds_assemble_entry(); otherwise emit a jump and register it with the linker.
5843         if(internal&&is_ds[(ba[i]-start)>>2]) {
5844           ds_assemble_entry(i);
5845         }
5846         else {
5847           add_to_linker((int)out,ba[i],internal);
5848           emit_jmp(0);
5849         }
5850         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
             // Pad when the output pointer is not a multiple of 8.
5851         if(((u_int)out)&7) emit_addnop(0);
5852         #endif
5853       }
5854     }
5855     else if(nevertaken) {
           // Only the cycle accounting is emitted; the jns below is registered as a CC_STUB.
5856       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5857       int jaddr=(int)out;
5858       emit_jns(0);
5859       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5860     }
5861     else {
           // Conditional branch. 'nottaken' records the address of the inverted
           // jump so it can be pointed past the inline taken path later.
5862       int nottaken=0;
5863       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5864       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5865       if(!only32)
5866       {
             // The sign is tested on s1h.
5867         assert(s1h>=0);
5868         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5869         {
5870           emit_test(s1h,s1h);
5871           if(invert){
5872             nottaken=(int)out;
5873             emit_jns(1);
5874           }else{
5875             add_to_linker((int)out,ba[i],internal);
5876             emit_js(0);
5877           }
5878         }
5879         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5880         {
5881           emit_test(s1h,s1h);
5882           if(invert){
5883             nottaken=(int)out;
5884             emit_js(1);
5885           }else{
5886             add_to_linker((int)out,ba[i],internal);
5887             emit_jns(0);
5888           }
5889         }
5890       } // if(!only32)
5891       else
5892       {
             // only32: the sign is tested on s1l.
5893         assert(s1l>=0);
5894         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5895         {
5896           emit_test(s1l,s1l);
5897           if(invert){
5898             nottaken=(int)out;
5899             emit_jns(1);
5900           }else{
5901             add_to_linker((int)out,ba[i],internal);
5902             emit_js(0);
5903           }
5904         }
5905         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5906         {
5907           emit_test(s1l,s1l);
5908           if(invert){
5909             nottaken=(int)out;
5910             emit_js(1);
5911           }else{
5912             add_to_linker((int)out,ba[i],internal);
5913             emit_jns(0);
5914           }
5915         }
5916       } // else (only32)
5917           
           // Inverted layout: the taken path is emitted inline here and the
           // inverted jump recorded in 'nottaken' is pointed just past it.
5918       if(invert) {
5919         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5920         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5921           if(adj) {
5922             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5923             add_to_linker((int)out,ba[i],internal);
5924           }else{
5925             emit_addnop(13);
5926             add_to_linker((int)out,ba[i],internal*2);
5927           }
5928           emit_jmp(0);
5929         }else
5930         #endif
5931         {
5932           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5933           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5934           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5935           if(internal)
5936             assem_debug("branch: internal\n");
5937           else
5938             assem_debug("branch: external\n");
5939           if(internal&&is_ds[(ba[i]-start)>>2]) {
5940             ds_assemble_entry(i);
5941           }
5942           else {
5943             add_to_linker((int)out,ba[i],internal);
5944             emit_jmp(0);
5945           }
5946         }
5947         set_jump_target(nottaken,(int)out);
5948       }
5949
           // Non-inverted layout: add adj back on the fall-through path.
5950       if(adj) {
5951         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5952       }
5953     } // else (conditional branch)
5954   } // if(ooo)
5955   else
5956   {
5957     // In-order execution (branch first)
5958     //printf("IOE\n");
         // Address of the conditional jump that skips the taken path (0 if none emitted).
5959     int nottaken=0;
5960     if(rt1[i]==31) {
5961       int rt,return_address;
5962       rt=get_reg(branch_regs[i].regmap,31);
5963       if(rt>=0) {
5964         // Save the PC even if the branch is not taken
5965         return_address=start+i*4+8;
5966         emit_movimm(return_address,rt); // PC into link register
5967         #ifdef IMM_PREFETCH
5968         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5969         #endif
5970       }
5971     }
         // Emit the inverse of the branch condition: it jumps to the not-taken path.
5972     if(!unconditional) {
5973       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5974       if(!only32)
5975       {
5976         assert(s1h>=0);
5977         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5978         {
5979           emit_test(s1h,s1h);
5980           nottaken=(int)out;
5981           emit_jns(1);
5982         }
5983         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5984         {
5985           emit_test(s1h,s1h);
5986           nottaken=(int)out;
5987           emit_js(1);
5988         }
5989       } // if(!only32)
5990       else
5991       {
5992         assert(s1l>=0);
5993         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5994         {
5995           emit_test(s1l,s1l);
5996           nottaken=(int)out;
5997           emit_jns(1);
5998         }
5999         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6000         {
6001           emit_test(s1l,s1l);
6002           nottaken=(int)out;
6003           emit_js(1);
6004         }
6005       }
6006     } // if(!unconditional)
6007     int adj;
         // Unneeded masks for the delay slot: start from the branch's masks, clear
         // the bits of the registers instruction i+1 reads, and always set bit 0.
6008     uint64_t ds_unneeded=branch_regs[i].u;
6009     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6010     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6011     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
         // If the upper half of the delay slot's destination is needed, so are
         // the upper halves of dep1/dep2 of that instruction.
6012     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6013     ds_unneeded|=1;
6014     ds_unneeded_upper|=1;
6015     // branch taken
6016     if(!nevertaken) {
6017       //assem_debug("1:\n");
6018       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6019                     ds_unneeded,ds_unneeded_upper);
6020       // load regs
6021       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6022       address_generation(i+1,&branch_regs[i],0);
6023       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6024       ds_assemble(i+1,&branch_regs[i]);
6025       cc=get_reg(branch_regs[i].regmap,CCREG);
6026       if(cc==-1) {
             // Cycle count is not mapped after the branch: load it into HOST_CCREG.
6027         emit_loadreg(CCREG,cc=HOST_CCREG);
6028         // CHECK: Is the following instruction (fall thru) allocated ok?
6029       }
6030       assert(cc==HOST_CCREG);
6031       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6032       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6033       assem_debug("cycle count (adj)\n");
6034       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6035       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6036       if(internal)
6037         assem_debug("branch: internal\n");
6038       else
6039         assem_debug("branch: external\n");
6040       if(internal&&is_ds[(ba[i]-start)>>2]) {
6041         ds_assemble_entry(i);
6042       }
6043       else {
6044         add_to_linker((int)out,ba[i],internal);
6045         emit_jmp(0);
6046       }
6047     }
6048     // branch not taken
         // Restore the value saved at entry before assembling this path.
6049     cop1_usable=prev_cop1_usable;
6050     if(!unconditional) {
6051       set_jump_target(nottaken,(int)out);
6052       assem_debug("1:\n");
           // "Likely" branches do not get a delay-slot copy on the not-taken path.
6053       if(!likely[i]) {
6054         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6055                       ds_unneeded,ds_unneeded_upper);
6056         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6057         address_generation(i+1,&branch_regs[i],0);
6058         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6059         ds_assemble(i+1,&branch_regs[i]);
6060       }
6061       cc=get_reg(branch_regs[i].regmap,CCREG);
6062       if(cc==-1&&!likely[i]) {
6063         // Cycle count isn't in a register, temporarily load it then write it out
6064         emit_loadreg(CCREG,HOST_CCREG);
6065         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6066         int jaddr=(int)out;
6067         emit_jns(0);
6068         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6069         emit_storereg(CCREG,HOST_CCREG);
6070       }
6071       else{
             // Uses the mapping from before the branch (i_regmap), which must hold
             // the cycle count in HOST_CCREG. The stub is tagged NULLDS for likely
             // branches and NOTTAKEN otherwise.
6072         cc=get_reg(i_regmap,CCREG);
6073         assert(cc==HOST_CCREG);
6074         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
6075         int jaddr=(int)out;
6076         emit_jns(0);
6077         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6078       }
6079     }
6080   }
6081 }
6082
// Emit host code for guest branch instruction i, a branch on the floating
// point condition flag (BC1T when source[i]&0x10000 is set, BC1F otherwise).
// The condition is read by testing bit 0x800000 of the host register mapped
// to FSREG.
//
//   i      - index of the branch in the block being compiled; instruction i+1
//            is its delay slot and start+i*4+8 is the address after the slot.
//   i_regs - register state at the branch; its regmap is used to locate FSREG
//            (in-order path) and CSREG, and it is passed on to the FP_STUB.
//
// Before the branch itself, a coprocessor-usable check is emitted unless
// cop1_usable is already set. As in the other branch assemblers there are two
// layouts: ooo[i] (delay slot first) and in-order (test first, delay slot
// assembled on both the taken and, unless likely[i], the not-taken path).
6083 void fjump_assemble(int i,struct regstat *i_regs)
6084 {
6085   signed char *i_regmap=i_regs->regmap;
6086   int cc;
6087   int match;
       // match_bt() result for the branch target; zero forces the inverted layout.
6088   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6089   assem_debug("fmatch=%d\n",match);
6090   int fs,cs;
6091   int eaddr;
       // invert=1: emit the opposite condition to hop over an inline taken path
       // instead of a direct conditional jump to ba[i].
6092   int invert=0;
6093   int internal=internal_branch(branch_regs[i].is32,ba[i]);
       // The branch targets itself.
6094   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
6095   if(!match) invert=1;
6096   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
       // Branches whose target index lies before i always use the inverted layout.
6097   if(i>(ba[i]-start)>>2) invert=1;
6098   #endif
6099
       // Out-of-order code tests the flag after the delay slot, so it looks FSREG
       // up in the post-branch mapping.
6100   if(ooo[i]) {
6101     fs=get_reg(branch_regs[i].regmap,FSREG);
6102     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
6103   }
6104   else {
6105     fs=get_reg(i_regmap,FSREG);
6106   }
6107
6108   // Check cop1 unusable
       // Tests bit 0x20000000 of the register mapped to CSREG; the jeq emitted
       // here is registered as an FP_STUB. cop1_usable is then set to 1.
6109   if(!cop1_usable) {
6110     cs=get_reg(i_regmap,CSREG);
6111     assert(cs>=0);
6112     emit_testimm(cs,0x20000000);
6113     eaddr=(int)out;
6114     emit_jeq(0);
6115     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
6116     cop1_usable=1;
6117   }
6118
6119   if(ooo[i]) {
6120     // Out of order execution (delay slot first)
6121     //printf("OOOE\n");
6122     ds_assemble(i+1,i_regs);
6123     int adj;
         // Start from the branch's unneeded masks, clear the bits of the registers
         // the branch itself reads, and always set bit 0.
6124     uint64_t bc_unneeded=branch_regs[i].u;
6125     uint64_t bc_unneeded_upper=branch_regs[i].uu;
6126     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6127     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
6128     bc_unneeded|=1;
6129     bc_unneeded_upper|=1;
6130     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6131                   bc_unneeded,bc_unneeded_upper);
6132     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
6133     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
         // This layout requires the cycle count to live in HOST_CCREG.
6134     cc=get_reg(branch_regs[i].regmap,CCREG);
6135     assert(cc==HOST_CCREG);
6136     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
6137     assem_debug("cycle count (adj)\n");
         // The if(1) wrappers below are always-true scaffolding.
6138     if(1) {
           // Address of the inverted jump, pointed past the inline taken path later.
6139       int nottaken=0;
6140       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6141       if(1) {
6142         assert(fs>=0);
6143         emit_testimm(fs,0x800000);
6144         if(source[i]&0x10000) // BC1T
6145         {
6146           if(invert){
6147             nottaken=(int)out;
6148             emit_jeq(1);
6149           }else{
6150             add_to_linker((int)out,ba[i],internal);
6151             emit_jne(0);
6152           }
6153         }
6154         else // BC1F
               // The if/else that follows is the body of this 'else'.
6155           if(invert){
6156             nottaken=(int)out;
6157             emit_jne(1);
6158           }else{
6159             add_to_linker((int)out,ba[i],internal);
6160             emit_jeq(0);
6161           }
             // Stray empty block; it has no effect.
6162         {
6163         }
6164       } // if(1)
6165           
           // Inverted layout: the taken path is emitted inline here and the
           // inverted jump recorded in 'nottaken' is pointed just past it.
6166       if(invert) {
6167         if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
6168         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6169         else if(match) emit_addnop(13);
6170         #endif
6171         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6172         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6173         if(internal)
6174           assem_debug("branch: internal\n");
6175         else
6176           assem_debug("branch: external\n");
             // If the target is itself marked in is_ds[], go through
             // ds_assemble_entry(); otherwise emit a jump and register it with the linker.
6177         if(internal&&is_ds[(ba[i]-start)>>2]) {
6178           ds_assemble_entry(i);
6179         }
6180         else {
6181           add_to_linker((int)out,ba[i],internal);
6182           emit_jmp(0);
6183         }
6184         set_jump_target(nottaken,(int)out);
6185       }
6186
           // Non-inverted layout: add adj back on the fall-through path.
6187       if(adj) {
6188         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
6189       }
6190     } // if(1)
6191   } // if(ooo)
6192   else
6193   {
6194     // In-order execution (branch first)
6195     //printf("IOE\n");
         // Address of the conditional jump that skips the taken path.
6196     int nottaken=0;
6197     if(1) {
6198       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6199       if(1) {
             // Emit the inverse of the branch condition: it jumps to the not-taken path.
6200         assert(fs>=0);
6201         emit_testimm(fs,0x800000);
6202         if(source[i]&0x10000) // BC1T
6203         {
6204           nottaken=(int)out;
6205           emit_jeq(1);
6206         }
6207         else // BC1F
6208         {
6209           nottaken=(int)out;
6210           emit_jne(1);
6211         }
6212       }
6213     } // if(1)
6214     int adj;
         // Unneeded masks for the delay slot: start from the branch's masks, clear
         // the bits of the registers instruction i+1 reads, and always set bit 0.
6215     uint64_t ds_unneeded=branch_regs[i].u;
6216     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6217     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6218     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
         // If the upper half of the delay slot's destination is needed, so are
         // the upper halves of dep1/dep2 of that instruction.
6219     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6220     ds_unneeded|=1;
6221     ds_unneeded_upper|=1;
6222     // branch taken
6223     //assem_debug("1:\n");
6224     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6225                   ds_unneeded,ds_unneeded_upper);
6226     // load regs
6227     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6228     address_generation(i+1,&branch_regs[i],0);
6229     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6230     ds_assemble(i+1,&branch_regs[i]);
6231     cc=get_reg(branch_regs[i].regmap,CCREG);
6232     if(cc==-1) {
           // Cycle count is not mapped after the branch: load it into HOST_CCREG.
6233       emit_loadreg(CCREG,cc=HOST_CCREG);
6234       // CHECK: Is the following instruction (fall thru) allocated ok?
6235     }
6236     assert(cc==HOST_CCREG);
6237     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6238     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6239     assem_debug("cycle count (adj)\n");
6240     if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6241     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6242     if(internal)
6243       assem_debug("branch: internal\n");
6244     else
6245       assem_debug("branch: external\n");
6246     if(internal&&is_ds[(ba[i]-start)>>2]) {
6247       ds_assemble_entry(i);
6248     }
6249     else {
6250       add_to_linker((int)out,ba[i],internal);
6251       emit_jmp(0);
6252     }
6253
6254     // branch not taken
6255     if(1) { // <- FIXME (don't need this)
6256       set_jump_target(nottaken,(int)out);
6257       assem_debug("1:\n");
           // "Likely" branches do not get a delay-slot copy on the not-taken path.
6258       if(!likely[i]) {
6259         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6260                       ds_unneeded,ds_unneeded_upper);
6261         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6262         address_generation(i+1,&branch_regs[i],0);
6263         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6264         ds_assemble(i+1,&branch_regs[i]);
6265       }
6266       cc=get_reg(branch_regs[i].regmap,CCREG);
6267       if(cc==-1&&!likely[i]) {
6268         // Cycle count isn't in a register, temporarily load it then write it out
6269         emit_loadreg(CCREG,HOST_CCREG);
6270         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6271         int jaddr=(int)out;
6272         emit_jns(0);
6273         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6274         emit_storereg(CCREG,HOST_CCREG);
6275       }
6276       else{
             // Uses the mapping from before the branch (i_regmap), which must hold
             // the cycle count in HOST_CCREG. The stub is tagged NULLDS for likely
             // branches and NOTTAKEN otherwise.
6277         cc=get_reg(i_regmap,CCREG);
6278         assert(cc==HOST_CCREG);
6279         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
6280         int jaddr=(int)out;
6281         emit_jns(0);
6282         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6283       }
6284     }
6285   }
6286 }
6287
// Emit host code for branch instruction i without assembling its delay slot
// inline. The branch target is computed into HOST_BTREG, wb_dirtys() is called
// for the current register state, and control leaves through an external jump
// whose target address is start+i*4+5.
// NOTE(review): start+i*4+5 is the delay slot's address (start+i*4+4) plus 1;
// pagespan_ds() registers its entry point at start+1, so the odd address
// presumably identifies that delay-slot entry - confirm.
//
//   i      - index of the branch in the block being compiled.
//   i_regs - register state at the branch; its regmap is used to find the host
//            registers of the operands and to pick scratch registers.
//
// For non-likely conditional branches the target is chosen with conditional
// moves between ba[i] and the fall-through address start+i*4+8. For the likely
// forms handled here (BEQL/BNEL/BC1FL/BC1TL) a conditional jump to a separate
// not-taken exit (target start+i*4+8) is emitted instead.
// BLEZL/BGTZL and opcode 1 (BLTZ/BGEZ) are rejected by assert.
6288 static void pagespan_assemble(int i,struct regstat *i_regs)
6289 {
       // Host registers of the two operands; the |64 lookups are presumably the
       // upper 32 bits of 64-bit guest registers - confirm.
6290   int s1l=get_reg(i_regs->regmap,rs1[i]);
6291   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
6292   int s2l=get_reg(i_regs->regmap,rs2[i]);
6293   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
6294   void *nt_branch=NULL;
6295   int taken=0;
6296   int nottaken=0;
6297   int unconditional=0;
       // Normalize comparisons against register 0: keep the other operand in s1
       // and mark s2 as absent, so the code below uses emit_test instead of emit_cmp.
6298   if(rs1[i]==0)
6299   {
6300     s1l=s2l;s1h=s2h;
6301     s2l=s2h=-1;
6302   }
6303   else if(rs2[i]==0)
6304   {
6305     s2l=s2h=-1;
6306   }
       // Both operands flagged in is32: the upper halves are ignored.
6307   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
6308     s1h=s2h=-1;
6309   }
       // Pick scratch host registers by scanning upward with a shared cursor hr:
       //   addr   - receives the branch target (HOST_BTREG itself if it is unmapped)
       //   alt    - second address register
       //   ntaddr - third register, only searched for BLEZ/BGTZ
       // Candidates exclude EXCLUDE_REG, HOST_CCREG and registers holding rs1/rs2
       // (the &63 makes the test cover the |64 mappings too).
       // NOTE(review): addr/alt/ntaddr stay uninitialized if a search fails; only
       // the assert(hr<HOST_REGS) below guards against that.
6310   int hr=0;
6311   int addr,alt,ntaddr;
6312   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
6313   else {
6314     while(hr<HOST_REGS)
6315     {
6316       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
6317          (i_regs->regmap[hr]&63)!=rs1[i] &&
6318          (i_regs->regmap[hr]&63)!=rs2[i] )
6319       {
6320         addr=hr++;break;
6321       }
6322       hr++;
6323     }
6324   }
6325   while(hr<HOST_REGS)
6326   {
6327     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6328        (i_regs->regmap[hr]&63)!=rs1[i] &&
6329        (i_regs->regmap[hr]&63)!=rs2[i] )
6330     {
6331       alt=hr++;break;
6332     }
6333     hr++;
6334   }
6335   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
6336   {
6337     while(hr<HOST_REGS)
6338     {
6339       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6340          (i_regs->regmap[hr]&63)!=rs1[i] &&
6341          (i_regs->regmap[hr]&63)!=rs2[i] )
6342       {
6343         ntaddr=hr;break;
6344       }
6345       hr++;
6346     }
6347   }
6348   assert(hr<HOST_REGS);
6349   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6350     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6351   }
       // Add this branch's cycles to the cycle counter.
6352   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6353   if(opcode[i]==2) // J
6354   {
6355     unconditional=1;
6356   }
6357   if(opcode[i]==3) // JAL
6358   {
6359     // TODO: mini_ht
         // Store the return address in the register mapped to guest register 31.
         // NOTE(review): rt is not checked for a negative (unmapped) result here.
6360     int rt=get_reg(i_regs->regmap,31);
6361     emit_movimm(start+i*4+8,rt);
6362     unconditional=1;
6363   }
6364   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6365   {
         // The target comes from a register: copy it into addr.
6366     emit_mov(s1l,addr);
6367     if(opcode2[i]==9) // JALR
6368     {
           // NOTE(review): rt is not checked for a negative (unmapped) result here.
6369       int rt=get_reg(i_regs->regmap,rt1[i]);
6370       emit_movimm(start+i*4+8,rt);
6371     }
6372   }
6373   if((opcode[i]&0x3f)==4) // BEQ
6374   {
6375     if(rs1[i]==rs2[i])
6376     {
           // A register always equals itself.
6377       unconditional=1;
6378     }
6379     else
6380     #ifdef HAVE_CMOV_IMM
6381     if(s1h<0) {
6382       if(s2l>=0) emit_cmp(s1l,s2l);
6383       else emit_test(s1l,s1l);
6384       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6385     }
6386     else
6387     #endif
6388     {
           // Presumably loads ba[i] into addr and the fall-through address into
           // alt (emit_mov2imm_compact is not visible here - confirm); each
           // "not equal" result then replaces addr with alt.
6389       assert(s1l>=0);
6390       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6391       if(s1h>=0) {
6392         if(s2h>=0) emit_cmp(s1h,s2h);
6393         else emit_test(s1h,s1h);
6394         emit_cmovne_reg(alt,addr);
6395       }
6396       if(s2l>=0) emit_cmp(s1l,s2l);
6397       else emit_test(s1l,s1l);
6398       emit_cmovne_reg(alt,addr);
6399     }
6400   }
6401   if((opcode[i]&0x3f)==5) // BNE
6402   {
6403     #ifdef HAVE_CMOV_IMM
6404     if(s1h<0) {
6405       if(s2l>=0) emit_cmp(s1l,s2l);
6406       else emit_test(s1l,s1l);
6407       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6408     }
6409     else
6410     #endif
6411     {
           // Same scheme as BEQ with the two addresses swapped: addr starts as
           // the fall-through address and becomes ba[i] on "not equal".
6412       assert(s1l>=0);
6413       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6414       if(s1h>=0) {
6415         if(s2h>=0) emit_cmp(s1h,s2h);
6416         else emit_test(s1h,s1h);
6417         emit_cmovne_reg(alt,addr);
6418       }
6419       if(s2l>=0) emit_cmp(s1l,s2l);
6420       else emit_test(s1l,s1l);
6421       emit_cmovne_reg(alt,addr);
6422     }
6423   }
6424   if((opcode[i]&0x3f)==0x14) // BEQL
6425   {
6426     if(s1h>=0) {
6427       if(s2h>=0) emit_cmp(s1h,s2h);
6428       else emit_test(s1h,s1h);
6429       nottaken=(int)out;
6430       emit_jne(0);
6431     }
6432     if(s2l>=0) emit_cmp(s1l,s2l);
6433     else emit_test(s1l,s1l);
         // The first jne (if any) is pointed at the second jne emitted just below,
         // which then forwards to the not-taken exit (this relies on the flags
         // still reading "not equal" when that second jump executes).
6434     if(nottaken) set_jump_target(nottaken,(int)out);
6435     nottaken=(int)out;
6436     emit_jne(0);
6437   }
6438   if((opcode[i]&0x3f)==0x15) // BNEL
6439   {
6440     if(s1h>=0) {
6441       if(s2h>=0) emit_cmp(s1h,s2h);
6442       else emit_test(s1h,s1h);
           // Upper halves differ: jump straight to the taken path below.
6443       taken=(int)out;
6444       emit_jne(0);
6445     }
6446     if(s2l>=0) emit_cmp(s1l,s2l);
6447     else emit_test(s1l,s1l);
6448     nottaken=(int)out;
6449     emit_jeq(0);
6450     if(taken) set_jump_target(taken,(int)out);
6451   }
6452   if((opcode[i]&0x3f)==6) // BLEZ
6453   {
         // Compares the low word with 1 and, when an upper-half register is
         // present, tests it as well; the conditional moves leave the selected
         // address in addr.
6454     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6455     emit_cmpimm(s1l,1);
6456     if(s1h>=0) emit_mov(addr,ntaddr);
6457     emit_cmovl_reg(alt,addr);
6458     if(s1h>=0) {
6459       emit_test(s1h,s1h);
6460       emit_cmovne_reg(ntaddr,addr);
6461       emit_cmovs_reg(alt,addr);
6462     }
6463   }
6464   if((opcode[i]&0x3f)==7) // BGTZ
6465   {
         // Mirror image of BLEZ with the roles of the two addresses exchanged.
6466     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6467     emit_cmpimm(s1l,1);
6468     if(s1h>=0) emit_mov(addr,alt);
6469     emit_cmovl_reg(ntaddr,addr);
6470     if(s1h>=0) {
6471       emit_test(s1h,s1h);
6472       emit_cmovne_reg(alt,addr);
6473       emit_cmovs_reg(ntaddr,addr);
6474     }
6475   }
       // The next two asserts always fail when reached: BLEZL/BGTZL are not supported here.
6476   if((opcode[i]&0x3f)==0x16) // BLEZL
6477   {
6478     assert((opcode[i]&0x3f)!=0x16);
6479   }
6480   if((opcode[i]&0x3f)==0x17) // BGTZL
6481   {
6482     assert((opcode[i]&0x3f)!=0x17);
6483   }
6484   assert(opcode[i]!=1); // BLTZ/BGEZ
6485
6486   //FIXME: Check CSREG
       // Floating point condition branches: bit 0x800000 of the register in s1l is
       // tested (s1l is the host register of rs1[i]; presumably rs1[i] is the FP
       // status register for these opcodes - confirm). source[i]&0x30000 selects
       // the variant.
6487   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6488     if((source[i]&0x30000)==0) // BC1F
6489     {
6490       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6491       emit_testimm(s1l,0x800000);
6492       emit_cmovne_reg(alt,addr);
6493     }
6494     if((source[i]&0x30000)==0x10000) // BC1T
6495     {
6496       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6497       emit_testimm(s1l,0x800000);
6498       emit_cmovne_reg(alt,addr);
6499     }
6500     if((source[i]&0x30000)==0x20000) // BC1FL
6501     {
6502       emit_testimm(s1l,0x800000);
6503       nottaken=(int)out;
6504       emit_jne(0);
6505     }
6506     if((source[i]&0x30000)==0x30000) // BC1TL
6507     {
6508       emit_testimm(s1l,0x800000);
6509       nottaken=(int)out;
6510       emit_jeq(0);
6511     }
6512   }
6513
6514   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6515   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
       // Put the branch target in HOST_BTREG: the constant ba[i] when this path is
       // only reached with the branch taken, otherwise the address selected in addr.
6516   if(likely[i]||unconditional)
6517   {
6518     emit_movimm(ba[i],HOST_BTREG);
6519   }
6520   else if(addr!=HOST_BTREG)
6521   {
6522     emit_mov(addr,HOST_BTREG);
6523   }
       // Exit jump: it goes directly to already compiled code if check_addr()
       // finds any for target_addr (the stub is then recorded with add_link),
       // otherwise to the stub emitted by emit_extjump_ds right after the jump.
6524   void *branch_addr=out;
6525   emit_jmp(0);
6526   int target_addr=start+i*4+5;
6527   void *stub=out;
6528   void *compiled_target_addr=check_addr(target_addr);
6529   emit_extjump_ds((int)branch_addr,target_addr);
6530   if(compiled_target_addr) {
6531     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6532     add_link(target_addr,stub);
6533   }
6534   else set_jump_target((int)branch_addr,(int)stub);
6535   if(likely[i]) {
6536     // Not-taken path
         // Same exit sequence as above with target start+i*4+8. The four locals
         // declared here shadow the outer ones of the same name.
6537     set_jump_target((int)nottaken,(int)out);
6538     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6539     void *branch_addr=out;
6540     emit_jmp(0);
6541     int target_addr=start+i*4+8;
6542     void *stub=out;
6543     void *compiled_target_addr=check_addr(target_addr);
6544     emit_extjump_ds((int)branch_addr,target_addr);
6545     if(compiled_target_addr) {
6546       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6547       add_link(target_addr,stub);
6548     }
6549     else set_jump_target((int)branch_addr,(int)stub);
6550   }
6551 }
6552
6553 // Assemble the delay slot for the above
//
// Emits code for instruction 0 of the current block for the case where it
// is entered as a delay slot.  The entry points are registered under the
// address start+1 in jump_dirty[] (before the dirty stub) and jump_in[]
// (after it).  Once instruction 0 has been assembled, the emitted code
// compares the saved branch target with start+4: on a match it stores and
// reloads registers for start+4 and falls through to whatever is emitted
// next; otherwise it writes registers back and jumps out through
// jump_vaddr_reg[].
// NOTE(review): start+1 looks like a tag for "delay-slot entry of this
// block"; confirm against the code that looks these addresses up.
6554 static void pagespan_ds()
6555 {
6556   assem_debug("initial delay slot:\n");
6557   u_int vaddr=start+1; // key used for both ll_add() registrations below
6558   u_int page=get_page(vaddr);
6559   u_int vpage=get_vpage(vaddr);
6560   ll_add(jump_dirty+vpage,vaddr,(void *)out); // registered at the output position before do_dirty_stub_ds()
6561   do_dirty_stub_ds();
6562   ll_add(jump_in+page,vaddr,(void *)out); // registered at the output position after do_dirty_stub_ds()
6563   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6564   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6565     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6566   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6567     emit_writeword(HOST_BTREG,(int)&branch_target); // BTREG does not keep its host register: save it to branch_target
6568   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6569   address_generation(0,&regs[0],regs[0].regmap_entry);
6570   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a) // the masks match opcodes 0x39/0x3d and 0x3a/0x3e
6571     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6572   cop1_usable=0;
6573   is_delayslot=0;
6574   switch(itype[0]) { // dispatch instruction 0 to its assembler; there is no default case
6575     case ALU:
6576       alu_assemble(0,&regs[0]);break;
6577     case IMM16:
6578       imm16_assemble(0,&regs[0]);break;
6579     case SHIFT:
6580       shift_assemble(0,&regs[0]);break;
6581     case SHIFTIMM:
6582       shiftimm_assemble(0,&regs[0]);break;
6583     case LOAD:
6584       load_assemble(0,&regs[0]);break;
6585     case LOADLR:
6586       loadlr_assemble(0,&regs[0]);break;
6587     case STORE:
6588       store_assemble(0,&regs[0]);break;
6589     case STORELR:
6590       storelr_assemble(0,&regs[0]);break;
6591     case COP0:
6592       cop0_assemble(0,&regs[0]);break;
6593     case COP1:
6594       cop1_assemble(0,&regs[0]);break;
6595     case C1LS:
6596       c1ls_assemble(0,&regs[0]);break;
6597     case COP2:
6598       cop2_assemble(0,&regs[0]);break;
6599     case C2LS:
6600       c2ls_assemble(0,&regs[0]);break;
6601     case C2OP:
6602       c2op_assemble(0,&regs[0]);break;
6603     case FCONV:
6604       fconv_assemble(0,&regs[0]);break;
6605     case FLOAT:
6606       float_assemble(0,&regs[0]);break;
6607     case FCOMP:
6608       fcomp_assemble(0,&regs[0]);break;
6609     case MULTDIV:
6610       multdiv_assemble(0,&regs[0]);break;
6611     case MOV:
6612       mov_assemble(0,&regs[0]);break;
6613     case SYSCALL:
6614     case HLECALL:
6615     case INTCALL:
6616     case SPAN:
6617     case UJUMP:
6618     case RJUMP:
6619     case CJUMP:
6620     case SJUMP:
6621     case FJUMP:
6622       printf("Jump in the delay slot.  This is probably a bug.\n"); // only reported; nothing is assembled for these types
6623   }
6624   int btaddr=get_reg(regs[0].regmap,BTREG); // host register currently mapped to BTREG, if any
6625   if(btaddr<0) {
6626     btaddr=get_reg(regs[0].regmap,-1); // NOTE(review): the result is not checked for <0 before use
6627     emit_readword((int)&branch_target,btaddr);
6628   }
6629   assert(btaddr!=HOST_CCREG);
6630   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6631 #ifdef HOST_IMM8
6632   emit_movimm(start+4,HOST_TEMPREG);
6633   emit_cmp(btaddr,HOST_TEMPREG);
6634 #else
6635   emit_cmpimm(btaddr,start+4);
6636 #endif
6637   int branch=(int)out; // location of the conditional jump patched below
6638   emit_jeq(0);
6639   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1); // branch target != start+4: leave through jump_vaddr_reg[]
6640   emit_jmp(jump_vaddr_reg[btaddr]);
6641   set_jump_target(branch,(int)out); // branch target == start+4: execution continues here
6642   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6643   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6644 }
6645
6646 // Basic liveness analysis for MIPS registers
//
// Walks instructions iend..istart backwards.  u and uu are bitmasks (bit n
// = register n, bit 0 always set) of registers, respectively register
// upper halves, that are not needed at the current point.  After each
// instruction the masks are saved in unneeded_reg[i] and
// unneeded_reg_upper[i].  For branches, branch_unneeded_reg[i] and
// branch_unneeded_reg_upper[i] receive the mask for the branch outcome
// without the delay slot merged in.
//   istart, iend - inclusive range of instruction indices to process
//   r            - recursion depth; a backward branch re-analyses the range
//                  between its target and itself only while r<2
// Side effects visible here: sets bt[] for internal branch targets and for
// the instruction two past a branch whose rt1 is 31.
6647 void unneeded_registers(int istart,int iend,int r)
6648 {
6649   int i;
6650   uint64_t u,uu,b,bu;
6651   uint64_t temp_u,temp_uu;
6652   uint64_t tdep;
6653   if(iend==slen-1) {
6654     u=1;uu=1;
6655   }else{
6656     u=unneeded_reg[iend+1];
6657     uu=unneeded_reg_upper[iend+1];
6658     u=1;uu=1; // overrides the two loads above, so both paths start with only bit 0 set
6659   }
6660   for (i=iend;i>=istart;i--)
6661   {
6662     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6663     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6664     {
6665       // If subroutine call, flag return address as a possible branch target
6666       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6667       
6668       if(ba[i]<start || ba[i]>=(start+slen*4))
6669       {
6670         // Branch out of this block, flush all regs
6671         u=1;
6672         uu=1;
6673         /* Hexagon hack 
6674         if(itype[i]==UJUMP&&rt1[i]==31)
6675         {
6676           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6677         }
6678         if(itype[i]==RJUMP&&rs1[i]==31)
6679         {
6680           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6681         }
6682         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6683           if(itype[i]==UJUMP&&rt1[i]==31)
6684           {
6685             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6686             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6687           }
6688           if(itype[i]==RJUMP&&rs1[i]==31)
6689           {
6690             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6691             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6692           }
6693         }*/
6694         branch_unneeded_reg[i]=u;
6695         branch_unneeded_reg_upper[i]=uu;
6696         // Merge in delay slot
6697         tdep=(~uu>>rt1[i+1])&1; // 1 when the upper half of the delay slot's rt1 is still needed
6698         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6699         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6700         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6701         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6702         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6703         u|=1;uu|=1;
6704         // If branch is "likely" (and conditional)
6705         // then we skip the delay slot on the fall-thru path
6706         if(likely[i]) {
6707           if(i<slen-1) { // NOTE(review): admits i==slen-2, where i+2==slen; confirm that entry is meaningful
6708             u&=unneeded_reg[i+2];
6709             uu&=unneeded_reg_upper[i+2];
6710           }
6711           else
6712           {
6713             u=1;
6714             uu=1;
6715           }
6716         }
6717       }
6718       else
6719       {
6720         // Internal branch, flag target
6721         bt[(ba[i]-start)>>2]=1;
6722         if(ba[i]<=start+i*4) {
6723           // Backward branch
6724           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6725           {
6726             // Unconditional branch
6727             temp_u=1;temp_uu=1;
6728           } else {
6729             // Conditional branch (not taken case)
6730             temp_u=unneeded_reg[i+2]; // NOTE(review): no range check on i+2 for this read
6731             temp_uu=unneeded_reg_upper[i+2];
6732           }
6733           // Merge in delay slot
6734           tdep=(~temp_uu>>rt1[i+1])&1;
6735           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6736           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6737           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6738           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6739           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6740           temp_u|=1;temp_uu|=1;
6741           // If branch is "likely" (and conditional)
6742           // then we skip the delay slot on the fall-thru path
6743           if(likely[i]) {
6744             if(i<slen-1) {
6745               temp_u&=unneeded_reg[i+2];
6746               temp_uu&=unneeded_reg_upper[i+2];
6747             }
6748             else
6749             {
6750               temp_u=1;
6751               temp_uu=1;
6752             }
6753           }
6754           tdep=(~temp_uu>>rt1[i])&1; // now merge the branch instruction itself
6755           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6756           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6757           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6758           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6759           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6760           temp_u|=1;temp_uu|=1;
6761           unneeded_reg[i]=temp_u; // provisional value, visible to the recursive call below
6762           unneeded_reg_upper[i]=temp_uu;
6763           // Only go three levels deep.  This recursion can take an
6764           // excessive amount of time if there are a lot of nested loops.
6765           if(r<2) {
6766             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6767           }else{
6768             unneeded_reg[(ba[i]-start)>>2]=1;
6769             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6770           }
6771         } /*else*/ if(1) { // runs for backward and forward internal branches alike
6772           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6773           {
6774             // Unconditional branch
6775             u=unneeded_reg[(ba[i]-start)>>2];
6776             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6777             branch_unneeded_reg[i]=u;
6778             branch_unneeded_reg_upper[i]=uu;
6779         //u=1;
6780         //uu=1;
6781         //branch_unneeded_reg[i]=u;
6782         //branch_unneeded_reg_upper[i]=uu;
6783             // Merge in delay slot
6784             tdep=(~uu>>rt1[i+1])&1;
6785             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6786             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6787             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6788             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6789             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6790             u|=1;uu|=1;
6791           } else {
6792             // Conditional branch
6793             b=unneeded_reg[(ba[i]-start)>>2];
6794             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6795             branch_unneeded_reg[i]=b;
6796             branch_unneeded_reg_upper[i]=bu;
6797         //b=1;
6798         //bu=1;
6799         //branch_unneeded_reg[i]=b;
6800         //branch_unneeded_reg_upper[i]=bu;
6801             // Branch delay slot
6802             tdep=(~uu>>rt1[i+1])&1; // NOTE(review): derived from uu although b/bu are the masks updated here; confirm intended
6803             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6804             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6805             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6806             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6807             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6808             b|=1;bu|=1;
6809             // If branch is "likely" then we skip the
6810             // delay slot on the fall-thru path
6811             if(likely[i]) {
6812               u=b;
6813               uu=bu;
6814               if(i<slen-1) {
6815                 u&=unneeded_reg[i+2];
6816                 uu&=unneeded_reg_upper[i+2];
6817         //u=1;
6818         //uu=1;
6819               }
6820             } else {
6821               u&=b; // keep only what is unneeded on both the taken and fall-through paths
6822               uu&=bu;
6823         //u=1;
6824         //uu=1;
6825             }
6826             if(i<slen-1) {
6827               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6828               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6829         //branch_unneeded_reg[i]=1;
6830         //branch_unneeded_reg_upper[i]=1;
6831             } else {
6832               branch_unneeded_reg[i]=1;
6833               branch_unneeded_reg_upper[i]=1;
6834             }
6835           }
6836         }
6837       }
6838     }
6839     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6840     {
6841       // SYSCALL instruction (software interrupt)
6842       u=1;
6843       uu=1;
6844     }
6845     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6846     {
6847       // ERET instruction (return from interrupt)
6848       u=1;
6849       uu=1;
6850     }
6851     //u=uu=1; // DEBUG
6852     tdep=(~uu>>rt1[i])&1; // 1 when the upper half of rt1[i] is still needed after this instruction
6853     // Written registers are unneeded
6854     u|=1LL<<rt1[i];
6855     u|=1LL<<rt2[i];
6856     uu|=1LL<<rt1[i];
6857     uu|=1LL<<rt2[i];
6858     // Accessed registers are needed
6859     u&=~(1LL<<rs1[i]);
6860     u&=~(1LL<<rs2[i]);
6861     uu&=~(1LL<<us1[i]);
6862     uu&=~(1LL<<us2[i]);
6863     // Source-target dependencies
6864     uu&=~(tdep<<dep1[i]);
6865     uu&=~(tdep<<dep2[i]);
6866     // R0 is always unneeded
6867     u|=1;uu|=1;
6868     // Save it
6869     unneeded_reg[i]=u;
6870     unneeded_reg_upper[i]=uu;
6871     /*
6872     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6873     printf("U:");
6874     int r;
6875     for(r=1;r<=CCREG;r++) {
6876       if((unneeded_reg[i]>>r)&1) {
6877         if(r==HIREG) printf(" HI");
6878         else if(r==LOREG) printf(" LO");
6879         else printf(" r%d",r);
6880       }
6881     }
6882     printf(" UU:");
6883     for(r=1;r<=CCREG;r++) {
6884       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6885         if(r==HIREG) printf(" HI");
6886         else if(r==LOREG) printf(" LO");
6887         else printf(" r%d",r);
6888       }
6889     }
6890     printf("\n");*/
6891   }
6892 #ifdef FORCE32
// With FORCE32 every upper half in the processed range is marked unneeded.
6893   for (i=iend;i>=istart;i--)
6894   {
6895     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6896   }
6897 #endif
6898 }
6899
6900 // Identify registers which are likely to contain 32-bit values
6901 // This is used to predict whether any branches will jump to a
6902 // location with 64-bit values in registers.
//
// Forward pass over instructions 0..slen-1.  is32 is a bitmask (bit n =
// register n, bit 0 always set) of registers predicted to hold 32-bit
// values; its value after each instruction is stored in p32[i].
// For a branch, the delay-slot instruction (i+1) is evaluated in place of
// the branch, and the mask from before that is kept in lastbranch.  After a
// conditional branch, lastbranch is restored when the delay slot itself is
// reached and, for "likely" branches, again at the fall-through
// instruction.
// At a branch target (bt[i]) the mask is intersected with p32[] of every
// earlier instruction whose ba[] equals this address, and reset to 1 if any
// instruction at or after i branches here.
// Takes no parameters and returns nothing; results are left in p32[].
6903 static void provisional_32bit()
6904 {
6905   int i,j;
6906   uint64_t is32=1;
6907   uint64_t lastbranch=1;
6908   
6909   for(i=0;i<slen;i++)
6910   {
6911     if(i>0) {
6912       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
6913         if(i>1) is32=lastbranch; // delay slot: start again from the state before the branch
6914         else is32=1;
6915       }
6916     }
6917     if(i>1)
6918     {
6919       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
6920         if(likely[i-2]) {
6921           if(i>2) is32=lastbranch; // fall-through of a "likely" branch: delay slot was skipped
6922           else is32=1;
6923         }
6924       }
6925       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
6926       {
6927         if(rs1[i-2]==0||rs2[i-2]==0)
6928         {
6929           // Compared against r0: mark the other operand as 32-bit here
6929           if(rs1[i-2]) {
6930             is32|=1LL<<rs1[i-2];
6931           }
6932           if(rs2[i-2]) {
6933             is32|=1LL<<rs2[i-2];
6934           }
6935         }
6936       }
6937     }
6938     // If something jumps here with 64-bit values
6939     // then promote those registers to 64 bits
6940     if(bt[i])
6941     {
6942       uint64_t temp_is32=is32;
6943       for(j=i-1;j>=0;j--)
6944       {
6945         if(ba[j]==start+i*4) 
6946           //temp_is32&=branch_regs[j].is32;
6947           temp_is32&=p32[j];
6948       }
6949       for(j=i;j<slen;j++)
6950       {
6951         if(ba[j]==start+i*4) 
6952           temp_is32=1; // branch from i or later: p32[j] is not computed yet, assume nothing
6953       }
6954       is32=temp_is32;
6955     }
6956     int type=itype[i];
6957     int op=opcode[i];
6958     int op2=opcode2[i];
6959     int rt=rt1[i];
6960     int s1=rs1[i];
6961     int s2=rs2[i];
6962     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
6963       // Branches don't write registers, consider the delay slot instead.
6964       type=itype[i+1];
6965       op=opcode[i+1];
6966       op2=opcode2[i+1];
6967       rt=rt1[i+1];
6968       s1=rs1[i+1];
6969       s2=rs2[i+1];
6970       lastbranch=is32;
6971     }
6972     switch(type) {
6973       case LOAD:
// Test op, not opcode[i]: for a branch, type and rt describe the
// delay-slot instruction, so the opcode must come from there too.
6974         if(op==0x27||op==0x37|| // LWU/LD
6975            op==0x1A||op==0x1B) // LDL/LDR
6976           is32&=~(1LL<<rt);
6977         else
6978           is32|=1LL<<rt;
6979         break;
6980       case STORE:
6981       case STORELR:
6982         break;
6983       case LOADLR:
6984         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
6985         if(op==0x22) is32|=1LL<<rt; // LWL
6986         break;
6987       case IMM16:
6988         if (op==0x08||op==0x09|| // ADDI/ADDIU
6989             op==0x0a||op==0x0b|| // SLTI/SLTIU
6990             op==0x0c|| // ANDI
6991             op==0x0f)  // LUI
6992         {
6993           is32|=1LL<<rt;
6994         }
6995         if(op==0x18||op==0x19) { // DADDI/DADDIU
6996           is32&=~(1LL<<rt);
6997           //if(imm[i]==0)
6998           //  is32|=((is32>>s1)&1LL)<<rt;
6999         }
7000         if(op==0x0d||op==0x0e) { // ORI/XORI
7001           uint64_t sr=((is32>>s1)&1LL); // result is 32-bit exactly when the source is
7002           is32&=~(1LL<<rt);
7003           is32|=sr<<rt;
7004         }
7005         break;
7006       case UJUMP:
7007         break;
7008       case RJUMP:
7009         break;
7010       case CJUMP:
7011         break;
7012       case SJUMP:
7013         break;
7014       case FJUMP:
7015         break;
7016       case ALU:
7017         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
7018           is32|=1LL<<rt;
7019         }
7020         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7021           is32|=1LL<<rt;
7022         }
7023         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7024           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL); // 32-bit only if both sources are
7025           is32&=~(1LL<<rt);
7026           is32|=sr<<rt;
7027         }
7028         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
7029           if(s1==0&&s2==0) {
7030             is32|=1LL<<rt;
7031           }
7032           else if(s2==0) {
7033             uint64_t sr=((is32>>s1)&1LL);
7034             is32&=~(1LL<<rt);
7035             is32|=sr<<rt;
7036           }
7037           else if(s1==0) {
7038             uint64_t sr=((is32>>s2)&1LL);
7039             is32&=~(1LL<<rt);
7040             is32|=sr<<rt;
7041           }
7042           else {
7043             is32&=~(1LL<<rt);
7044           }
7045         }
7046         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
7047           if(s1==0&&s2==0) {
7048             is32|=1LL<<rt;
7049           }
7050           else if(s2==0) {
7051             uint64_t sr=((is32>>s1)&1LL);
7052             is32&=~(1LL<<rt);
7053             is32|=sr<<rt;
7054           }
7055           else {
7056             is32&=~(1LL<<rt);
7057           }
7058         }
7059         break;
7060       case MULTDIV:
7061         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7062           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
7063         }
7064         else {
7065           is32|=(1LL<<HIREG)|(1LL<<LOREG);
7066         }
7067         break;
7068       case MOV:
7069         {
7070           uint64_t sr=((is32>>s1)&1LL);
7071           is32&=~(1LL<<rt);
7072           is32|=sr<<rt;
7073         }
7074         break;
7075       case SHIFT:
7076         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
7077         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
7078         break;
7079       case SHIFTIMM:
7080         is32|=1LL<<rt;
7081         // DSLL/DSRL/DSRA/DSLL32/DSRL32 but not DSRA32 have 64-bit result
7082         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
7083         break;
7084       case COP0:
7085         if(op2==0) is32|=1LL<<rt; // MFC0
7086         break;
7087       case COP1:
7088       case COP2:
7089         if(op2==0) is32|=1LL<<rt; // MFC1
7090         if(op2==1) is32&=~(1LL<<rt); // DMFC1
7091         if(op2==2) is32|=1LL<<rt; // CFC1
7092         break;
7093       case C1LS:
7094       case C2LS:
7095         break;
7096       case FLOAT:
7097       case FCONV:
7098         break;
7099       case FCOMP:
7100         break;
7101       case C2OP:
7102       case SYSCALL:
7103       case HLECALL:
7104         break;
7105       default:
7106         break;
7107     }
7108     is32|=1; // bit 0 is always kept set
7109     p32[i]=is32;
7110
7111     if(i>0)
7112     {
7113       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
7114       {
7115         if(rt1[i-1]==31) // JAL/JALR
7116         {
7117           // Subroutine call will return here, don't alloc any registers
7118           is32=1;
7119         }
7120         else if(i+1<slen)
7121         {
7122           // Internal branch will jump here, match registers to caller
7123           is32=0x3FFFFFFFFLL; // bits 0..33 set
7124         }
7125       }
7126     }
7127   }
7128 }
7129
7130 // Identify registers which may be assumed to contain 32-bit values
7131 // and where optimizations will rely on this.
7132 // This is used to determine whether backward branches can safely
7133 // jump to a location with 64-bit values in registers.
//
// Backward pass over instructions slen-1..0.  r32 is a bitmask (bit n =
// register n) carried from later to earlier instructions; the value for
// each instruction is stored in pr32[i].  Registers written by an
// instruction are removed from the mask; registers it reads through
// us1/us2 (and dep1/dep2 when their upper half is not unneeded) are added
// when regs[i].was32 marks them 32-bit.  After the store, pr32[i] also
// gains every register that is mapped on entry, dirty, 32-bit and whose
// upper half is not unneeded.
// Takes no parameters and returns nothing; results are left in pr32[].
7134 static void provisional_r32()
7135 {
7136   u_int r32=0; // NOTE(review): u_int, while the masks below use 1LL shifts; bits beyond the width of u_int are dropped on assignment - confirm intended
7137   int i;
7138   
7139   for (i=slen-1;i>=0;i--)
7140   {
7141     int hr;
7142     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7143     {
7144       if(ba[i]<start || ba[i]>=(start+slen*4))
7145       {
7146         // Branch out of this block, don't need anything
7147         r32=0;
7148       }
7149       else
7150       {
7151         // Internal branch
7152         // Need whatever matches the target
7153         // (and doesn't get overwritten by the delay slot instruction)
7154         r32=0;
7155         int t=(ba[i]-start)>>2; // instruction index of the branch target
7156         if(ba[i]>start+i*4) {
7157           // Forward branch
7158           //if(!(requires_32bit[t]&~regs[i].was32))
7159           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7160           if(!(pr32[t]&~regs[i].was32)) // only when everything the target requires is 32-bit here
7161             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7162         }else{
7163           // Backward branch
7164           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32)) // pr32[t] is not computed yet; uses was32 and unneeded_reg_upper of the target instead
7165             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7166         }
7167       }
7168       // Conditional branch may need registers for following instructions
7169       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7170       {
7171         if(i<slen-2) {
7172           //r32|=requires_32bit[i+2];
7173           r32|=pr32[i+2];
7174           r32&=regs[i].was32;
7175           // Mark this address as a branch target since it may be called
7176           // upon return from interrupt
7177           //bt[i+2]=1;
7178         }
7179       }
7180       // Merge in delay slot
7181       if(!likely[i]) {
7182         // These are overwritten unless the branch is "likely"
7183         // and the delay slot is nullified if not taken
7184         r32&=~(1LL<<rt1[i+1]);
7185         r32&=~(1LL<<rt2[i+1]);
7186       }
7187       // Assume these are needed (delay slot)
7188       if(us1[i+1]>0)
7189       {
7190         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
7191       }
7192       if(us2[i+1]>0)
7193       {
7194         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
7195       }
7196       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
7197       {
7198         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
7199       }
7200       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
7201       {
7202         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
7203       }
7204     }
7205     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7206     {
7207       // SYSCALL instruction (software interrupt)
7208       r32=0;
7209     }
7210     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7211     {
7212       // ERET instruction (return from interrupt)
7213       r32=0;
7214     }
7215     // Check 32 bits
7216     r32&=~(1LL<<rt1[i]); // registers written by this instruction drop out of the mask
7217     r32&=~(1LL<<rt2[i]);
7218     if(us1[i]>0)
7219     {
7220       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
7221     }
7222     if(us2[i]>0)
7223     {
7224       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
7225     }
7226     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
7227     {
7228       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
7229     }
7230     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
7231     {
7232       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
7233     }
7234     //requires_32bit[i]=r32;
7235     pr32[i]=r32;
7236     
7237     // Dirty registers which are 32-bit, require 32-bit input
7238     // as they will be written as 32-bit values
7239     for(hr=0;hr<HOST_REGS;hr++)
7240     {
7241       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
7242         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
7243           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1)) // the next line is this if's body despite its indentation
7244           pr32[i]|=1LL<<regs[i].regmap_entry[hr]; // added to pr32[i] only; r32 itself is not changed
7245           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
7246         }
7247       }
7248     }
7249   }
7250 }
7251
7252 // Write back dirty registers as soon as we will no longer modify them,
7253 // so that we don't end up with lots of writes at the branches.
7254 void clean_registers(int istart,int iend,int wr)
7255 {
7256   int i;
7257   int r;
7258   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
7259   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
7260   if(iend==slen-1) {
7261     will_dirty_i=will_dirty_next=0;
7262     wont_dirty_i=wont_dirty_next=0;
7263   }else{
7264     will_dirty_i=will_dirty_next=will_dirty[iend+1];
7265     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
7266   }
7267   for (i=iend;i>=istart;i--)
7268   {
7269     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7270     {
7271       if(ba[i]<start || ba[i]>=(start+slen*4))
7272       {
7273         // Branch out of this block, flush all regs
7274         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7275         {
7276           // Unconditional branch
7277           will_dirty_i=0;
7278           wont_dirty_i=0;
7279           // Merge in delay slot (will dirty)
7280           for(r=0;r<HOST_REGS;r++) {
7281             if(r!=EXCLUDE_REG) {
7282               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7283               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7284               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7285               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7286               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7287               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7288               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7289               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7290               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7291               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7292               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7293               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7294               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7295               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7296             }
7297           }
7298         }
7299         else
7300         {
7301           // Conditional branch
7302           will_dirty_i=0;
7303           wont_dirty_i=wont_dirty_next;
7304           // Merge in delay slot (will dirty)
7305           for(r=0;r<HOST_REGS;r++) {
7306             if(r!=EXCLUDE_REG) {
7307               if(!likely[i]) {
7308                 // Might not dirty if likely branch is not taken
7309                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7310                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7311                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7312                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7313                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7314                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
7315                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7316                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7317                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7318                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7319                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7320                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7321                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7322                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7323               }
7324             }
7325           }
7326         }
7327         // Merge in delay slot (wont dirty)
7328         for(r=0;r<HOST_REGS;r++) {
7329           if(r!=EXCLUDE_REG) {
7330             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7331             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7332             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7333             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7334             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7335             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7336             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7337             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7338             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7339             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7340           }
7341         }
7342         if(wr) {
7343           #ifndef DESTRUCTIVE_WRITEBACK
7344           branch_regs[i].dirty&=wont_dirty_i;
7345           #endif
7346           branch_regs[i].dirty|=will_dirty_i;
7347         }
7348       }
7349       else
7350       {
7351         // Internal branch
7352         if(ba[i]<=start+i*4) {
7353           // Backward branch
7354           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7355           {
7356             // Unconditional branch
7357             temp_will_dirty=0;
7358             temp_wont_dirty=0;
7359             // Merge in delay slot (will dirty)
7360             for(r=0;r<HOST_REGS;r++) {
7361               if(r!=EXCLUDE_REG) {
7362                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7363                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7364                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7365                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7366                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7367                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7368                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7369                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7370                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7371                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7372                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7373                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7374                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7375                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7376               }
7377             }
7378           } else {
7379             // Conditional branch (not taken case)
7380             temp_will_dirty=will_dirty_next;
7381             temp_wont_dirty=wont_dirty_next;
7382             // Merge in delay slot (will dirty)
7383             for(r=0;r<HOST_REGS;r++) {
7384               if(r!=EXCLUDE_REG) {
7385                 if(!likely[i]) {
7386                   // Will not dirty if likely branch is not taken
7387                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7388                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7389                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7390                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7391                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7392                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7393                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7394                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7395                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7396                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7397                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7398                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7399                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7400                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7401                 }
7402               }
7403             }
7404           }
7405           // Merge in delay slot (wont dirty)
7406           for(r=0;r<HOST_REGS;r++) {
7407             if(r!=EXCLUDE_REG) {
7408               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7409               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7410               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7411               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7412               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7413               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7414               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7415               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7416               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7417               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7418             }
7419           }
7420           // Deal with changed mappings
7421           if(i<iend) {
7422             for(r=0;r<HOST_REGS;r++) {
7423               if(r!=EXCLUDE_REG) {
7424                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7425                   temp_will_dirty&=~(1<<r);
7426                   temp_wont_dirty&=~(1<<r);
7427                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7428                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7429                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7430                   } else {
7431                     temp_will_dirty|=1<<r;
7432                     temp_wont_dirty|=1<<r;
7433                   }
7434                 }
7435               }
7436             }
7437           }
7438           if(wr) {
7439             will_dirty[i]=temp_will_dirty;
7440             wont_dirty[i]=temp_wont_dirty;
7441             clean_registers((ba[i]-start)>>2,i-1,0);
7442           }else{
7443             // Limit recursion.  It can take an excessive amount
7444             // of time if there are a lot of nested loops.
7445             will_dirty[(ba[i]-start)>>2]=0;
7446             wont_dirty[(ba[i]-start)>>2]=-1;
7447           }
7448         }
7449         /*else*/ if(1)
7450         {
7451           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7452           {
7453             // Unconditional branch
7454             will_dirty_i=0;
7455             wont_dirty_i=0;
7456           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7457             for(r=0;r<HOST_REGS;r++) {
7458               if(r!=EXCLUDE_REG) {
7459                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7460                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7461                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7462                 }
7463               }
7464             }
7465           //}
7466             // Merge in delay slot
7467             for(r=0;r<HOST_REGS;r++) {
7468               if(r!=EXCLUDE_REG) {
7469                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7470                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7471                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7472                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7473                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7474                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7475                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7476                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7477                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7478                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7479                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7480                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7481                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7482                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7483               }
7484             }
7485           } else {
7486             // Conditional branch
7487             will_dirty_i=will_dirty_next;
7488             wont_dirty_i=wont_dirty_next;
7489           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7490             for(r=0;r<HOST_REGS;r++) {
7491               if(r!=EXCLUDE_REG) {
7492                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7493                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7494                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7495                 }
7496                 else
7497                 {
7498                   will_dirty_i&=~(1<<r);
7499                 }
7500                 // Treat delay slot as part of branch too
7501                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7502                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7503                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7504                 }
7505                 else
7506                 {
7507                   will_dirty[i+1]&=~(1<<r);
7508                 }*/
7509               }
7510             }
7511           //}
7512             // Merge in delay slot
7513             for(r=0;r<HOST_REGS;r++) {
7514               if(r!=EXCLUDE_REG) {
7515                 if(!likely[i]) {
7516                   // Might not dirty if likely branch is not taken
7517                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7518                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7519                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7520                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7521                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7522                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7523                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7524                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7525                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7526                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7527                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7528                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7529                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7530                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7531                 }
7532               }
7533             }
7534           }
7535           // Merge in delay slot
7536           for(r=0;r<HOST_REGS;r++) {
7537             if(r!=EXCLUDE_REG) {
7538               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7539               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7540               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7541               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7542               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7543               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7544               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7545               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7546               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7547               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7548             }
7549           }
7550           if(wr) {
7551             #ifndef DESTRUCTIVE_WRITEBACK
7552             branch_regs[i].dirty&=wont_dirty_i;
7553             #endif
7554             branch_regs[i].dirty|=will_dirty_i;
7555           }
7556         }
7557       }
7558     }
7559     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7560     {
7561       // SYSCALL instruction (software interrupt)
7562       will_dirty_i=0;
7563       wont_dirty_i=0;
7564     }
7565     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7566     {
7567       // ERET instruction (return from interrupt)
7568       will_dirty_i=0;
7569       wont_dirty_i=0;
7570     }
7571     will_dirty_next=will_dirty_i;
7572     wont_dirty_next=wont_dirty_i;
7573     for(r=0;r<HOST_REGS;r++) {
7574       if(r!=EXCLUDE_REG) {
7575         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7576         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7577         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7578         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7579         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7580         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7581         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7582         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7583         if(i>istart) {
7584           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7585           {
7586             // Don't store a register immediately after writing it,
7587             // may prevent dual-issue.
7588             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7589             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7590           }
7591         }
7592       }
7593     }
7594     // Save it
7595     will_dirty[i]=will_dirty_i;
7596     wont_dirty[i]=wont_dirty_i;
7597     // Mark registers that won't be dirtied as not dirty
7598     if(wr) {
7599       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7600       for(r=0;r<HOST_REGS;r++) {
7601         if((will_dirty_i>>r)&1) {
7602           printf(" r%d",r);
7603         }
7604       }
7605       printf("\n");*/
7606
7607       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7608         regs[i].dirty|=will_dirty_i;
7609         #ifndef DESTRUCTIVE_WRITEBACK
7610         regs[i].dirty&=wont_dirty_i;
7611         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7612         {
7613           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7614             for(r=0;r<HOST_REGS;r++) {
7615               if(r!=EXCLUDE_REG) {
7616                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7617                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7618                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);/*assert(!((wont_dirty_i>>r)&1));*/}
7619               }
7620             }
7621           }
7622         }
7623         else
7624         {
7625           if(i<iend) {
7626             for(r=0;r<HOST_REGS;r++) {
7627               if(r!=EXCLUDE_REG) {
7628                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7629                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7630                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);/*assert(!((wont_dirty_i>>r)&1));*/}
7631               }
7632             }
7633           }
7634         }
7635         #endif
7636       //}
7637     }
7638     // Deal with changed mappings
7639     temp_will_dirty=will_dirty_i;
7640     temp_wont_dirty=wont_dirty_i;
7641     for(r=0;r<HOST_REGS;r++) {
7642       if(r!=EXCLUDE_REG) {
7643         int nr;
7644         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7645           if(wr) {
7646             #ifndef DESTRUCTIVE_WRITEBACK
7647             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7648             #endif
7649             regs[i].wasdirty|=will_dirty_i&(1<<r);
7650           }
7651         }
7652         else if((nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7653           // Register moved to a different register
7654           will_dirty_i&=~(1<<r);
7655           wont_dirty_i&=~(1<<r);
7656           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7657           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7658           if(wr) {
7659             #ifndef DESTRUCTIVE_WRITEBACK
7660             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7661             #endif
7662             regs[i].wasdirty|=will_dirty_i&(1<<r);
7663           }
7664         }
7665         else {
7666           will_dirty_i&=~(1<<r);
7667           wont_dirty_i&=~(1<<r);
7668           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7669             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7670             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7671           } else {
7672             wont_dirty_i|=1<<r;
7673             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);/*assert(!((will_dirty>>r)&1));*/
7674           }
7675         }
7676       }
7677     }
7678   }
7679 }
7680
7681   /* disassembly */
7682 void disassemble_inst(int i)
7683 {
7684     if (bt[i]) printf("*"); else printf(" ");
7685     switch(itype[i]) {
7686       case UJUMP:
7687         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7688       case CJUMP:
7689         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7690       case SJUMP:
7691         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7692       case FJUMP:
7693         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7694       case RJUMP:
7695         if (opcode[i]==0x9&&rt1[i]!=31)
7696           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
7697         else
7698           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7699         break;
7700       case SPAN:
7701         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7702       case IMM16:
7703         if(opcode[i]==0xf) //LUI
7704           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7705         else
7706           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7707         break;
7708       case LOAD:
7709       case LOADLR:
7710         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7711         break;
7712       case STORE:
7713       case STORELR:
7714         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7715         break;
7716       case ALU:
7717       case SHIFT:
7718         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7719         break;
7720       case MULTDIV:
7721         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7722         break;
7723       case SHIFTIMM:
7724         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7725         break;
7726       case MOV:
7727         if((opcode2[i]&0x1d)==0x10)
7728           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7729         else if((opcode2[i]&0x1d)==0x11)
7730           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7731         else
7732           printf (" %x: %s\n",start+i*4,insn[i]);
7733         break;
7734       case COP0:
7735         if(opcode2[i]==0)
7736           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7737         else if(opcode2[i]==4)
7738           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7739         else printf (" %x: %s\n",start+i*4,insn[i]);
7740         break;
7741       case COP1:
7742         if(opcode2[i]<3)
7743           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7744         else if(opcode2[i]>3)
7745           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7746         else printf (" %x: %s\n",start+i*4,insn[i]);
7747         break;
7748       case COP2:
7749         if(opcode2[i]<3)
7750           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7751         else if(opcode2[i]>3)
7752           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7753         else printf (" %x: %s\n",start+i*4,insn[i]);
7754         break;
7755       case C1LS:
7756         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7757         break;
7758       case C2LS:
7759         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7760         break;
7761       case INTCALL:
7762         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7763         break;
7764       default:
7765         //printf (" %s %8x\n",insn[i],source[i]);
7766         printf (" %x: %s\n",start+i*4,insn[i]);
7767     }
7768 }
7769
7770 // clear the state completely, instead of just marking
7771 // things invalid like invalidate_all_pages() does
7772 void new_dynarec_clear_full()
7773 {
7774   int n;
7775   out=(u_char *)BASE_ADDR;
7776   memset(invalid_code,1,sizeof(invalid_code));
7777   memset(hash_table,0xff,sizeof(hash_table));
7778   memset(mini_ht,-1,sizeof(mini_ht));
7779   memset(restore_candidate,0,sizeof(restore_candidate));
7780   memset(shadow,0,sizeof(shadow));
7781   copy=shadow;
7782   expirep=16384; // Expiry pointer, +2 blocks
7783   pending_exception=0;
7784   literalcount=0;
7785   stop_after_jal=0;
7786   // TLB
7787 #ifndef DISABLE_TLB
7788   using_tlb=0;
7789 #endif
7790   sp_in_mirror=0;
7791   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7792     memory_map[n]=-1;
7793   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7794     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7795   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7796     memory_map[n]=-1;
7797   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7798   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7799   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7800 }
7801
7802 void new_dynarec_init()
7803 {
7804   printf("Init new dynarec\n");
7805   out=(u_char *)BASE_ADDR;
7806   if (mmap (out, 1<<TARGET_SIZE_2,
7807             PROT_READ | PROT_WRITE | PROT_EXEC,
7808             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7809             -1, 0) <= 0) {printf("mmap() failed\n");}
7810 #ifdef MUPEN64
7811   rdword=&readmem_dword;
7812   fake_pc.f.r.rs=&readmem_dword;
7813   fake_pc.f.r.rt=&readmem_dword;
7814   fake_pc.f.r.rd=&readmem_dword;
7815 #endif
7816   int n;
7817   new_dynarec_clear_full();
7818 #ifdef HOST_IMM8
7819   // Copy this into local area so we don't have to put it in every literal pool
7820   invc_ptr=invalid_code;
7821 #endif
7822 #ifdef MUPEN64
7823   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
7824     writemem[n] = write_nomem_new;
7825     writememb[n] = write_nomemb_new;
7826     writememh[n] = write_nomemh_new;
7827 #ifndef FORCE32
7828     writememd[n] = write_nomemd_new;
7829 #endif
7830     readmem[n] = read_nomem_new;
7831     readmemb[n] = read_nomemb_new;
7832     readmemh[n] = read_nomemh_new;
7833 #ifndef FORCE32
7834     readmemd[n] = read_nomemd_new;
7835 #endif
7836   }
7837   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
7838     writemem[n] = write_rdram_new;
7839     writememb[n] = write_rdramb_new;
7840     writememh[n] = write_rdramh_new;
7841 #ifndef FORCE32
7842     writememd[n] = write_rdramd_new;
7843 #endif
7844   }
7845   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
7846     writemem[n] = write_nomem_new;
7847     writememb[n] = write_nomemb_new;
7848     writememh[n] = write_nomemh_new;
7849 #ifndef FORCE32
7850     writememd[n] = write_nomemd_new;
7851 #endif
7852     readmem[n] = read_nomem_new;
7853     readmemb[n] = read_nomemb_new;
7854     readmemh[n] = read_nomemh_new;
7855 #ifndef FORCE32
7856     readmemd[n] = read_nomemd_new;
7857 #endif
7858   }
7859 #endif
7860   tlb_hacks();
7861   arch_init();
7862 }
7863
7864 void new_dynarec_cleanup()
7865 {
7866   int n;
7867   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
7868   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7869   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7870   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7871   #ifdef ROM_COPY
7872   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
7873   #endif
7874 }
7875
7876 int new_recompile_block(int addr)
7877 {
7878 /*
7879   if(addr==0x800cd050) {
7880     int block;
7881     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
7882     int n;
7883     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
7884   }
7885 */
7886   //if(Count==365117028) tracedebug=1;
7887   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7888   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7889   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7890   //if(debug) 
7891   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7892   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7893   /*if(Count>=312978186) {
7894     rlist();
7895   }*/
7896   //rlist();
7897   start = (u_int)addr&~3;
7898   //assert(((u_int)addr&1)==0);
7899 #ifdef PCSX
7900   if(!sp_in_mirror&&(signed int)(psxRegs.GPR.n.sp&0xffe00000)>0x80200000&&
7901      0x10000<=psxRegs.GPR.n.sp&&(psxRegs.GPR.n.sp&~0xe0e00000)<RAM_SIZE) {
7902     printf("SP hack enabled (%08x), @%08x\n", psxRegs.GPR.n.sp);
7903     sp_in_mirror=1;
7904   }
7905   if (Config.HLE && start == 0x80001000) // hlecall
7906   {
7907     // XXX: is this enough? Maybe check hleSoftCall?
7908     u_int beginning=(u_int)out;
7909     u_int page=get_page(start);
7910     invalid_code[start>>12]=0;
7911     emit_movimm(start,0);
7912     emit_writeword(0,(int)&pcaddr);
7913     emit_jmp((int)new_dyna_leave);
7914 #ifdef __arm__
7915     __clear_cache((void *)beginning,out);
7916 #endif
7917     ll_add(jump_in+page,start,(void *)beginning);
7918     return 0;
7919   }
7920   else if ((u_int)addr < 0x00200000 ||
7921     (0xa0000000 <= addr && addr < 0xa0200000)) {
7922     // used for BIOS calls mostly?
7923     source = (u_int *)((u_int)rdram+(start&0x1fffff));
7924     pagelimit = (addr&0xa0000000)|0x00200000;
7925   }
7926   else if (!Config.HLE && (
7927 /*    (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7928     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7929     // BIOS
7930     source = (u_int *)((u_int)psxR+(start&0x7ffff));
7931     pagelimit = (addr&0xfff00000)|0x80000;
7932   }
7933   else
7934 #endif
7935 #ifdef MUPEN64
7936   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
7937     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
7938     pagelimit = 0xa4001000;
7939   }
7940   else
7941 #endif
7942   if ((int)addr >= 0x80000000 && (int)addr < 0x80000000+RAM_SIZE) {
7943     source = (u_int *)((u_int)rdram+start-0x80000000);
7944     pagelimit = 0x80000000+RAM_SIZE;
7945   }
7946 #ifndef DISABLE_TLB
7947   else if ((signed int)addr >= (signed int)0xC0000000) {
7948     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
7949     //if(tlb_LUT_r[start>>12])
7950       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
7951     if((signed int)memory_map[start>>12]>=0) {
7952       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
7953       pagelimit=(start+4096)&0xFFFFF000;
7954       int map=memory_map[start>>12];
7955       int i;
7956       for(i=0;i<5;i++) {
7957         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
7958         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
7959       }
7960       assem_debug("pagelimit=%x\n",pagelimit);
7961       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
7962     }
7963     else {
7964       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
7965       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
7966       return -1; // Caller will invoke exception handler
7967     }
7968     //printf("source= %x\n",(int)source);
7969   }
7970 #endif
7971   else {
7972     printf("Compile at bogus memory address: %x \n", (int)addr);
7973     exit(1);
7974   }
7975
7976   /* Pass 1: disassemble */
7977   /* Pass 2: register dependencies, branch targets */
7978   /* Pass 3: register allocation */
7979   /* Pass 4: branch dependencies */
7980   /* Pass 5: pre-alloc */
7981   /* Pass 6: optimize clean/dirty state */
7982   /* Pass 7: flag 32-bit registers */
7983   /* Pass 8: assembly */
7984   /* Pass 9: linker */
7985   /* Pass 10: garbage collection / free memory */
7986
7987   int i,j;
7988   int done=0;
7989   unsigned int type,op,op2;
7990
7991   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7992   
7993   /* Pass 1 disassembly */
7994
7995   for(i=0;!done;i++) {
7996     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7997     minimum_free_regs[i]=0;
7998     opcode[i]=op=source[i]>>26;
7999     switch(op)
8000     {
8001       case 0x00: strcpy(insn[i],"special"); type=NI;
8002         op2=source[i]&0x3f;
8003         switch(op2)
8004         {
8005           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
8006           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
8007           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
8008           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
8009           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
8010           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
8011           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
8012           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
8013           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
8014           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
8015           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
8016           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
8017           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
8018           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
8019           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
8020           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
8021           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
8022           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
8023           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
8024           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
8025           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
8026           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
8027           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
8028           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
8029           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
8030           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
8031           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
8032           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
8033           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
8034           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
8035           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
8036           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
8037           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
8038           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
8039           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
8040 #ifndef FORCE32
8041           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
8042           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
8043           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
8044           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
8045           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
8046           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
8047           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
8048           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
8049           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
8050           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
8051           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
8052           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
8053           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
8054           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
8055           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
8056           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
8057           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
8058 #endif
8059         }
8060         break;
8061       case 0x01: strcpy(insn[i],"regimm"); type=NI;
8062         op2=(source[i]>>16)&0x1f;
8063         switch(op2)
8064         {
8065           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
8066           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
8067           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
8068           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
8069           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
8070           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
8071           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
8072           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
8073           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
8074           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
8075           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
8076           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
8077           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
8078           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
8079         }
8080         break;
8081       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
8082       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
8083       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
8084       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
8085       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
8086       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
8087       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
8088       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
8089       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
8090       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
8091       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
8092       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
8093       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
8094       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
8095       case 0x10: strcpy(insn[i],"cop0"); type=NI;
8096         op2=(source[i]>>21)&0x1f;
8097         switch(op2)
8098         {
8099           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
8100           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
8101           case 0x10: strcpy(insn[i],"tlb"); type=NI;
8102           switch(source[i]&0x3f)
8103           {
8104             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
8105             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
8106             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
8107             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
8108 #ifdef PCSX
8109             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
8110 #else
8111             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
8112 #endif
8113           }
8114         }
8115         break;
8116       case 0x11: strcpy(insn[i],"cop1"); type=NI;
8117         op2=(source[i]>>21)&0x1f;
8118         switch(op2)
8119         {
8120           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
8121           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
8122           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
8123           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
8124           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
8125           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
8126           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
8127           switch((source[i]>>16)&0x3)
8128           {
8129             case 0x00: strcpy(insn[i],"BC1F"); break;
8130             case 0x01: strcpy(insn[i],"BC1T"); break;
8131             case 0x02: strcpy(insn[i],"BC1FL"); break;
8132             case 0x03: strcpy(insn[i],"BC1TL"); break;
8133           }
8134           break;
8135           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
8136           switch(source[i]&0x3f)
8137           {
8138             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
8139             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
8140             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
8141             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
8142             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
8143             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
8144             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
8145             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
8146             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
8147             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
8148             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
8149             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
8150             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
8151             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
8152             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
8153             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
8154             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
8155             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
8156             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
8157             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
8158             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
8159             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
8160             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
8161             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
8162             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
8163             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
8164             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
8165             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
8166             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
8167             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
8168             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
8169             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
8170             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
8171             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
8172             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
8173           }
8174           break;
8175           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
8176           switch(source[i]&0x3f)
8177           {
8178             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
8179             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
8180             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
8181             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
8182             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
8183             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
8184             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
8185             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
8186             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
8187             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
8188             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
8189             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
8190             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
8191             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
8192             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
8193             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
8194             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
8195             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
8196             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
8197             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
8198             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
8199             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
8200             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
8201             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
8202             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
8203             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
8204             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
8205             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
8206             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
8207             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
8208             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
8209             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
8210             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
8211             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
8212             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
8213           }
8214           break;
8215           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
8216           switch(source[i]&0x3f)
8217           {
8218             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
8219             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
8220           }
8221           break;
8222           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
8223           switch(source[i]&0x3f)
8224           {
8225             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
8226             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
8227           }
8228           break;
8229         }
8230         break;
8231 #ifndef FORCE32
8232       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
8233       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
8234       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
8235       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
8236       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
8237       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
8238       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
8239       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
8240 #endif
8241       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
8242       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
8243       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
8244       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
8245       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
8246       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
8247       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
8248       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
8249       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
8250       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
8251       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
8252       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
8253 #ifndef FORCE32
8254       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
8255       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
8256 #endif
8257       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
8258       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
8259       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
8260       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
8261 #ifndef FORCE32
8262       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
8263       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
8264       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
8265 #endif
8266       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
8267       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
8268 #ifndef FORCE32
8269       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
8270       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
8271       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
8272 #endif
8273 #ifdef PCSX
8274       case 0x12: strcpy(insn[i],"COP2"); type=NI;
8275         // note: COP MIPS-1 encoding differs from MIPS32
8276         op2=(source[i]>>21)&0x1f;
8277         if (source[i]&0x3f) {
8278           if (gte_handlers[source[i]&0x3f]!=NULL) {
8279             snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
8280             type=C2OP;
8281           }
8282         }
8283         else switch(op2)
8284         {
8285           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
8286           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
8287           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
8288           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
8289         }
8290         break;
8291       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
8292       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
8293       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
8294 #endif
8295       default: strcpy(insn[i],"???"); type=NI;
8296         printf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
8297         break;
8298     }
8299     itype[i]=type;
8300     opcode2[i]=op2;
8301     /* Get registers/immediates */
8302     lt1[i]=0;
8303     us1[i]=0;
8304     us2[i]=0;
8305     dep1[i]=0;
8306     dep2[i]=0;
8307     switch(type) {
8308       case LOAD:
8309         rs1[i]=(source[i]>>21)&0x1f;
8310         rs2[i]=0;
8311         rt1[i]=(source[i]>>16)&0x1f;
8312         rt2[i]=0;
8313         imm[i]=(short)source[i];
8314         break;
8315       case STORE:
8316       case STORELR:
8317         rs1[i]=(source[i]>>21)&0x1f;
8318         rs2[i]=(source[i]>>16)&0x1f;
8319         rt1[i]=0;
8320         rt2[i]=0;
8321         imm[i]=(short)source[i];
8322         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
8323         break;
8324       case LOADLR:
8325         // LWL/LWR only load part of the register,
8326         // therefore the target register must be treated as a source too
8327         rs1[i]=(source[i]>>21)&0x1f;
8328         rs2[i]=(source[i]>>16)&0x1f;
8329         rt1[i]=(source[i]>>16)&0x1f;
8330         rt2[i]=0;
8331         imm[i]=(short)source[i];
8332         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
8333         if(op==0x26) dep1[i]=rt1[i]; // LWR
8334         break;
8335       case IMM16:
8336         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
8337         else rs1[i]=(source[i]>>21)&0x1f;
8338         rs2[i]=0;
8339         rt1[i]=(source[i]>>16)&0x1f;
8340         rt2[i]=0;
8341         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
8342           imm[i]=(unsigned short)source[i];
8343         }else{
8344           imm[i]=(short)source[i];
8345         }
8346         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
8347         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
8348         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
8349         break;
8350       case UJUMP:
8351         rs1[i]=0;
8352         rs2[i]=0;
8353         rt1[i]=0;
8354         rt2[i]=0;
8355         // The JAL instruction writes to r31.
8356         if (op&1) {
8357           rt1[i]=31;
8358         }
8359         rs2[i]=CCREG;
8360         break;
8361       case RJUMP:
8362         rs1[i]=(source[i]>>21)&0x1f;
8363         rs2[i]=0;
8364         rt1[i]=0;
8365         rt2[i]=0;
8366         // The JALR instruction writes to rd.
8367         if (op2&1) {
8368           rt1[i]=(source[i]>>11)&0x1f;
8369         }
8370         rs2[i]=CCREG;
8371         break;
8372       case CJUMP:
8373         rs1[i]=(source[i]>>21)&0x1f;
8374         rs2[i]=(source[i]>>16)&0x1f;
8375         rt1[i]=0;
8376         rt2[i]=0;
8377         if(op&2) { // BGTZ/BLEZ
8378           rs2[i]=0;
8379         }
8380         us1[i]=rs1[i];
8381         us2[i]=rs2[i];
8382         likely[i]=op>>4;
8383         break;
8384       case SJUMP:
8385         rs1[i]=(source[i]>>21)&0x1f;
8386         rs2[i]=CCREG;
8387         rt1[i]=0;
8388         rt2[i]=0;
8389         us1[i]=rs1[i];
8390         if(op2&0x10) { // BxxAL
8391           rt1[i]=31;
8392           // NOTE: If the branch is not taken, r31 is still overwritten
8393         }
8394         likely[i]=(op2&2)>>1;
8395         break;
8396       case FJUMP:
8397         rs1[i]=FSREG;
8398         rs2[i]=CSREG;
8399         rt1[i]=0;
8400         rt2[i]=0;
8401         likely[i]=((source[i])>>17)&1;
8402         break;
8403       case ALU:
8404         rs1[i]=(source[i]>>21)&0x1f; // source
8405         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
8406         rt1[i]=(source[i]>>11)&0x1f; // destination
8407         rt2[i]=0;
8408         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
8409           us1[i]=rs1[i];us2[i]=rs2[i];
8410         }
8411         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
8412           dep1[i]=rs1[i];dep2[i]=rs2[i];
8413         }
8414         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
8415           dep1[i]=rs1[i];dep2[i]=rs2[i];
8416         }
8417         break;
8418       case MULTDIV:
8419         rs1[i]=(source[i]>>21)&0x1f; // source
8420         rs2[i]=(source[i]>>16)&0x1f; // divisor
8421         rt1[i]=HIREG;
8422         rt2[i]=LOREG;
8423         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
8424           us1[i]=rs1[i];us2[i]=rs2[i];
8425         }
8426         break;
8427       case MOV:
8428         rs1[i]=0;
8429         rs2[i]=0;
8430         rt1[i]=0;
8431         rt2[i]=0;
8432         if(op2==0x10) rs1[i]=HIREG; // MFHI
8433         if(op2==0x11) rt1[i]=HIREG; // MTHI
8434         if(op2==0x12) rs1[i]=LOREG; // MFLO
8435         if(op2==0x13) rt1[i]=LOREG; // MTLO
8436         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
8437         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
8438         dep1[i]=rs1[i];
8439         break;
8440       case SHIFT:
8441         rs1[i]=(source[i]>>16)&0x1f; // target of shift
8442         rs2[i]=(source[i]>>21)&0x1f; // shift amount
8443         rt1[i]=(source[i]>>11)&0x1f; // destination
8444         rt2[i]=0;
8445         // DSLLV/DSRLV/DSRAV are 64-bit
8446         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
8447         break;
8448       case SHIFTIMM:
8449         rs1[i]=(source[i]>>16)&0x1f;
8450         rs2[i]=0;
8451         rt1[i]=(source[i]>>11)&0x1f;
8452         rt2[i]=0;
8453         imm[i]=(source[i]>>6)&0x1f;
8454         // DSxx32 instructions
8455         if(op2>=0x3c) imm[i]|=0x20;
8456         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
8457         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
8458         break;
8459       case COP0:
8460         rs1[i]=0;
8461         rs2[i]=0;
8462         rt1[i]=0;
8463         rt2[i]=0;
8464         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
8465         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
8466         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8467         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8468         break;
8469       case COP1:
8470       case COP2:
8471         rs1[i]=0;
8472         rs2[i]=0;
8473         rt1[i]=0;
8474         rt2[i]=0;
8475         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8476         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8477         if(op2==5) us1[i]=rs1[i]; // DMTC1
8478         rs2[i]=CSREG;
8479         break;
8480       case C1LS:
8481         rs1[i]=(source[i]>>21)&0x1F;
8482         rs2[i]=CSREG;
8483         rt1[i]=0;
8484         rt2[i]=0;
8485         imm[i]=(short)source[i];
8486         break;
8487       case C2LS:
8488         rs1[i]=(source[i]>>21)&0x1F;
8489         rs2[i]=0;
8490         rt1[i]=0;
8491         rt2[i]=0;
8492         imm[i]=(short)source[i];
8493         break;
8494       case FLOAT:
8495       case FCONV:
8496         rs1[i]=0;
8497         rs2[i]=CSREG;
8498         rt1[i]=0;
8499         rt2[i]=0;
8500         break;
8501       case FCOMP:
8502         rs1[i]=FSREG;
8503         rs2[i]=CSREG;
8504         rt1[i]=FSREG;
8505         rt2[i]=0;
8506         break;
8507       case SYSCALL:
8508       case HLECALL:
8509       case INTCALL:
8510         rs1[i]=CCREG;
8511         rs2[i]=0;
8512         rt1[i]=0;
8513         rt2[i]=0;
8514         break;
8515       default:
8516         rs1[i]=0;
8517         rs2[i]=0;
8518         rt1[i]=0;
8519         rt2[i]=0;
8520     }
8521     /* Calculate branch target addresses */
8522     if(type==UJUMP)
8523       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8524     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8525       ba[i]=start+i*4+8; // Ignore never taken branch
8526     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8527       ba[i]=start+i*4+8; // Ignore never taken branch
8528     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8529       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8530     else ba[i]=-1;
8531 #ifdef PCSX
8532     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
8533       int do_in_intrp=0;
8534       // branch in delay slot?
8535       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
8536         // don't handle first branch and call interpreter if it's hit
8537         printf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
8538         do_in_intrp=1;
8539       }
8540       // basic load delay detection
8541       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
8542         int t=(ba[i-1]-start)/4;
8543         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
8544           // jump target wants DS result - potential load delay effect
8545           printf("load delay @%08x (%08x)\n", addr + i*4, addr);
8546           do_in_intrp=1;
8547           bt[t+1]=1; // expected return from interpreter
8548         }
8549         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
8550               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
8551           // v0 overwrite like this is a sign of trouble, bail out
8552           printf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
8553           do_in_intrp=1;
8554         }
8555       }
8556       if(do_in_intrp) {
8557         rs1[i-1]=CCREG;
8558         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
8559         ba[i-1]=-1;
8560         itype[i-1]=INTCALL;
8561         done=2;
8562         i--; // don't compile the DS
8563       }
8564     }
8565 #endif
8566     /* Is this the end of the block? */
8567     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8568       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
8569         done=2;
8570       }
8571       else {
8572         if(stop_after_jal) done=1;
8573         // Stop on BREAK
8574         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8575       }
8576       // Don't recompile stuff that's already compiled
8577       if(check_addr(start+i*4+4)) done=1;
8578       // Don't get too close to the limit
8579       if(i>MAXBLOCK/2) done=1;
8580     }
8581     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
8582     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
8583     if(done==2) {
8584       // Does the block continue due to a branch?
8585       for(j=i-1;j>=0;j--)
8586       {
8587         if(ba[j]==start+i*4+4) done=j=0;
8588         if(ba[j]==start+i*4+8) done=j=0;
8589       }
8590     }
8591     //assert(i<MAXBLOCK-1);
8592     if(start+i*4==pagelimit-4) done=1;
8593     assert(start+i*4<pagelimit);
8594     if (i==MAXBLOCK-1) done=1;
8595     // Stop if we're compiling junk
8596     if(itype[i]==NI&&opcode[i]==0x11) {
8597       done=stop_after_jal=1;
8598       printf("Disabled speculative precompilation\n");
8599     }
8600   }
8601   slen=i;
8602   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8603     if(start+i*4==pagelimit) {
8604       itype[i-1]=SPAN;
8605     }
8606   }
8607   assert(slen>0);
8608
8609   /* Pass 2 - Register dependencies and branch targets */
8610
8611   unneeded_registers(0,slen-1,0);
8612   
8613   /* Pass 3 - Register allocation */
8614
8615   struct regstat current; // Current register allocations/status
8616   current.is32=1;
8617   current.dirty=0;
8618   current.u=unneeded_reg[0];
8619   current.uu=unneeded_reg_upper[0];
8620   clear_all_regs(current.regmap);
8621   alloc_reg(&current,0,CCREG);
8622   dirty_reg(&current,CCREG);
8623   current.isconst=0;
8624   current.wasconst=0;
8625   int ds=0;
8626   int cc=0;
8627   int hr;
8628
8629 #ifndef FORCE32
8630   provisional_32bit();
8631 #endif
8632   if((u_int)addr&1) {
8633     // First instruction is delay slot
8634     cc=-1;
8635     bt[1]=1;
8636     ds=1;
8637     unneeded_reg[0]=1;
8638     unneeded_reg_upper[0]=1;
8639     current.regmap[HOST_BTREG]=BTREG;
8640   }
8641   
8642   for(i=0;i<slen;i++)
8643   {
8644     if(bt[i])
8645     {
8646       int hr;
8647       for(hr=0;hr<HOST_REGS;hr++)
8648       {
8649         // Is this really necessary?
8650         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8651       }
8652       current.isconst=0;
8653     }
8654     if(i>1)
8655     {
8656       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8657       {
8658         if(rs1[i-2]==0||rs2[i-2]==0)
8659         {
8660           if(rs1[i-2]) {
8661             current.is32|=1LL<<rs1[i-2];
8662             int hr=get_reg(current.regmap,rs1[i-2]|64);
8663             if(hr>=0) current.regmap[hr]=-1;
8664           }
8665           if(rs2[i-2]) {
8666             current.is32|=1LL<<rs2[i-2];
8667             int hr=get_reg(current.regmap,rs2[i-2]|64);
8668             if(hr>=0) current.regmap[hr]=-1;
8669           }
8670         }
8671       }
8672     }
8673 #ifndef FORCE32
8674     // If something jumps here with 64-bit values
8675     // then promote those registers to 64 bits
8676     if(bt[i])
8677     {
8678       uint64_t temp_is32=current.is32;
8679       for(j=i-1;j>=0;j--)
8680       {
8681         if(ba[j]==start+i*4) 
8682           temp_is32&=branch_regs[j].is32;
8683       }
8684       for(j=i;j<slen;j++)
8685       {
8686         if(ba[j]==start+i*4) 
8687           //temp_is32=1;
8688           temp_is32&=p32[j];
8689       }
8690       if(temp_is32!=current.is32) {
8691         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8692         #ifdef DESTRUCTIVE_WRITEBACK
8693         for(hr=0;hr<HOST_REGS;hr++)
8694         {
8695           int r=current.regmap[hr];
8696           if(r>0&&r<64)
8697           {
8698             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8699               temp_is32|=1LL<<r;
8700               //printf("restore %d\n",r);
8701             }
8702           }
8703         }
8704         #endif
8705         current.is32=temp_is32;
8706       }
8707     }
8708 #else
8709     current.is32=-1LL;
8710 #endif
8711
8712     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8713     regs[i].wasconst=current.isconst;
8714     regs[i].was32=current.is32;
8715     regs[i].wasdirty=current.dirty;
8716     #if defined(DESTRUCTIVE_WRITEBACK) && !defined(FORCE32)
8717     // To change a dirty register from 32 to 64 bits, we must write
8718     // it out during the previous cycle (for branches, 2 cycles)
8719     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8720     {
8721       uint64_t temp_is32=current.is32;
8722       for(j=i-1;j>=0;j--)
8723       {
8724         if(ba[j]==start+i*4+4) 
8725           temp_is32&=branch_regs[j].is32;
8726       }
8727       for(j=i;j<slen;j++)
8728       {
8729         if(ba[j]==start+i*4+4) 
8730           //temp_is32=1;
8731           temp_is32&=p32[j];
8732       }
8733       if(temp_is32!=current.is32) {
8734         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8735         for(hr=0;hr<HOST_REGS;hr++)
8736         {
8737           int r=current.regmap[hr];
8738           if(r>0)
8739           {
8740             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8741               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
8742               {
8743                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
8744                 {
8745                   //printf("dump %d/r%d\n",hr,r);
8746                   current.regmap[hr]=-1;
8747                   if(get_reg(current.regmap,r|64)>=0) 
8748                     current.regmap[get_reg(current.regmap,r|64)]=-1;
8749                 }
8750               }
8751             }
8752           }
8753         }
8754       }
8755     }
8756     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
8757     {
8758       uint64_t temp_is32=current.is32;
8759       for(j=i-1;j>=0;j--)
8760       {
8761         if(ba[j]==start+i*4+8) 
8762           temp_is32&=branch_regs[j].is32;
8763       }
8764       for(j=i;j<slen;j++)
8765       {
8766         if(ba[j]==start+i*4+8) 
8767           //temp_is32=1;
8768           temp_is32&=p32[j];
8769       }
8770       if(temp_is32!=current.is32) {
8771         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8772         for(hr=0;hr<HOST_REGS;hr++)
8773         {
8774           int r=current.regmap[hr];
8775           if(r>0)
8776           {
8777             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8778               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
8779               {
8780                 //printf("dump %d/r%d\n",hr,r);
8781                 current.regmap[hr]=-1;
8782                 if(get_reg(current.regmap,r|64)>=0) 
8783                   current.regmap[get_reg(current.regmap,r|64)]=-1;
8784               }
8785             }
8786           }
8787         }
8788       }
8789     }
8790     #endif
8791     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8792       if(i+1<slen) {
8793         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8794         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8795         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8796         current.u|=1;
8797         current.uu|=1;
8798       } else {
8799         current.u=1;
8800         current.uu=1;
8801       }
8802     } else {
8803       if(i+1<slen) {
8804         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8805         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8806         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8807         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8808         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8809         current.u|=1;
8810         current.uu|=1;
8811       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
8812     }
8813     is_ds[i]=ds;
8814     if(ds) {
8815       ds=0; // Skip delay slot, already allocated as part of branch
8816       // ...but we need to alloc it in case something jumps here
8817       if(i+1<slen) {
8818         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8819         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8820       }else{
8821         current.u=branch_unneeded_reg[i-1];
8822         current.uu=branch_unneeded_reg_upper[i-1];
8823       }
8824       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8825       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8826       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8827       current.u|=1;
8828       current.uu|=1;
8829       struct regstat temp;
8830       memcpy(&temp,&current,sizeof(current));
8831       temp.wasdirty=temp.dirty;
8832       temp.was32=temp.is32;
8833       // TODO: Take into account unconditional branches, as below
8834       delayslot_alloc(&temp,i);
8835       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8836       regs[i].wasdirty=temp.wasdirty;
8837       regs[i].was32=temp.was32;
8838       regs[i].dirty=temp.dirty;
8839       regs[i].is32=temp.is32;
8840       regs[i].isconst=0;
8841       regs[i].wasconst=0;
8842       current.isconst=0;
8843       // Create entry (branch target) regmap
8844       for(hr=0;hr<HOST_REGS;hr++)
8845       {
8846         int r=temp.regmap[hr];
8847         if(r>=0) {
8848           if(r!=regmap_pre[i][hr]) {
8849             regs[i].regmap_entry[hr]=-1;
8850           }
8851           else
8852           {
8853             if(r<64){
8854               if((current.u>>r)&1) {
8855                 regs[i].regmap_entry[hr]=-1;
8856                 regs[i].regmap[hr]=-1;
8857                 //Don't clear regs in the delay slot as the branch might need them
8858                 //current.regmap[hr]=-1;
8859               }else
8860                 regs[i].regmap_entry[hr]=r;
8861             }
8862             else {
8863               if((current.uu>>(r&63))&1) {
8864                 regs[i].regmap_entry[hr]=-1;
8865                 regs[i].regmap[hr]=-1;
8866                 //Don't clear regs in the delay slot as the branch might need them
8867                 //current.regmap[hr]=-1;
8868               }else
8869                 regs[i].regmap_entry[hr]=r;
8870             }
8871           }
8872         } else {
8873           // First instruction expects CCREG to be allocated
8874           if(i==0&&hr==HOST_CCREG) 
8875             regs[i].regmap_entry[hr]=CCREG;
8876           else
8877             regs[i].regmap_entry[hr]=-1;
8878         }
8879       }
8880     }
8881     else { // Not delay slot
8882       switch(itype[i]) {
8883         case UJUMP:
8884           //current.isconst=0; // DEBUG
8885           //current.wasconst=0; // DEBUG
8886           //regs[i].wasconst=0; // DEBUG
8887           clear_const(&current,rt1[i]);
8888           alloc_cc(&current,i);
8889           dirty_reg(&current,CCREG);
8890           ooo[i]=1;
8891           delayslot_alloc(&current,i+1);
8892           if (rt1[i]==31) {
8893             alloc_reg(&current,i,31);
8894             dirty_reg(&current,31);
8895             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8896             //assert(rt1[i+1]!=rt1[i]);
8897             #ifdef REG_PREFETCH
8898             alloc_reg(&current,i,PTEMP);
8899             #endif
8900             //current.is32|=1LL<<rt1[i];
8901           }
8902           //current.isconst=0; // DEBUG
8903           ds=1;
8904           //printf("i=%d, isconst=%x\n",i,current.isconst);
8905           break;
8906         case RJUMP:
8907           //current.isconst=0;
8908           //current.wasconst=0;
8909           //regs[i].wasconst=0;
8910           clear_const(&current,rs1[i]);
8911           clear_const(&current,rt1[i]);
8912           alloc_cc(&current,i);
8913           dirty_reg(&current,CCREG);
8914           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8915             alloc_reg(&current,i,rs1[i]);
8916             if (rt1[i]!=0) {
8917               alloc_reg(&current,i,rt1[i]);
8918               dirty_reg(&current,rt1[i]);
8919               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8920               assert(rt1[i+1]!=rt1[i]);
8921               #ifdef REG_PREFETCH
8922               alloc_reg(&current,i,PTEMP);
8923               #endif
8924             }
8925             #ifdef USE_MINI_HT
8926             if(rs1[i]==31) { // JALR
8927               alloc_reg(&current,i,RHASH);
8928               #ifndef HOST_IMM_ADDR32
8929               alloc_reg(&current,i,RHTBL);
8930               #endif
8931             }
8932             #endif
8933             delayslot_alloc(&current,i+1);
8934           } else {
8935             // The delay slot overwrites our source register,
8936             // allocate a temporary register to hold the old value.
8937             current.isconst=0;
8938             current.wasconst=0;
8939             regs[i].wasconst=0;
8940             delayslot_alloc(&current,i+1);
8941             current.isconst=0;
8942             alloc_reg(&current,i,RTEMP);
8943           }
8944           //current.isconst=0; // DEBUG
8945           ooo[i]=1;
8946           ds=1;
8947           break;
8948         case CJUMP:
8949           //current.isconst=0;
8950           //current.wasconst=0;
8951           //regs[i].wasconst=0;
8952           clear_const(&current,rs1[i]);
8953           clear_const(&current,rs2[i]);
8954           if((opcode[i]&0x3E)==4) // BEQ/BNE
8955           {
8956             alloc_cc(&current,i);
8957             dirty_reg(&current,CCREG);
8958             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8959             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8960             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8961             {
8962               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8963               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8964             }
8965             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8966                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8967               // The delay slot overwrites one of our conditions.
8968               // Allocate the branch condition registers instead.
8969               current.isconst=0;
8970               current.wasconst=0;
8971               regs[i].wasconst=0;
8972               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8973               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8974               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8975               {
8976                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8977                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8978               }
8979             }
8980             else
8981             {
8982               ooo[i]=1;
8983               delayslot_alloc(&current,i+1);
8984             }
8985           }
8986           else
8987           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8988           {
8989             alloc_cc(&current,i);
8990             dirty_reg(&current,CCREG);
8991             alloc_reg(&current,i,rs1[i]);
8992             if(!(current.is32>>rs1[i]&1))
8993             {
8994               alloc_reg64(&current,i,rs1[i]);
8995             }
8996             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8997               // The delay slot overwrites one of our conditions.
8998               // Allocate the branch condition registers instead.
8999               current.isconst=0;
9000               current.wasconst=0;
9001               regs[i].wasconst=0;
9002               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9003               if(!((current.is32>>rs1[i])&1))
9004               {
9005                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9006               }
9007             }
9008             else
9009             {
9010               ooo[i]=1;
9011               delayslot_alloc(&current,i+1);
9012             }
9013           }
9014           else
9015           // Don't alloc the delay slot yet because we might not execute it
9016           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
9017           {
9018             current.isconst=0;
9019             current.wasconst=0;
9020             regs[i].wasconst=0;
9021             alloc_cc(&current,i);
9022             dirty_reg(&current,CCREG);
9023             alloc_reg(&current,i,rs1[i]);
9024             alloc_reg(&current,i,rs2[i]);
9025             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9026             {
9027               alloc_reg64(&current,i,rs1[i]);
9028               alloc_reg64(&current,i,rs2[i]);
9029             }
9030           }
9031           else
9032           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
9033           {
9034             current.isconst=0;
9035             current.wasconst=0;
9036             regs[i].wasconst=0;
9037             alloc_cc(&current,i);
9038             dirty_reg(&current,CCREG);
9039             alloc_reg(&current,i,rs1[i]);
9040             if(!(current.is32>>rs1[i]&1))
9041             {
9042               alloc_reg64(&current,i,rs1[i]);
9043             }
9044           }
9045           ds=1;
9046           //current.isconst=0;
9047           break;
9048         case SJUMP:
9049           //current.isconst=0;
9050           //current.wasconst=0;
9051           //regs[i].wasconst=0;
9052           clear_const(&current,rs1[i]);
9053           clear_const(&current,rt1[i]);
9054           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
9055           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
9056           {
9057             alloc_cc(&current,i);
9058             dirty_reg(&current,CCREG);
9059             alloc_reg(&current,i,rs1[i]);
9060             if(!(current.is32>>rs1[i]&1))
9061             {
9062               alloc_reg64(&current,i,rs1[i]);
9063             }
9064             if (rt1[i]==31) { // BLTZAL/BGEZAL
9065               alloc_reg(&current,i,31);
9066               dirty_reg(&current,31);
9067               //#ifdef REG_PREFETCH
9068               //alloc_reg(&current,i,PTEMP);
9069               //#endif
9070               //current.is32|=1LL<<rt1[i];
9071             }
9072             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
9073                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
9074               // Allocate the branch condition registers instead.
9075               current.isconst=0;
9076               current.wasconst=0;
9077               regs[i].wasconst=0;
9078               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9079               if(!((current.is32>>rs1[i])&1))
9080               {
9081                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9082               }
9083             }
9084             else
9085             {
9086               ooo[i]=1;
9087               delayslot_alloc(&current,i+1);
9088             }
9089           }
9090           else
9091           // Don't alloc the delay slot yet because we might not execute it
9092           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
9093           {
9094             current.isconst=0;
9095             current.wasconst=0;
9096             regs[i].wasconst=0;
9097             alloc_cc(&current,i);
9098             dirty_reg(&current,CCREG);
9099             alloc_reg(&current,i,rs1[i]);
9100             if(!(current.is32>>rs1[i]&1))
9101             {
9102               alloc_reg64(&current,i,rs1[i]);
9103             }
9104           }
9105           ds=1;
9106           //current.isconst=0;
9107           break;
9108         case FJUMP:
9109           current.isconst=0;
9110           current.wasconst=0;
9111           regs[i].wasconst=0;
9112           if(likely[i]==0) // BC1F/BC1T
9113           {
9114             // TODO: Theoretically we can run out of registers here on x86.
9115             // The delay slot can allocate up to six, and we need to check
9116             // CSREG before executing the delay slot.  Possibly we can drop
9117             // the cycle count and then reload it after checking that the
9118             // FPU is in a usable state, or don't do out-of-order execution.
9119             alloc_cc(&current,i);
9120             dirty_reg(&current,CCREG);
9121             alloc_reg(&current,i,FSREG);
9122             alloc_reg(&current,i,CSREG);
9123             if(itype[i+1]==FCOMP) {
9124               // The delay slot overwrites the branch condition.
9125               // Allocate the branch condition registers instead.
9126               alloc_cc(&current,i);
9127               dirty_reg(&current,CCREG);
9128               alloc_reg(&current,i,CSREG);
9129               alloc_reg(&current,i,FSREG);
9130             }
9131             else {
9132               ooo[i]=1;
9133               delayslot_alloc(&current,i+1);
9134               alloc_reg(&current,i+1,CSREG);
9135             }
9136           }
9137           else
9138           // Don't alloc the delay slot yet because we might not execute it
9139           if(likely[i]) // BC1FL/BC1TL
9140           {
9141             alloc_cc(&current,i);
9142             dirty_reg(&current,CCREG);
9143             alloc_reg(&current,i,CSREG);
9144             alloc_reg(&current,i,FSREG);
9145           }
9146           ds=1;
9147           current.isconst=0;
9148           break;
9149         case IMM16:
9150           imm16_alloc(&current,i);
9151           break;
9152         case LOAD:
9153         case LOADLR:
9154           load_alloc(&current,i);
9155           break;
9156         case STORE:
9157         case STORELR:
9158           store_alloc(&current,i);
9159           break;
9160         case ALU:
9161           alu_alloc(&current,i);
9162           break;
9163         case SHIFT:
9164           shift_alloc(&current,i);
9165           break;
9166         case MULTDIV:
9167           multdiv_alloc(&current,i);
9168           break;
9169         case SHIFTIMM:
9170           shiftimm_alloc(&current,i);
9171           break;
9172         case MOV:
9173           mov_alloc(&current,i);
9174           break;
9175         case COP0:
9176           cop0_alloc(&current,i);
9177           break;
9178         case COP1:
9179         case COP2:
9180           cop1_alloc(&current,i);
9181           break;
9182         case C1LS:
9183           c1ls_alloc(&current,i);
9184           break;
9185         case C2LS:
9186           c2ls_alloc(&current,i);
9187           break;
9188         case C2OP:
9189           c2op_alloc(&current,i);
9190           break;
9191         case FCONV:
9192           fconv_alloc(&current,i);
9193           break;
9194         case FLOAT:
9195           float_alloc(&current,i);
9196           break;
9197         case FCOMP:
9198           fcomp_alloc(&current,i);
9199           break;
9200         case SYSCALL:
9201         case HLECALL:
9202         case INTCALL:
9203           syscall_alloc(&current,i);
9204           break;
9205         case SPAN:
9206           pagespan_alloc(&current,i);
9207           break;
9208       }
9209       
9210       // Drop the upper half of registers that have become 32-bit
9211       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
9212       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9213         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9214         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9215         current.uu|=1;
9216       } else {
9217         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
9218         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9219         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9220         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9221         current.uu|=1;
9222       }
9223
9224       // Create entry (branch target) regmap
9225       for(hr=0;hr<HOST_REGS;hr++)
9226       {
9227         int r,or,er;
9228         r=current.regmap[hr];
9229         if(r>=0) {
9230           if(r!=regmap_pre[i][hr]) {
9231             // TODO: delay slot (?)
9232             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
9233             if(or<0||(r&63)>=TEMPREG){
9234               regs[i].regmap_entry[hr]=-1;
9235             }
9236             else
9237             {
9238               // Just move it to a different register
9239               regs[i].regmap_entry[hr]=r;
9240               // If it was dirty before, it's still dirty
9241               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
9242             }
9243           }
9244           else
9245           {
9246             // Unneeded
9247             if(r==0){
9248               regs[i].regmap_entry[hr]=0;
9249             }
9250             else
9251             if(r<64){
9252               if((current.u>>r)&1) {
9253                 regs[i].regmap_entry[hr]=-1;
9254                 //regs[i].regmap[hr]=-1;
9255                 current.regmap[hr]=-1;
9256               }else
9257                 regs[i].regmap_entry[hr]=r;
9258             }
9259             else {
9260               if((current.uu>>(r&63))&1) {
9261                 regs[i].regmap_entry[hr]=-1;
9262                 //regs[i].regmap[hr]=-1;
9263                 current.regmap[hr]=-1;
9264               }else
9265                 regs[i].regmap_entry[hr]=r;
9266             }
9267           }
9268         } else {
9269           // Branches expect CCREG to be allocated at the target
9270           if(regmap_pre[i][hr]==CCREG) 
9271             regs[i].regmap_entry[hr]=CCREG;
9272           else
9273             regs[i].regmap_entry[hr]=-1;
9274         }
9275       }
9276       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
9277     }
9278     /* Branch post-alloc */
9279     if(i>0)
9280     {
9281       current.was32=current.is32;
9282       current.wasdirty=current.dirty;
9283       switch(itype[i-1]) {
9284         case UJUMP:
9285           memcpy(&branch_regs[i-1],&current,sizeof(current));
9286           branch_regs[i-1].isconst=0;
9287           branch_regs[i-1].wasconst=0;
9288           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9289           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9290           alloc_cc(&branch_regs[i-1],i-1);
9291           dirty_reg(&branch_regs[i-1],CCREG);
9292           if(rt1[i-1]==31) { // JAL
9293             alloc_reg(&branch_regs[i-1],i-1,31);
9294             dirty_reg(&branch_regs[i-1],31);
9295             branch_regs[i-1].is32|=1LL<<31;
9296           }
9297           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9298           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9299           break;
9300         case RJUMP:
9301           memcpy(&branch_regs[i-1],&current,sizeof(current));
9302           branch_regs[i-1].isconst=0;
9303           branch_regs[i-1].wasconst=0;
9304           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9305           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9306           alloc_cc(&branch_regs[i-1],i-1);
9307           dirty_reg(&branch_regs[i-1],CCREG);
9308           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
9309           if(rt1[i-1]!=0) { // JALR
9310             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
9311             dirty_reg(&branch_regs[i-1],rt1[i-1]);
9312             branch_regs[i-1].is32|=1LL<<rt1[i-1];
9313           }
9314           #ifdef USE_MINI_HT
9315           if(rs1[i-1]==31) { // JALR
9316             alloc_reg(&branch_regs[i-1],i-1,RHASH);
9317             #ifndef HOST_IMM_ADDR32
9318             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
9319             #endif
9320           }
9321           #endif
9322           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9323           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9324           break;
9325         case CJUMP:
9326           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
9327           {
9328             alloc_cc(&current,i-1);
9329             dirty_reg(&current,CCREG);
9330             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
9331                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
9332               // The delay slot overwrote one of our conditions
9333               // Delay slot goes after the test (in order)
9334               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9335               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9336               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9337               current.u|=1;
9338               current.uu|=1;
9339               delayslot_alloc(&current,i);
9340               current.isconst=0;
9341             }
9342             else
9343             {
9344               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9345               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9346               // Alloc the branch condition registers
9347               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
9348               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
9349               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
9350               {
9351                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
9352                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
9353               }
9354             }
9355             memcpy(&branch_regs[i-1],&current,sizeof(current));
9356             branch_regs[i-1].isconst=0;
9357             branch_regs[i-1].wasconst=0;
9358             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9359             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9360           }
9361           else
9362           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
9363           {
9364             alloc_cc(&current,i-1);
9365             dirty_reg(&current,CCREG);
9366             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9367               // The delay slot overwrote the branch condition
9368               // Delay slot goes after the test (in order)
9369               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9370               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9371               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9372               current.u|=1;
9373               current.uu|=1;
9374               delayslot_alloc(&current,i);
9375               current.isconst=0;
9376             }
9377             else
9378             {
9379               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9380               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9381               // Alloc the branch condition register
9382               alloc_reg(&current,i-1,rs1[i-1]);
9383               if(!(current.is32>>rs1[i-1]&1))
9384               {
9385                 alloc_reg64(&current,i-1,rs1[i-1]);
9386               }
9387             }
9388             memcpy(&branch_regs[i-1],&current,sizeof(current));
9389             branch_regs[i-1].isconst=0;
9390             branch_regs[i-1].wasconst=0;
9391             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9392             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9393           }
9394           else
9395           // Alloc the delay slot in case the branch is taken
9396           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
9397           {
9398             memcpy(&branch_regs[i-1],&current,sizeof(current));
9399             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9400             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9401             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9402             alloc_cc(&branch_regs[i-1],i);
9403             dirty_reg(&branch_regs[i-1],CCREG);
9404             delayslot_alloc(&branch_regs[i-1],i);
9405             branch_regs[i-1].isconst=0;
9406             alloc_reg(&current,i,CCREG); // Not taken path
9407             dirty_reg(&current,CCREG);
9408             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9409           }
9410           else
9411           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
9412           {
9413             memcpy(&branch_regs[i-1],&current,sizeof(current));
9414             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9415             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9416             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9417             alloc_cc(&branch_regs[i-1],i);
9418             dirty_reg(&branch_regs[i-1],CCREG);
9419             delayslot_alloc(&branch_regs[i-1],i);
9420             branch_regs[i-1].isconst=0;
9421             alloc_reg(&current,i,CCREG); // Not taken path
9422             dirty_reg(&current,CCREG);
9423             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9424           }
9425           break;
9426         case SJUMP:
9427           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
9428           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
9429           {
9430             alloc_cc(&current,i-1);
9431             dirty_reg(&current,CCREG);
9432             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9433               // The delay slot overwrote the branch condition
9434               // Delay slot goes after the test (in order)
9435               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9436               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9437               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9438               current.u|=1;
9439               current.uu|=1;
9440               delayslot_alloc(&current,i);
9441               current.isconst=0;
9442             }
9443             else
9444             {
9445               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9446               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9447               // Alloc the branch condition register
9448               alloc_reg(&current,i-1,rs1[i-1]);
9449               if(!(current.is32>>rs1[i-1]&1))
9450               {
9451                 alloc_reg64(&current,i-1,rs1[i-1]);
9452               }
9453             }
9454             memcpy(&branch_regs[i-1],&current,sizeof(current));
9455             branch_regs[i-1].isconst=0;
9456             branch_regs[i-1].wasconst=0;
9457             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9458             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9459           }
9460           else
9461           // Alloc the delay slot in case the branch is taken
9462           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
9463           {
9464             memcpy(&branch_regs[i-1],&current,sizeof(current));
9465             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9466             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9467             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9468             alloc_cc(&branch_regs[i-1],i);
9469             dirty_reg(&branch_regs[i-1],CCREG);
9470             delayslot_alloc(&branch_regs[i-1],i);
9471             branch_regs[i-1].isconst=0;
9472             alloc_reg(&current,i,CCREG); // Not taken path
9473             dirty_reg(&current,CCREG);
9474             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9475           }
9476           // FIXME: BLTZAL/BGEZAL
9477           if(opcode2[i-1]&0x10) { // BxxZAL
9478             alloc_reg(&branch_regs[i-1],i-1,31);
9479             dirty_reg(&branch_regs[i-1],31);
9480             branch_regs[i-1].is32|=1LL<<31;
9481           }
9482           break;
9483         case FJUMP:
9484           if(likely[i-1]==0) // BC1F/BC1T
9485           {
9486             alloc_cc(&current,i-1);
9487             dirty_reg(&current,CCREG);
9488             if(itype[i]==FCOMP) {
9489               // The delay slot overwrote the branch condition
9490               // Delay slot goes after the test (in order)
9491               delayslot_alloc(&current,i);
9492               current.isconst=0;
9493             }
9494             else
9495             {
9496               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9497               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9498               // Alloc the branch condition register
9499               alloc_reg(&current,i-1,FSREG);
9500             }
9501             memcpy(&branch_regs[i-1],&current,sizeof(current));
9502             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9503           }
9504           else // BC1FL/BC1TL
9505           {
9506             // Alloc the delay slot in case the branch is taken
9507             memcpy(&branch_regs[i-1],&current,sizeof(current));
9508             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9509             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9510             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9511             alloc_cc(&branch_regs[i-1],i);
9512             dirty_reg(&branch_regs[i-1],CCREG);
9513             delayslot_alloc(&branch_regs[i-1],i);
9514             branch_regs[i-1].isconst=0;
9515             alloc_reg(&current,i,CCREG); // Not taken path
9516             dirty_reg(&current,CCREG);
9517             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9518           }
9519           break;
9520       }
9521
9522       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
9523       {
9524         if(rt1[i-1]==31) // JAL/JALR
9525         {
9526           // Subroutine call will return here, don't alloc any registers
9527           current.is32=1;
9528           current.dirty=0;
9529           clear_all_regs(current.regmap);
9530           alloc_reg(&current,i,CCREG);
9531           dirty_reg(&current,CCREG);
9532         }
9533         else if(i+1<slen)
9534         {
9535           // Internal branch will jump here, match registers to caller
9536           current.is32=0x3FFFFFFFFLL;
9537           current.dirty=0;
9538           clear_all_regs(current.regmap);
9539           alloc_reg(&current,i,CCREG);
9540           dirty_reg(&current,CCREG);
9541           for(j=i-1;j>=0;j--)
9542           {
9543             if(ba[j]==start+i*4+4) {
9544               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9545               current.is32=branch_regs[j].is32;
9546               current.dirty=branch_regs[j].dirty;
9547               break;
9548             }
9549           }
9550           while(j>=0) {
9551             if(ba[j]==start+i*4+4) {
9552               for(hr=0;hr<HOST_REGS;hr++) {
9553                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9554                   current.regmap[hr]=-1;
9555                 }
9556                 current.is32&=branch_regs[j].is32;
9557                 current.dirty&=branch_regs[j].dirty;
9558               }
9559             }
9560             j--;
9561           }
9562         }
9563       }
9564     }
9565
9566     // Count cycles in between branches
9567     ccadj[i]=cc;
9568     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
9569     {
9570       cc=0;
9571     }
9572 #ifdef PCSX
9573     else if(/*itype[i]==LOAD||*/itype[i]==STORE||itype[i]==C1LS) // load causes weird timing issues
9574     {
9575       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
9576     }
9577     else if(itype[i]==C2LS)
9578     {
9579       cc+=4;
9580     }
9581 #endif
9582     else
9583     {
9584       cc++;
9585     }
9586
9587     flush_dirty_uppers(&current);
9588     if(!is_ds[i]) {
9589       regs[i].is32=current.is32;
9590       regs[i].dirty=current.dirty;
9591       regs[i].isconst=current.isconst;
9592       memcpy(constmap[i],current.constmap,sizeof(current.constmap));
9593     }
9594     for(hr=0;hr<HOST_REGS;hr++) {
9595       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9596         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9597           regs[i].wasconst&=~(1<<hr);
9598         }
9599       }
9600     }
9601     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9602   }
9603   
9604   /* Pass 4 - Cull unused host registers */
9605   
9606   uint64_t nr=0;
9607   
9608   for (i=slen-1;i>=0;i--)
9609   {
9610     int hr;
9611     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9612     {
9613       if(ba[i]<start || ba[i]>=(start+slen*4))
9614       {
9615         // Branch out of this block, don't need anything
9616         nr=0;
9617       }
9618       else
9619       {
9620         // Internal branch
9621         // Need whatever matches the target
9622         nr=0;
9623         int t=(ba[i]-start)>>2;
9624         for(hr=0;hr<HOST_REGS;hr++)
9625         {
9626           if(regs[i].regmap_entry[hr]>=0) {
9627             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9628           }
9629         }
9630       }
9631       // Conditional branch may need registers for following instructions
9632       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9633       {
9634         if(i<slen-2) {
9635           nr|=needed_reg[i+2];
9636           for(hr=0;hr<HOST_REGS;hr++)
9637           {
9638             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9639             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9640           }
9641         }
9642       }
9643       // Don't need stuff which is overwritten
9644       if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9645       if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9646       // Merge in delay slot
9647       for(hr=0;hr<HOST_REGS;hr++)
9648       {
9649         if(!likely[i]) {
9650           // These are overwritten unless the branch is "likely"
9651           // and the delay slot is nullified if not taken
9652           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9653           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9654         }
9655         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9656         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9657         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9658         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9659         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9660         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9661         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9662         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9663         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9664           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9665           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9666         }
9667         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9668           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9669           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9670         }
9671         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
9672           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9673           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9674         }
9675       }
9676     }
9677     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
9678     {
9679       // SYSCALL instruction (software interrupt)
9680       nr=0;
9681     }
9682     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9683     {
9684       // ERET instruction (return from interrupt)
9685       nr=0;
9686     }
9687     else // Non-branch
9688     {
9689       if(i<slen-1) {
9690         for(hr=0;hr<HOST_REGS;hr++) {
9691           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9692           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9693           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9694           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9695         }
9696       }
9697     }
9698     for(hr=0;hr<HOST_REGS;hr++)
9699     {
9700       // Overwritten registers are not needed
9701       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9702       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9703       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9704       // Source registers are needed
9705       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9706       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9707       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9708       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9709       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9710       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9711       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9712       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9713       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9714         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9715         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9716       }
9717       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9718         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9719         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9720       }
9721       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
9722         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9723         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9724       }
9725       // Don't store a register immediately after writing it,
9726       // may prevent dual-issue.
9727       // But do so if this is a branch target, otherwise we
9728       // might have to load the register before the branch.
9729       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9730         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9731            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9732           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9733           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9734         }
9735         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9736            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9737           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9738           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9739         }
9740       }
9741     }
9742     // Cycle count is needed at branches.  Assume it is needed at the target too.
9743     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9744       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9745       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9746     }
9747     // Save it
9748     needed_reg[i]=nr;
9749     
9750     // Deallocate unneeded registers
9751     for(hr=0;hr<HOST_REGS;hr++)
9752     {
9753       if(!((nr>>hr)&1)) {
9754         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9755         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9756            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9757            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9758         {
9759           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9760           {
9761             if(likely[i]) {
9762               regs[i].regmap[hr]=-1;
9763               regs[i].isconst&=~(1<<hr);
9764               if(i<slen-2) regmap_pre[i+2][hr]=-1;
9765             }
9766           }
9767         }
9768         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9769         {
9770           int d1=0,d2=0,map=0,temp=0;
9771           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9772           {
9773             d1=dep1[i+1];
9774             d2=dep2[i+1];
9775           }
9776           if(using_tlb) {
9777             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
9778                itype[i+1]==STORE || itype[i+1]==STORELR ||
9779                itype[i+1]==C1LS || itype[i+1]==C2LS)
9780             map=TLREG;
9781           } else
9782           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
9783              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9784             map=INVCP;
9785           }
9786           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9787              itype[i+1]==C1LS || itype[i+1]==C2LS)
9788             temp=FTEMP;
9789           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9790              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9791              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9792              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9793              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9794              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9795              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9796              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9797              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9798              regs[i].regmap[hr]!=map )
9799           {
9800             regs[i].regmap[hr]=-1;
9801             regs[i].isconst&=~(1<<hr);
9802             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9803                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9804                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9805                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9806                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9807                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9808                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9809                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9810                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9811                branch_regs[i].regmap[hr]!=map)
9812             {
9813               branch_regs[i].regmap[hr]=-1;
9814               branch_regs[i].regmap_entry[hr]=-1;
9815               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9816               {
9817                 if(!likely[i]&&i<slen-2) {
9818                   regmap_pre[i+2][hr]=-1;
9819                 }
9820               }
9821             }
9822           }
9823         }
9824         else
9825         {
9826           // Non-branch
9827           if(i>0)
9828           {
9829             int d1=0,d2=0,map=-1,temp=-1;
9830             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9831             {
9832               d1=dep1[i];
9833               d2=dep2[i];
9834             }
9835             if(using_tlb) {
9836               if(itype[i]==LOAD || itype[i]==LOADLR ||
9837                  itype[i]==STORE || itype[i]==STORELR ||
9838                  itype[i]==C1LS || itype[i]==C2LS)
9839               map=TLREG;
9840             } else if(itype[i]==STORE || itype[i]==STORELR ||
9841                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9842               map=INVCP;
9843             }
9844             if(itype[i]==LOADLR || itype[i]==STORELR ||
9845                itype[i]==C1LS || itype[i]==C2LS)
9846               temp=FTEMP;
9847             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9848                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9849                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9850                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9851                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9852                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9853             {
9854               if(i<slen-1&&!is_ds[i]) {
9855                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9856                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9857                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9858                 {
9859                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9860                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9861                 }
9862                 regmap_pre[i+1][hr]=-1;
9863                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9864               }
9865               regs[i].regmap[hr]=-1;
9866               regs[i].isconst&=~(1<<hr);
9867             }
9868           }
9869         }
9870       }
9871     }
9872   }
9873   
9874   /* Pass 5 - Pre-allocate registers */
9875   
9876   // If a register is allocated during a loop, try to allocate it for the
9877   // entire loop, if possible.  This avoids loading/storing registers
9878   // inside of the loop.
9879
9880   signed char f_regmap[HOST_REGS];
9881   clear_all_regs(f_regmap);
9882   for(i=0;i<slen-1;i++)
9883   {
9884     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9885     {
9886       if(ba[i]>=start && ba[i]<(start+i*4)) 
9887       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9888       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9889       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9890       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9891       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9892       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9893       {
9894         int t=(ba[i]-start)>>2;
9895         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9896         if(t<2||(itype[t-2]!=UJUMP)) // call/ret assumes no registers allocated
9897         for(hr=0;hr<HOST_REGS;hr++)
9898         {
9899           if(regs[i].regmap[hr]>64) {
9900             if(!((regs[i].dirty>>hr)&1))
9901               f_regmap[hr]=regs[i].regmap[hr];
9902             else f_regmap[hr]=-1;
9903           }
9904           else if(regs[i].regmap[hr]>=0) {
9905             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9906               // dealloc old register
9907               int n;
9908               for(n=0;n<HOST_REGS;n++)
9909               {
9910                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9911               }
9912               // and alloc new one
9913               f_regmap[hr]=regs[i].regmap[hr];
9914             }
9915           }
9916           if(branch_regs[i].regmap[hr]>64) {
9917             if(!((branch_regs[i].dirty>>hr)&1))
9918               f_regmap[hr]=branch_regs[i].regmap[hr];
9919             else f_regmap[hr]=-1;
9920           }
9921           else if(branch_regs[i].regmap[hr]>=0) {
9922             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9923               // dealloc old register
9924               int n;
9925               for(n=0;n<HOST_REGS;n++)
9926               {
9927                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9928               }
9929               // and alloc new one
9930               f_regmap[hr]=branch_regs[i].regmap[hr];
9931             }
9932           }
9933           if(ooo[i]) {
9934             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) 
9935               f_regmap[hr]=branch_regs[i].regmap[hr];
9936           }else{
9937             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) 
9938               f_regmap[hr]=branch_regs[i].regmap[hr];
9939           }
9940           // Avoid dirty->clean transition
9941           #ifdef DESTRUCTIVE_WRITEBACK
9942           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9943           #endif
9944           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9945           // case above, however it's always a good idea.  We can't hoist the
9946           // load if the register was already allocated, so there's no point
9947           // wasting time analyzing most of these cases.  It only "succeeds"
9948           // when the mapping was different and the load can be replaced with
9949           // a mov, which is of negligible benefit.  So such cases are
9950           // skipped below.
9951           if(f_regmap[hr]>0) {
9952             if(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0) {
9953               int r=f_regmap[hr];
9954               for(j=t;j<=i;j++)
9955               {
9956                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9957                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9958                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9959                 if(r>63) {
9960                   // NB This can exclude the case where the upper-half
9961                   // register is lower numbered than the lower-half
9962                   // register.  Not sure if it's worth fixing...
9963                   if(get_reg(regs[j].regmap,r&63)<0) break;
9964                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9965                   if(regs[j].is32&(1LL<<(r&63))) break;
9966                 }
9967                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9968                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9969                   int k;
9970                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9971                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9972                     if(r>63) {
9973                       if(get_reg(regs[i].regmap,r&63)<0) break;
9974                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9975                     }
9976                     k=i;
9977                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9978                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9979                         //printf("no free regs for store %x\n",start+(k-1)*4);
9980                         break;
9981                       }
9982                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9983                         //printf("no-match due to different register\n");
9984                         break;
9985                       }
9986                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9987                         //printf("no-match due to branch\n");
9988                         break;
9989                       }
9990                       // call/ret fast path assumes no registers allocated
9991                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)) {
9992                         break;
9993                       }
9994                       if(r>63) {
9995                         // NB This can exclude the case where the upper-half
9996                         // register is lower numbered than the lower-half
9997                         // register.  Not sure if it's worth fixing...
9998                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9999                         if(regs[k-1].is32&(1LL<<(r&63))) break;
10000                       }
10001                       k--;
10002                     }
10003                     if(i<slen-1) {
10004                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
10005                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
10006                         //printf("bad match after branch\n");
10007                         break;
10008                       }
10009                     }
10010                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
10011                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
10012                       while(k<i) {
10013                         regs[k].regmap_entry[hr]=f_regmap[hr];
10014                         regs[k].regmap[hr]=f_regmap[hr];
10015                         regmap_pre[k+1][hr]=f_regmap[hr];
10016                         regs[k].wasdirty&=~(1<<hr);
10017                         regs[k].dirty&=~(1<<hr);
10018                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
10019                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
10020                         regs[k].wasconst&=~(1<<hr);
10021                         regs[k].isconst&=~(1<<hr);
10022                         k++;
10023                       }
10024                     }
10025                     else {
10026                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
10027                       break;
10028                     }
10029                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
10030                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
10031                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
10032                       regs[i].regmap_entry[hr]=f_regmap[hr];
10033                       regs[i].regmap[hr]=f_regmap[hr];
10034                       regs[i].wasdirty&=~(1<<hr);
10035                       regs[i].dirty&=~(1<<hr);
10036                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
10037                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
10038                       regs[i].wasconst&=~(1<<hr);
10039                       regs[i].isconst&=~(1<<hr);
10040                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
10041                       branch_regs[i].wasdirty&=~(1<<hr);
10042                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
10043                       branch_regs[i].regmap[hr]=f_regmap[hr];
10044                       branch_regs[i].dirty&=~(1<<hr);
10045                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
10046                       branch_regs[i].wasconst&=~(1<<hr);
10047                       branch_regs[i].isconst&=~(1<<hr);
10048                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
10049                         regmap_pre[i+2][hr]=f_regmap[hr];
10050                         regs[i+2].wasdirty&=~(1<<hr);
10051                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
10052                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
10053                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
10054                       }
10055                     }
10056                   }
10057                   for(k=t;k<j;k++) {
10058                     // Alloc register clean at beginning of loop,
10059                     // but may dirty it in pass 6
10060                     regs[k].regmap_entry[hr]=f_regmap[hr];
10061                     regs[k].regmap[hr]=f_regmap[hr];
10062                     regs[k].dirty&=~(1<<hr);
10063                     regs[k].wasconst&=~(1<<hr);
10064                     regs[k].isconst&=~(1<<hr);
10065                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
10066                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
10067                       branch_regs[k].regmap[hr]=f_regmap[hr];
10068                       branch_regs[k].dirty&=~(1<<hr);
10069                       branch_regs[k].wasconst&=~(1<<hr);
10070                       branch_regs[k].isconst&=~(1<<hr);
10071                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
10072                         regmap_pre[k+2][hr]=f_regmap[hr];
10073                         regs[k+2].wasdirty&=~(1<<hr);
10074                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
10075                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
10076                       }
10077                     }
10078                     else
10079                     {
10080                       regmap_pre[k+1][hr]=f_regmap[hr];
10081                       regs[k+1].wasdirty&=~(1<<hr);
10082                     }
10083                   }
10084                   if(regs[j].regmap[hr]==f_regmap[hr])
10085                     regs[j].regmap_entry[hr]=f_regmap[hr];
10086                   break;
10087                 }
10088                 if(j==i) break;
10089                 if(regs[j].regmap[hr]>=0)
10090                   break;
10091                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
10092                   //printf("no-match due to different register\n");
10093                   break;
10094                 }
10095                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
10096                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
10097                   break;
10098                 }
10099                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10100                 {
10101                   // Stop on unconditional branch
10102                   break;
10103                 }
10104                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
10105                 {
10106                   if(ooo[j]) {
10107                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) 
10108                       break;
10109                   }else{
10110                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) 
10111                       break;
10112                   }
10113                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
10114                     //printf("no-match due to different register (branch)\n");
10115                     break;
10116                   }
10117                 }
10118                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10119                   //printf("No free regs for store %x\n",start+j*4);
10120                   break;
10121                 }
10122                 if(f_regmap[hr]>=64) {
10123                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
10124                     break;
10125                   }
10126                   else
10127                   {
10128                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
10129                       break;
10130                     }
10131                   }
10132                 }
10133               }
10134             }
10135           }
10136         }
10137       }
10138     }else{
10139       int count=0;
10140       for(hr=0;hr<HOST_REGS;hr++)
10141       {
10142         if(hr!=EXCLUDE_REG) {
10143           if(regs[i].regmap[hr]>64) {
10144             if(!((regs[i].dirty>>hr)&1))
10145               f_regmap[hr]=regs[i].regmap[hr];
10146           }
10147           else if(regs[i].regmap[hr]>=0) {
10148             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10149               // dealloc old register
10150               int n;
10151               for(n=0;n<HOST_REGS;n++)
10152               {
10153                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10154               }
10155               // and alloc new one
10156               f_regmap[hr]=regs[i].regmap[hr];
10157             }
10158           }
10159           else if(regs[i].regmap[hr]<0) count++;
10160         }
10161       }
10162       // Try to restore cycle count at branch targets
10163       if(bt[i]) {
10164         for(j=i;j<slen-1;j++) {
10165           if(regs[j].regmap[HOST_CCREG]!=-1) break;
10166           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10167             //printf("no free regs for store %x\n",start+j*4);
10168             break;
10169           }
10170         }
10171         if(regs[j].regmap[HOST_CCREG]==CCREG) {
10172           int k=i;
10173           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
10174           while(k<j) {
10175             regs[k].regmap_entry[HOST_CCREG]=CCREG;
10176             regs[k].regmap[HOST_CCREG]=CCREG;
10177             regmap_pre[k+1][HOST_CCREG]=CCREG;
10178             regs[k+1].wasdirty|=1<<HOST_CCREG;
10179             regs[k].dirty|=1<<HOST_CCREG;
10180             regs[k].wasconst&=~(1<<HOST_CCREG);
10181             regs[k].isconst&=~(1<<HOST_CCREG);
10182             k++;
10183           }
10184           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
10185         }
10186         // Work backwards from the branch target
10187         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
10188         {
10189           //printf("Extend backwards\n");
10190           int k;
10191           k=i;
10192           while(regs[k-1].regmap[HOST_CCREG]==-1) {
10193             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10194               //printf("no free regs for store %x\n",start+(k-1)*4);
10195               break;
10196             }
10197             k--;
10198           }
10199           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
10200             //printf("Extend CC, %x ->\n",start+k*4);
10201             while(k<=i) {
10202               regs[k].regmap_entry[HOST_CCREG]=CCREG;
10203               regs[k].regmap[HOST_CCREG]=CCREG;
10204               regmap_pre[k+1][HOST_CCREG]=CCREG;
10205               regs[k+1].wasdirty|=1<<HOST_CCREG;
10206               regs[k].dirty|=1<<HOST_CCREG;
10207               regs[k].wasconst&=~(1<<HOST_CCREG);
10208               regs[k].isconst&=~(1<<HOST_CCREG);
10209               k++;
10210             }
10211           }
10212           else {
10213             //printf("Fail Extend CC, %x ->\n",start+k*4);
10214           }
10215         }
10216       }
10217       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
10218          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
10219          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
10220          itype[i]!=FCONV&&itype[i]!=FCOMP)
10221       {
10222         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
10223       }
10224     }
10225   }
10226   
10227   // This allocates registers (if possible) one instruction prior
10228   // to use, which can avoid a load-use penalty on certain CPUs.
10229   for(i=0;i<slen-1;i++)
10230   {
10231     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
10232     {
10233       if(!bt[i+1])
10234       {
10235         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
10236            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
10237         {
10238           if(rs1[i+1]) {
10239             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
10240             {
10241               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10242               {
10243                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10244                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10245                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10246                 regs[i].isconst&=~(1<<hr);
10247                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10248                 constmap[i][hr]=constmap[i+1][hr];
10249                 regs[i+1].wasdirty&=~(1<<hr);
10250                 regs[i].dirty&=~(1<<hr);
10251               }
10252             }
10253           }
10254           if(rs2[i+1]) {
10255             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
10256             {
10257               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10258               {
10259                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10260                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10261                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10262                 regs[i].isconst&=~(1<<hr);
10263                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10264                 constmap[i][hr]=constmap[i+1][hr];
10265                 regs[i+1].wasdirty&=~(1<<hr);
10266                 regs[i].dirty&=~(1<<hr);
10267               }
10268             }
10269           }
10270           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10271             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10272             {
10273               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10274               {
10275                 regs[i].regmap[hr]=rs1[i+1];
10276                 regmap_pre[i+1][hr]=rs1[i+1];
10277                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10278                 regs[i].isconst&=~(1<<hr);
10279                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10280                 constmap[i][hr]=constmap[i+1][hr];
10281                 regs[i+1].wasdirty&=~(1<<hr);
10282                 regs[i].dirty&=~(1<<hr);
10283               }
10284             }
10285           }
10286           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10287             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10288             {
10289               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10290               {
10291                 regs[i].regmap[hr]=rs1[i+1];
10292                 regmap_pre[i+1][hr]=rs1[i+1];
10293                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10294                 regs[i].isconst&=~(1<<hr);
10295                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10296                 constmap[i][hr]=constmap[i+1][hr];
10297                 regs[i+1].wasdirty&=~(1<<hr);
10298                 regs[i].dirty&=~(1<<hr);
10299               }
10300             }
10301           }
10302           #ifndef HOST_IMM_ADDR32
10303           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
10304             hr=get_reg(regs[i+1].regmap,TLREG);
10305             if(hr>=0) {
10306               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
10307               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
10308                 int nr;
10309                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10310                 {
10311                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
10312                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
10313                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
10314                   regs[i].isconst&=~(1<<hr);
10315                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10316                   constmap[i][hr]=constmap[i+1][hr];
10317                   regs[i+1].wasdirty&=~(1<<hr);
10318                   regs[i].dirty&=~(1<<hr);
10319                 }
10320                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10321                 {
10322                   // move it to another register
10323                   regs[i+1].regmap[hr]=-1;
10324                   regmap_pre[i+2][hr]=-1;
10325                   regs[i+1].regmap[nr]=TLREG;
10326                   regmap_pre[i+2][nr]=TLREG;
10327                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
10328                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
10329                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
10330                   regs[i].isconst&=~(1<<nr);
10331                   regs[i+1].isconst&=~(1<<nr);
10332                   regs[i].dirty&=~(1<<nr);
10333                   regs[i+1].wasdirty&=~(1<<nr);
10334                   regs[i+1].dirty&=~(1<<nr);
10335                   regs[i+2].wasdirty&=~(1<<nr);
10336                 }
10337               }
10338             }
10339           }
10340           #endif
10341           if(itype[i+1]==STORE||itype[i+1]==STORELR
10342              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
10343             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10344               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
10345               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10346               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
10347               assert(hr>=0);
10348               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10349               {
10350                 regs[i].regmap[hr]=rs1[i+1];
10351                 regmap_pre[i+1][hr]=rs1[i+1];
10352                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10353                 regs[i].isconst&=~(1<<hr);
10354                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10355                 constmap[i][hr]=constmap[i+1][hr];
10356                 regs[i+1].wasdirty&=~(1<<hr);
10357                 regs[i].dirty&=~(1<<hr);
10358               }
10359             }
10360           }
10361           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
10362             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10363               int nr;
10364               hr=get_reg(regs[i+1].regmap,FTEMP);
10365               assert(hr>=0);
10366               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10367               {
10368                 regs[i].regmap[hr]=rs1[i+1];
10369                 regmap_pre[i+1][hr]=rs1[i+1];
10370                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10371                 regs[i].isconst&=~(1<<hr);
10372                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10373                 constmap[i][hr]=constmap[i+1][hr];
10374                 regs[i+1].wasdirty&=~(1<<hr);
10375                 regs[i].dirty&=~(1<<hr);
10376               }
10377               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10378               {
10379                 // move it to another register
10380                 regs[i+1].regmap[hr]=-1;
10381                 regmap_pre[i+2][hr]=-1;
10382                 regs[i+1].regmap[nr]=FTEMP;
10383                 regmap_pre[i+2][nr]=FTEMP;
10384                 regs[i].regmap[nr]=rs1[i+1];
10385                 regmap_pre[i+1][nr]=rs1[i+1];
10386                 regs[i+1].regmap_entry[nr]=rs1[i+1];
10387                 regs[i].isconst&=~(1<<nr);
10388                 regs[i+1].isconst&=~(1<<nr);
10389                 regs[i].dirty&=~(1<<nr);
10390                 regs[i+1].wasdirty&=~(1<<nr);
10391                 regs[i+1].dirty&=~(1<<nr);
10392                 regs[i+2].wasdirty&=~(1<<nr);
10393               }
10394             }
10395           }
10396           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||||itype[i+1]==C2LS*/) {
10397             if(itype[i+1]==LOAD) 
10398               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
10399             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
10400               hr=get_reg(regs[i+1].regmap,FTEMP);
10401             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
10402               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
10403               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10404             }
10405             if(hr>=0&&regs[i].regmap[hr]<0) {
10406               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
10407               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
10408                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
10409                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
10410                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
10411                 regs[i].isconst&=~(1<<hr);
10412                 regs[i+1].wasdirty&=~(1<<hr);
10413                 regs[i].dirty&=~(1<<hr);
10414               }
10415             }
10416           }
10417         }
10418       }
10419     }
10420   }
10421   
10422   /* Pass 6 - Optimize clean/dirty state */
10423   clean_registers(0,slen-1,1);
10424   
10425   /* Pass 7 - Identify 32-bit registers */
10426 #ifndef FORCE32
10427   provisional_r32();
10428
10429   u_int r32=0;
10430   
10431   for (i=slen-1;i>=0;i--)
10432   {
10433     int hr;
10434     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10435     {
10436       if(ba[i]<start || ba[i]>=(start+slen*4))
10437       {
10438         // Branch out of this block, don't need anything
10439         r32=0;
10440       }
10441       else
10442       {
10443         // Internal branch
10444         // Need whatever matches the target
10445         // (and doesn't get overwritten by the delay slot instruction)
10446         r32=0;
10447         int t=(ba[i]-start)>>2;
10448         if(ba[i]>start+i*4) {
10449           // Forward branch
10450           if(!(requires_32bit[t]&~regs[i].was32))
10451             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10452         }else{
10453           // Backward branch
10454           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
10455           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10456           if(!(pr32[t]&~regs[i].was32))
10457             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10458         }
10459       }
10460       // Conditional branch may need registers for following instructions
10461       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10462       {
10463         if(i<slen-2) {
10464           r32|=requires_32bit[i+2];
10465           r32&=regs[i].was32;
10466           // Mark this address as a branch target since it may be called
10467           // upon return from interrupt
10468           bt[i+2]=1;
10469         }
10470       }
10471       // Merge in delay slot
10472       if(!likely[i]) {
10473         // These are overwritten unless the branch is "likely"
10474         // and the delay slot is nullified if not taken
10475         r32&=~(1LL<<rt1[i+1]);
10476         r32&=~(1LL<<rt2[i+1]);
10477       }
10478       // Assume these are needed (delay slot)
10479       if(us1[i+1]>0)
10480       {
10481         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
10482       }
10483       if(us2[i+1]>0)
10484       {
10485         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
10486       }
10487       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
10488       {
10489         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
10490       }
10491       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
10492       {
10493         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
10494       }
10495     }
10496     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
10497     {
10498       // SYSCALL instruction (software interrupt)
10499       r32=0;
10500     }
10501     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
10502     {
10503       // ERET instruction (return from interrupt)
10504       r32=0;
10505     }
10506     // Check 32 bits
10507     r32&=~(1LL<<rt1[i]);
10508     r32&=~(1LL<<rt2[i]);
10509     if(us1[i]>0)
10510     {
10511       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
10512     }
10513     if(us2[i]>0)
10514     {
10515       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
10516     }
10517     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
10518     {
10519       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
10520     }
10521     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
10522     {
10523       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
10524     }
10525     requires_32bit[i]=r32;
10526     
10527     // Dirty registers which are 32-bit, require 32-bit input
10528     // as they will be written as 32-bit values
10529     for(hr=0;hr<HOST_REGS;hr++)
10530     {
10531       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
10532         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
10533           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
10534           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
10535         }
10536       }
10537     }
10538     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
10539   }
10540 #endif
10541
10542   if(itype[slen-1]==SPAN) {
10543     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
10544   }
10545   
10546   /* Debug/disassembly */
10547   if((void*)assem_debug==(void*)printf) 
10548   for(i=0;i<slen;i++)
10549   {
10550     printf("U:");
10551     int r;
10552     for(r=1;r<=CCREG;r++) {
10553       if((unneeded_reg[i]>>r)&1) {
10554         if(r==HIREG) printf(" HI");
10555         else if(r==LOREG) printf(" LO");
10556         else printf(" r%d",r);
10557       }
10558     }
10559 #ifndef FORCE32
10560     printf(" UU:");
10561     for(r=1;r<=CCREG;r++) {
10562       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
10563         if(r==HIREG) printf(" HI");
10564         else if(r==LOREG) printf(" LO");
10565         else printf(" r%d",r);
10566       }
10567     }
10568     printf(" 32:");
10569     for(r=0;r<=CCREG;r++) {
10570       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10571       if((regs[i].was32>>r)&1) {
10572         if(r==CCREG) printf(" CC");
10573         else if(r==HIREG) printf(" HI");
10574         else if(r==LOREG) printf(" LO");
10575         else printf(" r%d",r);
10576       }
10577     }
10578 #endif
10579     printf("\n");
10580     #if defined(__i386__) || defined(__x86_64__)
10581     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
10582     #endif
10583     #ifdef __arm__
10584     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
10585     #endif
10586     printf("needs: ");
10587     if(needed_reg[i]&1) printf("eax ");
10588     if((needed_reg[i]>>1)&1) printf("ecx ");
10589     if((needed_reg[i]>>2)&1) printf("edx ");
10590     if((needed_reg[i]>>3)&1) printf("ebx ");
10591     if((needed_reg[i]>>5)&1) printf("ebp ");
10592     if((needed_reg[i]>>6)&1) printf("esi ");
10593     if((needed_reg[i]>>7)&1) printf("edi ");
10594     printf("r:");
10595     for(r=0;r<=CCREG;r++) {
10596       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10597       if((requires_32bit[i]>>r)&1) {
10598         if(r==CCREG) printf(" CC");
10599         else if(r==HIREG) printf(" HI");
10600         else if(r==LOREG) printf(" LO");
10601         else printf(" r%d",r);
10602       }
10603     }
10604     printf("\n");
10605     /*printf("pr:");
10606     for(r=0;r<=CCREG;r++) {
10607       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10608       if((pr32[i]>>r)&1) {
10609         if(r==CCREG) printf(" CC");
10610         else if(r==HIREG) printf(" HI");
10611         else if(r==LOREG) printf(" LO");
10612         else printf(" r%d",r);
10613       }
10614     }
10615     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
10616     printf("\n");*/
10617     #if defined(__i386__) || defined(__x86_64__)
10618     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
10619     printf("dirty: ");
10620     if(regs[i].wasdirty&1) printf("eax ");
10621     if((regs[i].wasdirty>>1)&1) printf("ecx ");
10622     if((regs[i].wasdirty>>2)&1) printf("edx ");
10623     if((regs[i].wasdirty>>3)&1) printf("ebx ");
10624     if((regs[i].wasdirty>>5)&1) printf("ebp ");
10625     if((regs[i].wasdirty>>6)&1) printf("esi ");
10626     if((regs[i].wasdirty>>7)&1) printf("edi ");
10627     #endif
10628     #ifdef __arm__
10629     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
10630     printf("dirty: ");
10631     if(regs[i].wasdirty&1) printf("r0 ");
10632     if((regs[i].wasdirty>>1)&1) printf("r1 ");
10633     if((regs[i].wasdirty>>2)&1) printf("r2 ");
10634     if((regs[i].wasdirty>>3)&1) printf("r3 ");
10635     if((regs[i].wasdirty>>4)&1) printf("r4 ");
10636     if((regs[i].wasdirty>>5)&1) printf("r5 ");
10637     if((regs[i].wasdirty>>6)&1) printf("r6 ");
10638     if((regs[i].wasdirty>>7)&1) printf("r7 ");
10639     if((regs[i].wasdirty>>8)&1) printf("r8 ");
10640     if((regs[i].wasdirty>>9)&1) printf("r9 ");
10641     if((regs[i].wasdirty>>10)&1) printf("r10 ");
10642     if((regs[i].wasdirty>>12)&1) printf("r12 ");
10643     #endif
10644     printf("\n");
10645     disassemble_inst(i);
10646     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
10647     #if defined(__i386__) || defined(__x86_64__)
10648     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
10649     if(regs[i].dirty&1) printf("eax ");
10650     if((regs[i].dirty>>1)&1) printf("ecx ");
10651     if((regs[i].dirty>>2)&1) printf("edx ");
10652     if((regs[i].dirty>>3)&1) printf("ebx ");
10653     if((regs[i].dirty>>5)&1) printf("ebp ");
10654     if((regs[i].dirty>>6)&1) printf("esi ");
10655     if((regs[i].dirty>>7)&1) printf("edi ");
10656     #endif
10657     #ifdef __arm__
10658     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
10659     if(regs[i].dirty&1) printf("r0 ");
10660     if((regs[i].dirty>>1)&1) printf("r1 ");
10661     if((regs[i].dirty>>2)&1) printf("r2 ");
10662     if((regs[i].dirty>>3)&1) printf("r3 ");
10663     if((regs[i].dirty>>4)&1) printf("r4 ");
10664     if((regs[i].dirty>>5)&1) printf("r5 ");
10665     if((regs[i].dirty>>6)&1) printf("r6 ");
10666     if((regs[i].dirty>>7)&1) printf("r7 ");
10667     if((regs[i].dirty>>8)&1) printf("r8 ");
10668     if((regs[i].dirty>>9)&1) printf("r9 ");
10669     if((regs[i].dirty>>10)&1) printf("r10 ");
10670     if((regs[i].dirty>>12)&1) printf("r12 ");
10671     #endif
10672     printf("\n");
10673     if(regs[i].isconst) {
10674       printf("constants: ");
10675       #if defined(__i386__) || defined(__x86_64__)
10676       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
10677       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
10678       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
10679       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
10680       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
10681       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
10682       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
10683       #endif
10684       #ifdef __arm__
10685       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
10686       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
10687       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
10688       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
10689       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
10690       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
10691       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
10692       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
10693       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
10694       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
10695       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
10696       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
10697       #endif
10698       printf("\n");
10699     }
10700 #ifndef FORCE32
10701     printf(" 32:");
10702     for(r=0;r<=CCREG;r++) {
10703       if((regs[i].is32>>r)&1) {
10704         if(r==CCREG) printf(" CC");
10705         else if(r==HIREG) printf(" HI");
10706         else if(r==LOREG) printf(" LO");
10707         else printf(" r%d",r);
10708       }
10709     }
10710     printf("\n");
10711 #endif
10712     /*printf(" p32:");
10713     for(r=0;r<=CCREG;r++) {
10714       if((p32[i]>>r)&1) {
10715         if(r==CCREG) printf(" CC");
10716         else if(r==HIREG) printf(" HI");
10717         else if(r==LOREG) printf(" LO");
10718         else printf(" r%d",r);
10719       }
10720     }
10721     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
10722     else printf("\n");*/
10723     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10724       #if defined(__i386__) || defined(__x86_64__)
10725       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10726       if(branch_regs[i].dirty&1) printf("eax ");
10727       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10728       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10729       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10730       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10731       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10732       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10733       #endif
10734       #ifdef __arm__
10735       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10736       if(branch_regs[i].dirty&1) printf("r0 ");
10737       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10738       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10739       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10740       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10741       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10742       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10743       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10744       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10745       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10746       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10747       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10748       #endif
10749 #ifndef FORCE32
10750       printf(" 32:");
10751       for(r=0;r<=CCREG;r++) {
10752         if((branch_regs[i].is32>>r)&1) {
10753           if(r==CCREG) printf(" CC");
10754           else if(r==HIREG) printf(" HI");
10755           else if(r==LOREG) printf(" LO");
10756           else printf(" r%d",r);
10757         }
10758       }
10759       printf("\n");
10760 #endif
10761     }
10762   }
10763
10764   /* Pass 8 - Assembly */
10765   linkcount=0;stubcount=0;
10766   ds=0;is_delayslot=0;
10767   cop1_usable=0;
10768   uint64_t is32_pre=0;
10769   u_int dirty_pre=0;
10770   u_int beginning=(u_int)out;
10771   if((u_int)addr&1) {
10772     ds=1;
10773     pagespan_ds();
10774   }
10775   u_int instr_addr0_override=0;
10776
10777 #ifdef PCSX
10778   if (start == 0x80030000) {
10779     // nasty hack for fastbios thing
10780     instr_addr0_override=(u_int)out;
10781     emit_movimm(start,0);
10782     emit_readword((int)&pcaddr,1);
10783     emit_writeword(0,(int)&pcaddr);
10784     emit_cmp(0,1);
10785     emit_jne((int)new_dyna_leave);
10786   }
10787 #endif
10788   for(i=0;i<slen;i++)
10789   {
10790     //if(ds) printf("ds: ");
10791     if((void*)assem_debug==(void*)printf) disassemble_inst(i);
10792     if(ds) {
10793       ds=0; // Skip delay slot
10794       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10795       instr_addr[i]=0;
10796     } else {
10797       #ifndef DESTRUCTIVE_WRITEBACK
10798       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10799       {
10800         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
10801               unneeded_reg[i],unneeded_reg_upper[i]);
10802         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10803               unneeded_reg[i],unneeded_reg_upper[i]);
10804       }
10805       is32_pre=regs[i].is32;
10806       dirty_pre=regs[i].dirty;
10807       #endif
10808       // write back
10809       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10810       {
10811         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10812                       unneeded_reg[i],unneeded_reg_upper[i]);
10813         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10814       }
10815       // branch target entry point
10816       instr_addr[i]=(u_int)out;
10817       assem_debug("<->\n");
10818       // load regs
10819       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10820         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10821       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10822       address_generation(i,&regs[i],regs[i].regmap_entry);
10823       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10824       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10825       {
10826         // Load the delay slot registers if necessary
10827         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
10828           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10829         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
10830           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10831         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
10832           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10833       }
10834       else if(i+1<slen)
10835       {
10836         // Preload registers for following instruction
10837         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10838           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10839             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10840         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10841           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10842             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10843       }
10844       // TODO: if(is_ooo(i)) address_generation(i+1);
10845       if(itype[i]==CJUMP||itype[i]==FJUMP)
10846         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10847       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
10848         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10849       if(bt[i]) cop1_usable=0;
10850       // assemble
10851       switch(itype[i]) {
10852         case ALU:
10853           alu_assemble(i,&regs[i]);break;
10854         case IMM16:
10855           imm16_assemble(i,&regs[i]);break;
10856         case SHIFT:
10857           shift_assemble(i,&regs[i]);break;
10858         case SHIFTIMM:
10859           shiftimm_assemble(i,&regs[i]);break;
10860         case LOAD:
10861           load_assemble(i,&regs[i]);break;
10862         case LOADLR:
10863           loadlr_assemble(i,&regs[i]);break;
10864         case STORE:
10865           store_assemble(i,&regs[i]);break;
10866         case STORELR:
10867           storelr_assemble(i,&regs[i]);break;
10868         case COP0:
10869           cop0_assemble(i,&regs[i]);break;
10870         case COP1:
10871           cop1_assemble(i,&regs[i]);break;
10872         case C1LS:
10873           c1ls_assemble(i,&regs[i]);break;
10874         case COP2:
10875           cop2_assemble(i,&regs[i]);break;
10876         case C2LS:
10877           c2ls_assemble(i,&regs[i]);break;
10878         case C2OP:
10879           c2op_assemble(i,&regs[i]);break;
10880         case FCONV:
10881           fconv_assemble(i,&regs[i]);break;
10882         case FLOAT:
10883           float_assemble(i,&regs[i]);break;
10884         case FCOMP:
10885           fcomp_assemble(i,&regs[i]);break;
10886         case MULTDIV:
10887           multdiv_assemble(i,&regs[i]);break;
10888         case MOV:
10889           mov_assemble(i,&regs[i]);break;
10890         case SYSCALL:
10891           syscall_assemble(i,&regs[i]);break;
10892         case HLECALL:
10893           hlecall_assemble(i,&regs[i]);break;
10894         case INTCALL:
10895           intcall_assemble(i,&regs[i]);break;
10896         case UJUMP:
10897           ujump_assemble(i,&regs[i]);ds=1;break;
10898         case RJUMP:
10899           rjump_assemble(i,&regs[i]);ds=1;break;
10900         case CJUMP:
10901           cjump_assemble(i,&regs[i]);ds=1;break;
10902         case SJUMP:
10903           sjump_assemble(i,&regs[i]);ds=1;break;
10904         case FJUMP:
10905           fjump_assemble(i,&regs[i]);ds=1;break;
10906         case SPAN:
10907           pagespan_assemble(i,&regs[i]);break;
10908       }
10909       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10910         literal_pool(1024);
10911       else
10912         literal_pool_jumpover(256);
10913     }
10914   }
10915   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10916   // If the block did not end with an unconditional branch,
10917   // add a jump to the next instruction.
10918   if(i>1) {
10919     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10920       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10921       assert(i==slen);
10922       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10923         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10924         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10925           emit_loadreg(CCREG,HOST_CCREG);
10926         emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10927       }
10928       else if(!likely[i-2])
10929       {
10930         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10931         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10932       }
10933       else
10934       {
10935         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10936         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10937       }
10938       add_to_linker((int)out,start+i*4,0);
10939       emit_jmp(0);
10940     }
10941   }
10942   else
10943   {
10944     assert(i>0);
10945     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10946     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10947     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10948       emit_loadreg(CCREG,HOST_CCREG);
10949     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10950     add_to_linker((int)out,start+i*4,0);
10951     emit_jmp(0);
10952   }
10953
10954   // TODO: delay slot stubs?
10955   // Stubs
10956   for(i=0;i<stubcount;i++)
10957   {
10958     switch(stubs[i][0])
10959     {
10960       case LOADB_STUB:
10961       case LOADH_STUB:
10962       case LOADW_STUB:
10963       case LOADD_STUB:
10964       case LOADBU_STUB:
10965       case LOADHU_STUB:
10966         do_readstub(i);break;
10967       case STOREB_STUB:
10968       case STOREH_STUB:
10969       case STOREW_STUB:
10970       case STORED_STUB:
10971         do_writestub(i);break;
10972       case CC_STUB:
10973         do_ccstub(i);break;
10974       case INVCODE_STUB:
10975         do_invstub(i);break;
10976       case FP_STUB:
10977         do_cop1stub(i);break;
10978       case STORELR_STUB:
10979         do_unalignedwritestub(i);break;
10980     }
10981   }
10982
10983   if (instr_addr0_override)
10984     instr_addr[0] = instr_addr0_override;
10985
10986   /* Pass 9 - Linker */
10987   for(i=0;i<linkcount;i++)
10988   {
10989     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10990     literal_pool(64);
10991     if(!link_addr[i][2])
10992     {
10993       void *stub=out;
10994       void *addr=check_addr(link_addr[i][1]);
10995       emit_extjump(link_addr[i][0],link_addr[i][1]);
10996       if(addr) {
10997         set_jump_target(link_addr[i][0],(int)addr);
10998         add_link(link_addr[i][1],stub);
10999       }
11000       else set_jump_target(link_addr[i][0],(int)stub);
11001     }
11002     else
11003     {
11004       // Internal branch
11005       int target=(link_addr[i][1]-start)>>2;
11006       assert(target>=0&&target<slen);
11007       assert(instr_addr[target]);
11008       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11009       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
11010       //#else
11011       set_jump_target(link_addr[i][0],instr_addr[target]);
11012       //#endif
11013     }
11014   }
11015   // External Branch Targets (jump_in)
11016   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
11017   for(i=0;i<slen;i++)
11018   {
11019     if(bt[i]||i==0)
11020     {
11021       if(instr_addr[i]) // TODO - delay slots (=null)
11022       {
11023         u_int vaddr=start+i*4;
11024         u_int page=get_page(vaddr);
11025         u_int vpage=get_vpage(vaddr);
11026         literal_pool(256);
11027         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
11028 #ifndef FORCE32
11029         if(!requires_32bit[i])
11030 #else
11031         if(1)
11032 #endif
11033         {
11034           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11035           assem_debug("jump_in: %x\n",start+i*4);
11036           ll_add(jump_dirty+vpage,vaddr,(void *)out);
11037           int entry_point=do_dirty_stub(i);
11038           ll_add(jump_in+page,vaddr,(void *)entry_point);
11039           // If there was an existing entry in the hash table,
11040           // replace it with the new address.
11041           // Don't add new entries.  We'll insert the
11042           // ones that actually get used in check_addr().
11043           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
11044           if(ht_bin[0]==vaddr) {
11045             ht_bin[1]=entry_point;
11046           }
11047           if(ht_bin[2]==vaddr) {
11048             ht_bin[3]=entry_point;
11049           }
11050         }
11051         else
11052         {
11053           u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
11054           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11055           assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
11056           //int entry_point=(int)out;
11057           ////assem_debug("entry_point: %x\n",entry_point);
11058           //load_regs_entry(i);
11059           //if(entry_point==(int)out)
11060           //  entry_point=instr_addr[i];
11061           //else
11062           //  emit_jmp(instr_addr[i]);
11063           //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11064           ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
11065           int entry_point=do_dirty_stub(i);
11066           ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11067         }
11068       }
11069     }
11070   }
11071   // Write out the literal pool if necessary
11072   literal_pool(0);
11073   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11074   // Align code
11075   if(((u_int)out)&7) emit_addnop(13);
11076   #endif
11077   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
11078   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
11079   memcpy(copy,source,slen*4);
11080   copy+=slen*4;
11081   
11082   #ifdef __arm__
11083   __clear_cache((void *)beginning,out);
11084   #endif
11085   
11086   // If we're within 256K of the end of the buffer,
11087   // start over from the beginning. (Is 256K enough?)
11088   if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
11089   
11090   // Trap writes to any of the pages we compiled
11091   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
11092     invalid_code[i]=0;
11093 #ifndef DISABLE_TLB
11094     memory_map[i]|=0x40000000;
11095     if((signed int)start>=(signed int)0xC0000000) {
11096       assert(using_tlb);
11097       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
11098       invalid_code[j]=0;
11099       memory_map[j]|=0x40000000;
11100       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
11101     }
11102 #endif
11103   }
11104 #ifdef PCSX
11105   // PCSX maps all RAM mirror invalid_code tests to 0x80000000..0x80000000+RAM_SIZE
11106   if(get_page(start)<(RAM_SIZE>>12))
11107     for(i=start>>12;i<=(start+slen*4)>>12;i++)
11108       invalid_code[((u_int)0x80000000>>12)|i]=0;
11109 #endif
11110   
11111   /* Pass 10 - Free memory by expiring oldest blocks */
11112   
11113   int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
11114   while(expirep!=end)
11115   {
11116     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
11117     int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
11118     inv_debug("EXP: Phase %d\n",expirep);
11119     switch((expirep>>11)&3)
11120     {
11121       case 0:
11122         // Clear jump_in and jump_dirty
11123         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
11124         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
11125         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
11126         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
11127         break;
11128       case 1:
11129         // Clear pointers
11130         ll_kill_pointers(jump_out[expirep&2047],base,shift);
11131         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
11132         break;
11133       case 2:
11134         // Clear hash table
11135         for(i=0;i<32;i++) {
11136           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
11137           if((ht_bin[3]>>shift)==(base>>shift) ||
11138              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11139             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
11140             ht_bin[2]=ht_bin[3]=-1;
11141           }
11142           if((ht_bin[1]>>shift)==(base>>shift) ||
11143              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11144             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
11145             ht_bin[0]=ht_bin[2];
11146             ht_bin[1]=ht_bin[3];
11147             ht_bin[2]=ht_bin[3]=-1;
11148           }
11149         }
11150         break;
11151       case 3:
11152         // Clear jump_out
11153         #ifdef __arm__
11154         if((expirep&2047)==0) 
11155           do_clear_cache();
11156         #endif
11157         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
11158         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
11159         break;
11160     }
11161     expirep=(expirep+1)&65535;
11162   }
11163   return 0;
11164 }
11165
11166 // vim:shiftwidth=2:expandtab