drc: hack for stack-in-RAM-mirror case
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2010 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24
25 #include "emu_if.h" //emulator interface
26
27 #include <sys/mman.h>
28
29 #ifdef __i386__
30 #include "assem_x86.h"
31 #endif
32 #ifdef __x86_64__
33 #include "assem_x64.h"
34 #endif
35 #ifdef __arm__
36 #include "assem_arm.h"
37 #endif
38
// Upper bound on the number of instructions in one block; it sizes the
// per-instruction tables declared below.
#define MAXBLOCK 4096
// Safety margin (in bytes of host code) used by the cache-expiry tests in
// the address lookup functions.
#define MAX_OUTPUT_BLOCK_SIZE 262144
// NOTE(review): not referenced in this part of the file; presumably the
// scale applied to the emulated cycle counter - confirm against the users.
#define CLOCK_DIVIDER 2
42
// Register allocation state at one point of a block.
// Host registers are numbered 0..HOST_REGS-1; the per-host-register
// bitmasks below use bit number hr for host register hr.
// NOTE(review): regmap_entry, was32, wasdirty, u, uu and wasconst are not
// touched in this part of the file, so they are left undocumented here.
struct regstat
{
  signed char regmap_entry[HOST_REGS];
  signed char regmap[HOST_REGS];   // guest register held by each host register: negative = free, value>=64 = upper half of register (value&63)
  uint64_t was32;
  uint64_t is32;                   // indexed by guest register number (see flush_dirty_uppers)
  uint64_t wasdirty;
  uint64_t dirty;                  // bit hr set by dirty_reg() for host register hr
  uint64_t u;
  uint64_t uu;
  u_int wasconst;
  u_int isconst;                   // bit hr set when constmap[hr] holds a known constant
  uint64_t constmap[HOST_REGS];    // constant value per host register (upper-half mappings store value>>32)
};
57
// Node of a singly linked list that associates a guest virtual address
// with a host address (see ll_add / get_addr).
struct ll_entry
{
  u_int vaddr;            // guest virtual address used as the lookup key
  u_int reg32;            // flags stored by ll_add_32(); lookups skip entries whose flags intersect the requested ones (0 matches everything)
  void *addr;             // host address returned on a match
  struct ll_entry *next;  // next node, NULL terminates the list
};
65
  // ---- State of the block being analysed; the [MAXBLOCK] tables are
  // ---- indexed by instruction number within that block.
  u_int start;        // guest address of instruction 0 (branch targets are tested against start..start+slen*4)
  u_int *source;      // raw instruction words
  u_int pagelimit;
  char insn[MAXBLOCK][10];
  u_char itype[MAXBLOCK];     // one of the "instruction types" constants
  u_char opcode[MAXBLOCK];
  u_char opcode2[MAXBLOCK];
  u_char bt[MAXBLOCK];
  u_char rs1[MAXBLOCK];       // guest registers read by the instruction (0 = none)
  u_char rs2[MAXBLOCK];
  u_char rt1[MAXBLOCK];       // guest registers written by the instruction (0 = none)
  u_char rt2[MAXBLOCK];
  u_char us1[MAXBLOCK];
  u_char us2[MAXBLOCK];
  u_char dep1[MAXBLOCK];
  u_char dep2[MAXBLOCK];
  u_char lt1[MAXBLOCK];
  int imm[MAXBLOCK];
  u_int ba[MAXBLOCK];         // branch target address
  char likely[MAXBLOCK];
  char is_ds[MAXBLOCK];
  char ooo[MAXBLOCK];
  uint64_t unneeded_reg[MAXBLOCK];        // bit r set: guest register r is not needed at this instruction
  uint64_t unneeded_reg_upper[MAXBLOCK];  // same, for the upper halves
  uint64_t branch_unneeded_reg[MAXBLOCK];
  uint64_t branch_unneeded_reg_upper[MAXBLOCK];
  uint64_t p32[MAXBLOCK];
  uint64_t pr32[MAXBLOCK];
  signed char regmap_pre[MAXBLOCK][HOST_REGS];
  signed char regmap[MAXBLOCK][HOST_REGS];
  signed char regmap_entry[MAXBLOCK][HOST_REGS];
  uint64_t constmap[MAXBLOCK][HOST_REGS];
  struct regstat regs[MAXBLOCK];
  struct regstat branch_regs[MAXBLOCK];
  signed char minimum_free_regs[MAXBLOCK];
  u_int needed_reg[MAXBLOCK];
  uint64_t requires_32bit[MAXBLOCK];
  u_int wont_dirty[MAXBLOCK];
  u_int will_dirty[MAXBLOCK];
  int ccadj[MAXBLOCK];
  int slen;                   // number of instructions in the block
  u_int instr_addr[MAXBLOCK];
  u_int link_addr[MAXBLOCK][3];
  int linkcount;
  u_int stubs[MAXBLOCK*3][8];
  int stubcount;
  u_int literals[1024][2];
  int literalcount;
  int is_delayslot;
  int cop1_usable;
  // ---- Translation cache bookkeeping
  u_char *out;                // reference position in the translation cache used by the expiry tests
  struct ll_entry *jump_in[4096];     // lists indexed by get_page(): entries searched first by get_addr()
  struct ll_entry *jump_out[4096];
  struct ll_entry *jump_dirty[4096];  // lists indexed by get_vpage(): entries that must pass verify_dirty() before reuse
  // Each bin holds two (vaddr, host address) pairs: [0]/[1] and [2]/[3];
  // a vaddr of -1 marks an empty slot.
  u_int hash_table[65536][4]  __attribute__((aligned(16)));
  char shadow[1048576]  __attribute__((aligned(16)));
  void *copy;
  int expirep;
#ifndef PCSX
  u_int using_tlb;
#else
  // PCSX builds have no TLB, so this is a compile-time constant
  static const u_int using_tlb=0;
#endif
  static u_int sp_in_mirror;
  u_int stop_after_jal;
  extern u_char restore_candidate[512];  // bitmap, one bit per page (set in get_addr)
  extern int cycle_count;
133
  /* registers that may be allocated */
  /* 1-31 gpr */
  /* Values up to MAXREG index the hsn[] tables built by lsn(); the ones
     above MAXREG are extra temporaries. */
#define HIREG 32 // hi
#define LOREG 33 // lo
#define FSREG 34 // FPU status (FCSR)
#define CSREG 35 // Coprocessor status
#define CCREG 36 // Cycle count
#define INVCP 37 // Pointer to invalid_code
#define MMREG 38 // Pointer to memory_map
#define ROREG 39 // ram offset (if rdram!=0x80000000)
#define TEMPREG 40
#define FTEMP 40 // FPU temporary register (same number as TEMPREG)
#define PTEMP 41 // Prefetch temporary register
#define TLREG 42 // TLB mapping offset
#define RHASH 43 // Return address hash
#define RHTBL 44 // Return address hash table address
#define RTEMP 45 // JR/JALR address register
#define MAXREG 45
#define AGEN1 46 // Address generation temporary register
#define AGEN2 47 // Address generation temporary register
#define MGEN1 48 // Maptable address generation temporary register
#define MGEN2 49 // Maptable address generation temporary register
#define BTREG 50 // Branch target temporary register

  /* instruction types (values stored in itype[]) */
#define NOP 0     // No operation
#define LOAD 1    // Load
#define STORE 2   // Store
#define LOADLR 3  // Unaligned load
#define STORELR 4 // Unaligned store
#define MOV 5     // Move
#define ALU 6     // Arithmetic/logic
#define MULTDIV 7 // Multiply/divide
#define SHIFT 8   // Shift by register
#define SHIFTIMM 9// Shift by immediate
#define IMM16 10  // 16-bit immediate
#define RJUMP 11  // Unconditional jump to register
#define UJUMP 12  // Unconditional jump
#define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
#define SJUMP 14  // Conditional branch (regimm format)
#define COP0 15   // Coprocessor 0
#define COP1 16   // Coprocessor 1
#define C1LS 17   // Coprocessor 1 load/store
#define FJUMP 18  // Conditional branch (floating point)
#define FLOAT 19  // Floating point unit
#define FCONV 20  // Convert integer to float
#define FCOMP 21  // Floating point compare (sets FSREG)
#define SYSCALL 22// SYSCALL
#define OTHER 23  // Other
#define SPAN 24   // Branch/delay slot spans 2 pages
#define NI 25     // Not implemented
#define HLECALL 26// PCSX fake opcodes for HLE
#define COP2 27   // Coprocessor 2 move
#define C2LS 28   // Coprocessor 2 load/store
#define C2OP 29   // Coprocessor 2 operation
#define INTCALL 30// Call interpreter to handle rare corner cases

  /* stubs */
#define CC_STUB 1
#define FP_STUB 2
#define LOADB_STUB 3
#define LOADH_STUB 4
#define LOADW_STUB 5
#define LOADD_STUB 6
#define LOADBU_STUB 7
#define LOADHU_STUB 8
#define STOREB_STUB 9
#define STOREH_STUB 10
#define STOREW_STUB 11
#define STORED_STUB 12
#define STORELR_STUB 13
#define INVCODE_STUB 14

  /* branch codes */
#define TAKEN 1
#define NOTTAKEN 2
#define NULLDS 3
211
// asm linkage
// NOTE(review): apart from new_recompile_block/get_addr_ht/remove_hash,
// these are not defined in this part of the file; presumably they live in
// the per-architecture assembly sources - confirm.
int new_recompile_block(int addr);
void *get_addr_ht(u_int vaddr);
void invalidate_block(u_int block);
void invalidate_addr(u_int addr);
void remove_hash(int vaddr);
void jump_vaddr();
void dyna_linker();
void dyna_linker_ds();
void verify_code();
void verify_code_vm();
void verify_code_ds();
void cc_interrupt();
void fp_exception();
void fp_exception_ds();
void jump_syscall();
void jump_syscall_hle();
void jump_eret();
void jump_hlecall();
void jump_intcall();
void new_dyna_leave();

// TLB
void TLBWI_new();
void TLBWR_new();
void read_nomem_new();
void read_nomemb_new();
void read_nomemh_new();
void read_nomemd_new();
void write_nomem_new();
void write_nomemb_new();
void write_nomemh_new();
void write_nomemd_new();
void write_rdram_new();
void write_rdramb_new();
void write_rdramh_new();
void write_rdramd_new();
// Indexed by guest page number (vaddr>>12); bit 0x40000000 is set and
// tested by the lookup code in this file.
extern u_int memory_map[1048576];

// Needed by assembler
// (forward declarations: the architecture-specific assembler source is
// #included into this file before these functions are defined)
void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
void load_all_regs(signed char i_regmap[]);
void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
void load_regs_entry(int t);
void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
259
// NOTE(review): not referenced in this part of the file.
int tracedebug=0;

//#define DEBUG_CYCLE_COUNT 1

// No-op sink for the debug macros below.  It is declared without a
// prototype so that it can be called with any printf-style argument list.
void nullf() {}
// Swap the two pairs of definitions to route assembler / invalidation
// debug output to printf.
//#define assem_debug printf
//#define inv_debug printf
#define assem_debug nullf
#define inv_debug nullf
269
270 static void tlb_hacks()
271 {
272 #ifndef DISABLE_TLB
273   // Goldeneye hack
274   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
275   {
276     u_int addr;
277     int n;
278     switch (ROM_HEADER->Country_code&0xFF) 
279     {
280       case 0x45: // U
281         addr=0x34b30;
282         break;                   
283       case 0x4A: // J 
284         addr=0x34b70;    
285         break;    
286       case 0x50: // E 
287         addr=0x329f0;
288         break;                        
289       default: 
290         // Unknown country code
291         addr=0;
292         break;
293     }
294     u_int rom_addr=(u_int)rom;
295     #ifdef ROM_COPY
296     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
297     // in the lower 4G of memory to use this hack.  Copy it if necessary.
298     if((void *)rom>(void *)0xffffffff) {
299       munmap(ROM_COPY, 67108864);
300       if(mmap(ROM_COPY, 12582912,
301               PROT_READ | PROT_WRITE,
302               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
303               -1, 0) <= 0) {printf("mmap() failed\n");}
304       memcpy(ROM_COPY,rom,12582912);
305       rom_addr=(u_int)ROM_COPY;
306     }
307     #endif
308     if(addr) {
309       for(n=0x7F000;n<0x80000;n++) {
310         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
311       }
312     }
313   }
314 #endif
315 }
316
317 static u_int get_page(u_int vaddr)
318 {
319 #ifndef PCSX
320   u_int page=(vaddr^0x80000000)>>12;
321 #else
322   u_int page=vaddr&~0xe0000000;
323   if (page < 0x1000000)
324     page &= ~0x0e00000; // RAM mirrors
325   page>>=12;
326 #endif
327 #ifndef DISABLE_TLB
328   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
329 #endif
330   if(page>2048) page=2048+(page&2047);
331   return page;
332 }
333
334 static u_int get_vpage(u_int vaddr)
335 {
336   u_int vpage=(vaddr^0x80000000)>>12;
337 #ifndef DISABLE_TLB
338   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
339 #endif
340   if(vpage>2048) vpage=2048+(vpage&2047);
341   return vpage;
342 }
343
// Get address from virtual address
// This is called from the recompiled JR/JALR instructions
//
// Lookup order:
//  1. jump_in[page]: an entry with the same vaddr and reg32==0 is returned
//     after being pushed to the front of its hash_table bin.
//  2. jump_dirty[vpage]: an entry is reused only if it is not close to
//     expiring from the cache and verify_dirty() accepts it; the page is
//     then marked valid again and flagged in restore_candidate.
//  3. Otherwise the block is recompiled and the lookup is retried.
// If recompilation fails, the exception registers are filled in and the
// code for address 0x80000000 is returned instead.
void *get_addr(u_int vaddr)
{
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  struct ll_entry *head;
  //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr&&head->reg32==0) {
  //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
      // Move the previous first pair to the second slot, then insert
      int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
      ht_bin[3]=ht_bin[1];
      ht_bin[2]=ht_bin[0];
      ht_bin[1]=(int)head->addr;
      ht_bin[0]=vaddr;
      return head->addr;
    }
    head=head->next;
  }
  head=jump_dirty[vpage];
  while(head!=NULL) {
    if(head->vaddr==vaddr&&head->reg32==0) {
      //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
      // Don't restore blocks which are about to expire from the cache
      if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
      if(verify_dirty(head->addr)) {
        //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
        invalid_code[vaddr>>12]=0;
        memory_map[vaddr>>12]|=0x40000000;
        if(vpage<2048) {
#ifndef DISABLE_TLB
          if(tlb_LUT_r[vaddr>>12]) {
            invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
            memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
          }
#endif
          restore_candidate[vpage>>3]|=1<<(vpage&7);
        }
        else restore_candidate[page>>3]|=1<<(page&7);
        int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
        if(ht_bin[0]==vaddr) {
          ht_bin[1]=(int)head->addr; // Replace existing entry
        }
        else
        {
          ht_bin[3]=ht_bin[1];
          ht_bin[2]=ht_bin[0];
          ht_bin[1]=(int)head->addr;
          ht_bin[0]=vaddr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
  int r=new_recompile_block(vaddr);
  if(r==0) return get_addr(vaddr);
  // Execute in unmapped page, generate pagefault exception
  Status|=2;
  Cause=(vaddr<<31)|0x8;
  EPC=(vaddr&1)?vaddr-5:vaddr;
  BadVAddr=(vaddr&~1);
  Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
  EntryHi=BadVAddr&0xFFFFE000;
  return get_addr_ht(0x80000000);
}
413 // Look up address in hash table first
414 void *get_addr_ht(u_int vaddr)
415 {
416   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
417   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
418   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
419   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
420   return get_addr(vaddr);
421 }
422
// Variant of get_addr() that takes a flags mask: a list entry only
// matches when (entry->reg32 & flags) == 0.
// With FORCE32 defined this simply forwards to get_addr().
// Otherwise the hash table is probed first, then jump_in, then jump_dirty
// (subject to the same expiry / verify_dirty() tests as get_addr), and
// finally the block is recompiled.  Entries are cached in the hash table
// only when their reg32 is 0, and only into a free slot.
void *get_addr_32(u_int vaddr,u_int flags)
{
#ifdef FORCE32
  return get_addr(vaddr);
#else
  //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
  int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
  if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
  if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  struct ll_entry *head;
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
      //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
      if(head->reg32==0) {
        // Cache only in an empty slot; existing entries are not evicted
        int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
        if(ht_bin[0]==-1) {
          ht_bin[1]=(int)head->addr;
          ht_bin[0]=vaddr;
        }else if(ht_bin[2]==-1) {
          ht_bin[3]=(int)head->addr;
          ht_bin[2]=vaddr;
        }
        //ht_bin[3]=ht_bin[1];
        //ht_bin[2]=ht_bin[0];
        //ht_bin[1]=(int)head->addr;
        //ht_bin[0]=vaddr;
      }
      return head->addr;
    }
    head=head->next;
  }
  head=jump_dirty[vpage];
  while(head!=NULL) {
    if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
      //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
      // Don't restore blocks which are about to expire from the cache
      if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
      if(verify_dirty(head->addr)) {
        //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
        invalid_code[vaddr>>12]=0;
        memory_map[vaddr>>12]|=0x40000000;
        if(vpage<2048) {
#ifndef DISABLE_TLB
          if(tlb_LUT_r[vaddr>>12]) {
            invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
            memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
          }
#endif
          restore_candidate[vpage>>3]|=1<<(vpage&7);
        }
        else restore_candidate[page>>3]|=1<<(page&7);
        if(head->reg32==0) {
          // Cache only in an empty slot; existing entries are not evicted
          int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
          if(ht_bin[0]==-1) {
            ht_bin[1]=(int)head->addr;
            ht_bin[0]=vaddr;
          }else if(ht_bin[2]==-1) {
            ht_bin[3]=(int)head->addr;
            ht_bin[2]=vaddr;
          }
          //ht_bin[3]=ht_bin[1];
          //ht_bin[2]=ht_bin[0];
          //ht_bin[1]=(int)head->addr;
          //ht_bin[0]=vaddr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
  int r=new_recompile_block(vaddr);
  // Note: the retry goes through get_addr(), i.e. without the flags mask
  if(r==0) return get_addr(vaddr);
  // Execute in unmapped page, generate pagefault exception
  Status|=2;
  Cause=(vaddr<<31)|0x8;
  EPC=(vaddr&1)?vaddr-5:vaddr;
  BadVAddr=(vaddr&~1);
  Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
  EntryHi=BadVAddr&0xFFFFE000;
  return get_addr_ht(0x80000000);
#endif
}
509
510 void clear_all_regs(signed char regmap[])
511 {
512   int hr;
513   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
514 }
515
516 signed char get_reg(signed char regmap[],int r)
517 {
518   int hr;
519   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
520   return -1;
521 }
522
523 // Find a register that is available for two consecutive cycles
524 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
525 {
526   int hr;
527   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
528   return -1;
529 }
530
531 int count_free_regs(signed char regmap[])
532 {
533   int count=0;
534   int hr;
535   for(hr=0;hr<HOST_REGS;hr++)
536   {
537     if(hr!=EXCLUDE_REG) {
538       if(regmap[hr]<0) count++;
539     }
540   }
541   return count;
542 }
543
544 void dirty_reg(struct regstat *cur,signed char reg)
545 {
546   int hr;
547   if(!reg) return;
548   for (hr=0;hr<HOST_REGS;hr++) {
549     if((cur->regmap[hr]&63)==reg) {
550       cur->dirty|=1<<hr;
551     }
552   }
553 }
554
555 // If we dirty the lower half of a 64 bit register which is now being
556 // sign-extended, we need to dump the upper half.
557 // Note: Do this only after completion of the instruction, because
558 // some instructions may need to read the full 64-bit value even if
559 // overwriting it (eg SLTI, DSRA32).
560 static void flush_dirty_uppers(struct regstat *cur)
561 {
562   int hr,reg;
563   for (hr=0;hr<HOST_REGS;hr++) {
564     if((cur->dirty>>hr)&1) {
565       reg=cur->regmap[hr];
566       if(reg>=64) 
567         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
568     }
569   }
570 }
571
572 void set_const(struct regstat *cur,signed char reg,uint64_t value)
573 {
574   int hr;
575   if(!reg) return;
576   for (hr=0;hr<HOST_REGS;hr++) {
577     if(cur->regmap[hr]==reg) {
578       cur->isconst|=1<<hr;
579       cur->constmap[hr]=value;
580     }
581     else if((cur->regmap[hr]^64)==reg) {
582       cur->isconst|=1<<hr;
583       cur->constmap[hr]=value>>32;
584     }
585   }
586 }
587
588 void clear_const(struct regstat *cur,signed char reg)
589 {
590   int hr;
591   if(!reg) return;
592   for (hr=0;hr<HOST_REGS;hr++) {
593     if((cur->regmap[hr]&63)==reg) {
594       cur->isconst&=~(1<<hr);
595     }
596   }
597 }
598
599 int is_const(struct regstat *cur,signed char reg)
600 {
601   int hr;
602   if(!reg) return 1;
603   for (hr=0;hr<HOST_REGS;hr++) {
604     if((cur->regmap[hr]&63)==reg) {
605       return (cur->isconst>>hr)&1;
606     }
607   }
608   return 0;
609 }
610 uint64_t get_const(struct regstat *cur,signed char reg)
611 {
612   int hr;
613   if(!reg) return 0;
614   for (hr=0;hr<HOST_REGS;hr++) {
615     if(cur->regmap[hr]==reg) {
616       return cur->constmap[hr];
617     }
618   }
619   printf("Unknown constant in r%d\n",reg);
620   exit(1);
621 }
622
// Least soon needed registers
// Look at the next ten instructions and see which registers
// will be used.  Try not to reallocate these.
//
// hsn[]         - table indexed by guest register number; on return each
//                 touched entry holds the distance (in instructions from i)
//                 to the nearest use found, so smaller means needed sooner.
//                 The caller pre-fills it with a "far" value.
// i             - index of the current instruction
// preferred_reg - not written by this function
void lsn(u_char hsn[], int i, int *preferred_reg)
{
  int j;
  int b=-1;
  // Find how far ahead to scan: stop at the end of the block or just
  // after an unconditional jump
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditional jump
      j++;
      break;
    }
  }
  // Walk backwards so that the nearest use overwrites farther ones
  for(;j>=0;j--)
  {
    if(rs1[i+j]) hsn[rs1[i+j]]=j;
    if(rs2[i+j]) hsn[rs2[i+j]]=j;
    if(rt1[i+j]) hsn[rt1[i+j]]=j;
    if(rt2[i+j]) hsn[rt2[i+j]]=j;
    if(itype[i+j]==STORE || itype[i+j]==STORELR) {
      // Stores can allocate zero
      hsn[rs1[i+j]]=j;
      hsn[rs2[i+j]]=j;
    }
    // On some architectures stores need invc_ptr
    #if defined(HOST_IMM8)
    if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
      hsn[INVCP]=j;
    }
    #endif
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
    {
      // Branches need the cycle count; remember the nearest branch in b
      hsn[CCREG]=j;
      b=j;
    }
  }
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        // Only registers read at the target are considered, and only if
        // that makes them needed sooner than already recorded
        if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
        if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
        //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
        //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
      }
    }
    // TODO: preferred register based on backward branch
  }
  // Delay slot should preferably not overwrite branch conditions or cycle count
  if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
    if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
    if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
    hsn[CCREG]=1;
    // ...or hash tables
    hsn[RHASH]=1;
    hsn[RHTBL]=1;
  }
  // Coprocessor load/store needs FTEMP, even if not declared
  if(itype[i]==C1LS||itype[i]==C2LS) {
    hsn[FTEMP]=0;
  }
  // Load L/R also uses FTEMP as a temporary register
  if(itype[i]==LOADLR) {
    hsn[FTEMP]=0;
  }
  // Also SWL/SWR/SDL/SDR
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
    hsn[FTEMP]=0;
  }
  // Don't remove the TLB registers either
  if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS || itype[i]==C2LS) {
    hsn[TLREG]=0;
  }
  // Don't remove the miniht registers
  if(itype[i]==UJUMP||itype[i]==RJUMP)
  {
    hsn[RHASH]=0;
    hsn[RHTBL]=0;
  }
}
715
// We only want to allocate registers if we're going to use them again soon
//
// r - guest register, i - index of the current instruction.
// Computes rn, the distance to the next instruction (1..9 ahead) that
// reads r, reset to 10 whenever r is flagged unneeded at that instruction.
// Returns 1 if rn is smaller than some hsn[] entry examined below, else 0.
// Returns 0 immediately when the previous instruction is an unconditional
// jump that leaves the block.
int needed_again(int r, int i)
{
  int j;
  int b=-1;
  int rn=10;
  int hr;
  u_char hsn[MAXREG+1];
  int preferred_reg;
  
  // 10 means "not needed within the scanned window"
  memset(hsn,10,sizeof(hsn));
  lsn(hsn,i,&preferred_reg);
  
  if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
  {
    if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
      return 0; // Don't need any registers if exiting the block
  }
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditional jump
      j++;
      break;
    }
    if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
    {
      break;
    }
  }
  // Walk backwards so the nearest read of r ends up in rn
  for(;j>=1;j--)
  {
    if(rs1[i+j]==r) rn=j;
    if(rs2[i+j]==r) rn=j;
    if((unneeded_reg[i+j]>>r)&1) rn=10;
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
    {
      b=j;
    }
  }
  /*
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int o=rn;
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        if(!((unneeded_reg[t+j]>>r)&1)) {
          if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
          if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
        }
        else rn=o;
      }
    }
  }*/
  // NOTE(review): hsn[] is filled by lsn() using guest register numbers
  // but is indexed here by host register number - confirm this is intended.
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if(rn<hsn[hr]) return 1;
    }
  }
  return 0;
}
787
788 // Try to match register allocations at the end of a loop with those
789 // at the beginning
790 int loop_reg(int i, int r, int hr)
791 {
792   int j,k;
793   for(j=0;j<9;j++)
794   {
795     if(i+j>=slen) {
796       j=slen-i-1;
797       break;
798     }
799     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
800     {
801       // Don't go past an unconditonal jump
802       j++;
803       break;
804     }
805   }
806   k=0;
807   if(i>0){
808     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
809       k--;
810   }
811   for(;k<j;k++)
812   {
813     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
814     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
815     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
816     {
817       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
818       {
819         int t=(ba[i+k]-start)>>2;
820         int reg=get_reg(regs[t].regmap_entry,r);
821         if(reg>=0) return reg;
822         //reg=get_reg(regs[t+1].regmap_entry,r);
823         //if(reg>=0) return reg;
824       }
825     }
826   }
827   return hr;
828 }
829
830
831 // Allocate every register, preserving source/target regs
832 void alloc_all(struct regstat *cur,int i)
833 {
834   int hr;
835   
836   for(hr=0;hr<HOST_REGS;hr++) {
837     if(hr!=EXCLUDE_REG) {
838       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
839          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
840       {
841         cur->regmap[hr]=-1;
842         cur->dirty&=~(1<<hr);
843       }
844       // Don't need zeros
845       if((cur->regmap[hr]&63)==0)
846       {
847         cur->regmap[hr]=-1;
848         cur->dirty&=~(1<<hr);
849       }
850     }
851   }
852 }
853
854
855 void div64(int64_t dividend,int64_t divisor)
856 {
857   lo=dividend/divisor;
858   hi=dividend%divisor;
859   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
860   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
861 }
862 void divu64(uint64_t dividend,uint64_t divisor)
863 {
864   lo=dividend/divisor;
865   hi=dividend%divisor;
866   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
867   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
868 }
869
870 void mult64(uint64_t m1,uint64_t m2)
871 {
872    unsigned long long int op1, op2, op3, op4;
873    unsigned long long int result1, result2, result3, result4;
874    unsigned long long int temp1, temp2, temp3, temp4;
875    int sign = 0;
876    
877    if (m1 < 0)
878      {
879     op2 = -m1;
880     sign = 1 - sign;
881      }
882    else op2 = m1;
883    if (m2 < 0)
884      {
885     op4 = -m2;
886     sign = 1 - sign;
887      }
888    else op4 = m2;
889    
890    op1 = op2 & 0xFFFFFFFF;
891    op2 = (op2 >> 32) & 0xFFFFFFFF;
892    op3 = op4 & 0xFFFFFFFF;
893    op4 = (op4 >> 32) & 0xFFFFFFFF;
894    
895    temp1 = op1 * op3;
896    temp2 = (temp1 >> 32) + op1 * op4;
897    temp3 = op2 * op3;
898    temp4 = (temp3 >> 32) + op2 * op4;
899    
900    result1 = temp1 & 0xFFFFFFFF;
901    result2 = temp2 + (temp3 & 0xFFFFFFFF);
902    result3 = (result2 >> 32) + temp4;
903    result4 = (result3 >> 32);
904    
905    lo = result1 | (result2 << 32);
906    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
907    if (sign)
908      {
909     hi = ~hi;
910     if (!lo) hi++;
911     else lo = ~lo + 1;
912      }
913 }
914
915 void multu64(uint64_t m1,uint64_t m2)
916 {
917    unsigned long long int op1, op2, op3, op4;
918    unsigned long long int result1, result2, result3, result4;
919    unsigned long long int temp1, temp2, temp3, temp4;
920    
921    op1 = m1 & 0xFFFFFFFF;
922    op2 = (m1 >> 32) & 0xFFFFFFFF;
923    op3 = m2 & 0xFFFFFFFF;
924    op4 = (m2 >> 32) & 0xFFFFFFFF;
925    
926    temp1 = op1 * op3;
927    temp2 = (temp1 >> 32) + op1 * op4;
928    temp3 = op2 * op3;
929    temp4 = (temp3 >> 32) + op2 * op4;
930    
931    result1 = temp1 & 0xFFFFFFFF;
932    result2 = temp2 + (temp3 & 0xFFFFFFFF);
933    result3 = (result2 >> 32) + temp4;
934    result4 = (result3 >> 32);
935    
936    lo = result1 | (result2 << 32);
937    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
938    
939   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
940   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
941 }
942
// Merge helper for LDL: keep the low `bits` bits of original and fill the
// rest with loaded shifted left by `bits`.  With bits==0 the result is
// simply loaded.
uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
{
  if(bits==0) return loaded;
  // Shifting up and back down clears everything above the low `bits` bits
  uint64_t kept=(original<<(64-bits))>>(64-bits);
  return kept|(loaded<<bits);
}
// Merge helper for LDR: with shift = bits^56, keep the high `shift` bits of
// original and fill the rest with loaded shifted right by `shift`.  When
// shift is 0 (bits==56) the result is simply loaded.
uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
{
  u_int shift=bits^56;
  if(shift==0) return loaded;
  // Shifting down and back up clears everything below the high `shift` bits
  uint64_t kept=(original>>(64-shift))<<(64-shift);
  return kept|(loaded>>shift);
}
965
966 #ifdef __i386__
967 #include "assem_x86.c"
968 #endif
969 #ifdef __x86_64__
970 #include "assem_x64.c"
971 #endif
972 #ifdef __arm__
973 #include "assem_arm.c"
974 #endif
975
976 // Add virtual address mapping to linked list
977 void ll_add(struct ll_entry **head,int vaddr,void *addr)
978 {
979   struct ll_entry *new_entry;
980   new_entry=malloc(sizeof(struct ll_entry));
981   assert(new_entry!=NULL);
982   new_entry->vaddr=vaddr;
983   new_entry->reg32=0;
984   new_entry->addr=addr;
985   new_entry->next=*head;
986   *head=new_entry;
987 }
988
989 // Add virtual address mapping for 32-bit compiled block
990 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
991 {
992   ll_add(head,vaddr,addr);
993 #ifndef FORCE32
994   (*head)->reg32=reg32;
995 #endif
996 }
997
// Check if an address is already compiled
// but don't return addresses which are about to expire from the cache
//
// Returns the host address for vaddr, or 0 if no usable translation exists.
// The hash table is probed first (a hit must also pass isclean()), then
// the jump_in list of the page (entries with reg32==0 only).  A list hit
// refreshes the matching hash slot or, failing that, is cached in a free
// slot of the bin.
void *check_addr(u_int vaddr)
{
  u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
  if(ht_bin[0]==vaddr) {
    if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
      if(isclean(ht_bin[1])) return (void *)ht_bin[1];
  }
  if(ht_bin[2]==vaddr) {
    if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
      if(isclean(ht_bin[3])) return (void *)ht_bin[3];
  }
  u_int page=get_page(vaddr);
  struct ll_entry *head;
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr&&head->reg32==0) {
      // Skip entries that are about to expire from the cache
      if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
        // Update existing entry with current address
        if(ht_bin[0]==vaddr) {
          ht_bin[1]=(int)head->addr;
          return head->addr;
        }
        if(ht_bin[2]==vaddr) {
          ht_bin[3]=(int)head->addr;
          return head->addr;
        }
        // Insert into hash table with low priority.
        // Don't evict existing entries, as they are probably
        // addresses that are being accessed frequently.
        if(ht_bin[0]==-1) {
          ht_bin[1]=(int)head->addr;
          ht_bin[0]=vaddr;
        }else if(ht_bin[2]==-1) {
          ht_bin[3]=(int)head->addr;
          ht_bin[2]=vaddr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  return 0;
}
1043
1044 void remove_hash(int vaddr)
1045 {
1046   //printf("remove hash: %x\n",vaddr);
1047   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1048   if(ht_bin[2]==vaddr) {
1049     ht_bin[2]=ht_bin[3]=-1;
1050   }
1051   if(ht_bin[0]==vaddr) {
1052     ht_bin[0]=ht_bin[2];
1053     ht_bin[1]=ht_bin[3];
1054     ht_bin[2]=ht_bin[3]=-1;
1055   }
1056 }
1057
1058 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1059 {
1060   struct ll_entry *next;
1061   while(*head) {
1062     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1063        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1064     {
1065       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1066       remove_hash((*head)->vaddr);
1067       next=(*head)->next;
1068       free(*head);
1069       *head=next;
1070     }
1071     else
1072     {
1073       head=&((*head)->next);
1074     }
1075   }
1076 }
1077
1078 // Remove all entries from linked list
1079 void ll_clear(struct ll_entry **head)
1080 {
1081   struct ll_entry *cur;
1082   struct ll_entry *next;
1083   if(cur=*head) {
1084     *head=0;
1085     while(cur) {
1086       next=cur->next;
1087       free(cur);
1088       cur=next;
1089     }
1090   }
1091 }
1092
1093 // Dereference the pointers and remove if it matches
1094 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1095 {
1096   while(head) {
1097     int ptr=get_pointer(head->addr);
1098     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1099     if(((ptr>>shift)==(addr>>shift)) ||
1100        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1101     {
1102       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1103       u_int host_addr=(u_int)kill_pointer(head->addr);
1104       #ifdef __arm__
1105         needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1106       #endif
1107     }
1108     head=head->next;
1109   }
1110 }
1111
1112 // This is called when we write to a compiled block (see do_invstub)
1113 void invalidate_page(u_int page)
1114 {
1115   struct ll_entry *head;
1116   struct ll_entry *next;
1117   head=jump_in[page];
1118   jump_in[page]=0;
1119   while(head!=NULL) {
1120     inv_debug("INVALIDATE: %x\n",head->vaddr);
1121     remove_hash(head->vaddr);
1122     next=head->next;
1123     free(head);
1124     head=next;
1125   }
1126   head=jump_out[page];
1127   jump_out[page]=0;
1128   while(head!=NULL) {
1129     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1130     u_int host_addr=(u_int)kill_pointer(head->addr);
1131     #ifdef __arm__
1132       needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1133     #endif
1134     next=head->next;
1135     free(head);
1136     head=next;
1137   }
1138 }
// Invalidate the compiled code belonging to one 4K block of guest memory.
// `block` is a guest address shifted right by 12.
// The function looks at the dirty-block list for the matching virtual page
// to find out how far compiled blocks extend into neighbouring pages,
// invalidates this page and those neighbours, and finally marks the block
// in invalid_code[] so that further writes are no longer trapped.
1139 void invalidate_block(u_int block)
1140 {
1141   u_int page=get_page(block<<12);
1142   u_int vpage=get_vpage(block<<12);
1143   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1144   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
  // [first,last] starts as just this page and is widened below
1145   u_int first,last;
1146   first=last=page;
1147   struct ll_entry *head;
1148   head=jump_dirty[vpage];
1149   //printf("page=%d vpage=%d\n",page,vpage);
1150   while(head!=NULL) {
1151     u_int start,end;
1152     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
      // get_bounds() reports the source range [start,end) of this block
1153       get_bounds((int)head->addr,&start,&end);
1154       //printf("start: %x end: %x\n",start,end);
      // Source range given directly as 0x80000000-based RAM addresses
1155       if(page<2048&&start>=0x80000000&&end<0x80000000+RAM_SIZE) {
1156         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1157           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1158           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1159         }
1160       }
1161 #ifndef DISABLE_TLB
      // Source range at 0xC0000000 and above: translate through memory_map
1162       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1163         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1164           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1165           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1166         }
1167       }
1168 #endif
1169     }
1170     head=head->next;
1171   }
1172   //printf("first=%d last=%d\n",first,last);
1173   invalidate_page(page);
1174   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1175   assert(last<page+5);
1176   // Invalidate the adjacent pages if a block crosses a 4K boundary
1177   while(first<page) {
1178     invalidate_page(first);
1179     first++;
1180   }
  // NOTE(review): this loop stops before `last`, so page `last` itself is
  // not passed to invalidate_page() here - confirm that is intended.
1181   for(first=page+1;first<last;first++) {
1182     invalidate_page(first);
1183   }
1184   #ifdef __arm__
1185     do_clear_cache();
1186   #endif
1187   
1188   // Don't trap writes
1189   invalid_code[block]=1;
1190 #ifdef PCSX
  // Also mark the same page at its 0x80000000-based block number
1191   invalid_code[((u_int)0x80000000>>12)|page]=1;
1192 #endif
1193 #ifndef DISABLE_TLB
1194   // If there is a valid TLB entry for this page, remove write protect
1195   if(tlb_LUT_w[block]) {
1196     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1197     // CHECK: Is this right?
1198     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1199     u_int real_block=tlb_LUT_w[block]>>12;
1200     invalid_code[real_block]=1;
1201     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1202   }
1203   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1204 #endif
1205
  // Reset every mini_ht entry to -1
1206   #ifdef USE_MINI_HT
1207   memset(mini_ht,-1,sizeof(mini_ht));
1208   #endif
1209 }
1210 void invalidate_addr(u_int addr)
1211 {
1212   invalidate_block(addr>>12);
1213 }
1214 // This is called when loading a save state.
1215 // Anything could have changed, so invalidate everything.
1216 void invalidate_all_pages()
1217 {
1218   u_int page,n;
1219   for(page=0;page<4096;page++)
1220     invalidate_page(page);
1221   for(page=0;page<1048576;page++)
1222     if(!invalid_code[page]) {
1223       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1224       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1225     }
1226   #ifdef __arm__
1227   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1228   #endif
1229   #ifdef USE_MINI_HT
1230   memset(mini_ht,-1,sizeof(mini_ht));
1231   #endif
1232   #ifndef DISABLE_TLB
1233   // TLB
1234   for(page=0;page<0x100000;page++) {
1235     if(tlb_LUT_r[page]) {
1236       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1237       if(!tlb_LUT_w[page]||!invalid_code[page])
1238         memory_map[page]|=0x40000000; // Write protect
1239     }
1240     else memory_map[page]=-1;
1241     if(page==0x80000) page=0xC0000;
1242   }
1243   tlb_hacks();
1244   #endif
1245 }
1246
1247 // Add an entry to jump_out after making a link
1248 void add_link(u_int vaddr,void *src)
1249 {
1250   u_int page=get_page(vaddr);
1251   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1252   ll_add(jump_out+page,vaddr,src);
1253   //int ptr=get_pointer(src);
1254   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1255 }
1256
1257 // If a code block was found to be unmodified (bit was set in
1258 // restore_candidate) and it remains unmodified (bit is clear
1259 // in invalid_code) then move the entries for that 4K page from
1260 // the dirty list to the clean list.
// page: index of the jump_dirty[] list to scan.
// Entries are not removed from the dirty list; a restored block gets an
// additional entry in jump_in[] that points at its clean address.
1261 void clean_blocks(u_int page)
1262 {
1263   struct ll_entry *head;
1264   inv_debug("INV: clean_blocks page=%d\n",page);
1265   head=jump_dirty[page];
1266   while(head!=NULL) {
    // Only consider blocks whose page is not currently flagged invalid
1267     if(!invalid_code[head->vaddr>>12]) {
1268       // Don't restore blocks which are about to expire from the cache
1269       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1270         u_int start,end;
1271         if(verify_dirty((int)head->addr)) {
1272           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1273           u_int i;
          // inv becomes nonzero if anything below rules out restoring
1274           u_int inv=0;
1275           get_bounds((int)head->addr,&start,&end);
          // If the source lies in RAM, every page it covers must be valid
1276           if(start-(u_int)rdram<RAM_SIZE) {
1277             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1278               inv|=invalid_code[i];
1279             }
1280           }
          // At 0xC0000000 and above, the address translated through
          // memory_map must still fall inside [start,end)
1281           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1282             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1283             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1284             if(addr<start||addr>=end) inv=1;
1285           }
          // Addresses from 0x80000000+RAM_SIZE upward are never restored
1286           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1287             inv=1;
1288           }
1289           if(!inv) {
1290             void * clean_addr=(void *)get_clean_addr((int)head->addr);
            // The clean entry point must not be about to expire either
1291             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1292               u_int ppage=page;
1293 #ifndef DISABLE_TLB
              // With a TLB read mapping, file the entry under the mapped page
1294               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1295 #endif
1296               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1297               //printf("page=%x, addr=%x\n",page,head->vaddr);
1298               //assert(head->vaddr>>12==(page|0x80000));
1299               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1300               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
              // The hash table is only updated for entries with reg32==0
1301               if(!head->reg32) {
1302                 if(ht_bin[0]==head->vaddr) {
1303                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1304                 }
1305                 if(ht_bin[2]==head->vaddr) {
1306                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1307                 }
1308               }
1309             }
1310           }
1311         }
1312       }
1313     }
1314     head=head->next;
1315   }
1316 }
1317
1318
1319 void mov_alloc(struct regstat *current,int i)
1320 {
1321   // Note: Don't need to actually alloc the source registers
1322   if((~current->is32>>rs1[i])&1) {
1323     //alloc_reg64(current,i,rs1[i]);
1324     alloc_reg64(current,i,rt1[i]);
1325     current->is32&=~(1LL<<rt1[i]);
1326   } else {
1327     //alloc_reg(current,i,rs1[i]);
1328     alloc_reg(current,i,rt1[i]);
1329     current->is32|=(1LL<<rt1[i]);
1330   }
1331   clear_const(current,rs1[i]);
1332   clear_const(current,rt1[i]);
1333   dirty_reg(current,rt1[i]);
1334 }
1335
1336 void shiftimm_alloc(struct regstat *current,int i)
1337 {
1338   clear_const(current,rs1[i]);
1339   clear_const(current,rt1[i]);
1340   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1341   {
1342     if(rt1[i]) {
1343       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1344       else lt1[i]=rs1[i];
1345       alloc_reg(current,i,rt1[i]);
1346       current->is32|=1LL<<rt1[i];
1347       dirty_reg(current,rt1[i]);
1348     }
1349   }
1350   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1351   {
1352     if(rt1[i]) {
1353       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1354       alloc_reg64(current,i,rt1[i]);
1355       current->is32&=~(1LL<<rt1[i]);
1356       dirty_reg(current,rt1[i]);
1357     }
1358   }
1359   if(opcode2[i]==0x3c) // DSLL32
1360   {
1361     if(rt1[i]) {
1362       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1363       alloc_reg64(current,i,rt1[i]);
1364       current->is32&=~(1LL<<rt1[i]);
1365       dirty_reg(current,rt1[i]);
1366     }
1367   }
1368   if(opcode2[i]==0x3e) // DSRL32
1369   {
1370     if(rt1[i]) {
1371       alloc_reg64(current,i,rs1[i]);
1372       if(imm[i]==32) {
1373         alloc_reg64(current,i,rt1[i]);
1374         current->is32&=~(1LL<<rt1[i]);
1375       } else {
1376         alloc_reg(current,i,rt1[i]);
1377         current->is32|=1LL<<rt1[i];
1378       }
1379       dirty_reg(current,rt1[i]);
1380     }
1381   }
1382   if(opcode2[i]==0x3f) // DSRA32
1383   {
1384     if(rt1[i]) {
1385       alloc_reg64(current,i,rs1[i]);
1386       alloc_reg(current,i,rt1[i]);
1387       current->is32|=1LL<<rt1[i];
1388       dirty_reg(current,rt1[i]);
1389     }
1390   }
1391 }
1392
1393 void shift_alloc(struct regstat *current,int i)
1394 {
1395   if(rt1[i]) {
1396     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1397     {
1398       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1399       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1400       alloc_reg(current,i,rt1[i]);
1401       if(rt1[i]==rs2[i]) {
1402         alloc_reg_temp(current,i,-1);
1403         minimum_free_regs[i]=1;
1404       }
1405       current->is32|=1LL<<rt1[i];
1406     } else { // DSLLV/DSRLV/DSRAV
1407       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1408       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1409       alloc_reg64(current,i,rt1[i]);
1410       current->is32&=~(1LL<<rt1[i]);
1411       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1412       {
1413         alloc_reg_temp(current,i,-1);
1414         minimum_free_regs[i]=1;
1415       }
1416     }
1417     clear_const(current,rs1[i]);
1418     clear_const(current,rs2[i]);
1419     clear_const(current,rt1[i]);
1420     dirty_reg(current,rt1[i]);
1421   }
1422 }
1423
// Register allocation for three-register ALU instructions.
// Dispatches on opcode2[i]; the four groups handled are mutually exclusive
// opcode ranges.  Each group allocates host registers for the sources and
// the target and updates the target's bit in current->is32.  Whatever the
// opcode, the function finishes by clearing constant state for rs1, rs2
// and rt1 and marking rt1 dirty.
1424 void alu_alloc(struct regstat *current,int i)
1425 {
1426   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1427     if(rt1[i]) {
1428       if(rs1[i]&&rs2[i]) {
1429         alloc_reg(current,i,rs1[i]);
1430         alloc_reg(current,i,rs2[i]);
1431       }
1432       else {
        // One operand is r0: allocate the other only if needed_again() says so
1433         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1434         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1435       }
1436       alloc_reg(current,i,rt1[i]);
1437     }
    // The is32 bit is set even when rt1 is r0
1438     current->is32|=1LL<<rt1[i];
1439   }
1440   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1441     if(rt1[i]) {
      // Sources are allocated as 64-bit unless both are flagged in is32;
      // the target is allocated as 32-bit either way
1442       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1443       {
1444         alloc_reg64(current,i,rs1[i]);
1445         alloc_reg64(current,i,rs2[i]);
1446         alloc_reg(current,i,rt1[i]);
1447       } else {
1448         alloc_reg(current,i,rs1[i]);
1449         alloc_reg(current,i,rs2[i]);
1450         alloc_reg(current,i,rt1[i]);
1451       }
1452     }
1453     current->is32|=1LL<<rt1[i];
1454   }
1455   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1456     if(rt1[i]) {
1457       if(rs1[i]&&rs2[i]) {
1458         alloc_reg(current,i,rs1[i]);
1459         alloc_reg(current,i,rs2[i]);
1460       }
1461       else
1462       {
1463         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1464         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1465       }
1466       alloc_reg(current,i,rt1[i]);
      // If either source is not flagged 32-bit, the target is not either
1467       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1468       {
        // Upper half is allocated only when rt1's bit in current->uu is clear
1469         if(!((current->uu>>rt1[i])&1)) {
1470           alloc_reg64(current,i,rt1[i]);
1471         }
        // If the target has an upper-half register, widen the sources too
1472         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1473           if(rs1[i]&&rs2[i]) {
1474             alloc_reg64(current,i,rs1[i]);
1475             alloc_reg64(current,i,rs2[i]);
1476           }
1477           else
1478           {
1479             // Is is really worth it to keep 64-bit values in registers?
1480             #ifdef NATIVE_64BIT
1481             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1482             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1483             #endif
1484           }
1485         }
1486         current->is32&=~(1LL<<rt1[i]);
1487       } else {
1488         current->is32|=1LL<<rt1[i];
1489       }
1490     }
1491   }
1492   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1493     if(rt1[i]) {
1494       if(rs1[i]&&rs2[i]) {
        // Full 64-bit allocation when rt1's bit in uu is clear or the
        // target already has an upper-half register; 32-bit otherwise
1495         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1496           alloc_reg64(current,i,rs1[i]);
1497           alloc_reg64(current,i,rs2[i]);
1498           alloc_reg64(current,i,rt1[i]);
1499         } else {
1500           alloc_reg(current,i,rs1[i]);
1501           alloc_reg(current,i,rs2[i]);
1502           alloc_reg(current,i,rt1[i]);
1503         }
1504       }
1505       else {
1506         alloc_reg(current,i,rt1[i]);
1507         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1508           // DADD used as move, or zeroing
1509           // If we have a 64-bit source, then make the target 64 bits too
1510           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1511             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1512             alloc_reg64(current,i,rt1[i]);
1513           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1514             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1515             alloc_reg64(current,i,rt1[i]);
1516           }
1517           if(opcode2[i]>=0x2e&&rs2[i]) {
1518             // DSUB used as negation - 64-bit result
1519             // If we have a 32-bit register, extend it to 64 bits
1520             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1521             alloc_reg64(current,i,rt1[i]);
1522           }
1523         }
1524       }
      // is32 for the target: cleared with two sources, copied from the
      // single nonzero source, set when both sources are r0
1525       if(rs1[i]&&rs2[i]) {
1526         current->is32&=~(1LL<<rt1[i]);
1527       } else if(rs1[i]) {
1528         current->is32&=~(1LL<<rt1[i]);
1529         if((current->is32>>rs1[i])&1)
1530           current->is32|=1LL<<rt1[i];
1531       } else if(rs2[i]) {
1532         current->is32&=~(1LL<<rt1[i]);
1533         if((current->is32>>rs2[i])&1)
1534           current->is32|=1LL<<rt1[i];
1535       } else {
1536         current->is32|=1LL<<rt1[i];
1537       }
1538     }
1539   }
1540   clear_const(current,rs1[i]);
1541   clear_const(current,rs2[i]);
1542   clear_const(current,rt1[i]);
1543   dirty_reg(current,rt1[i]);
1544 }
1545
1546 void imm16_alloc(struct regstat *current,int i)
1547 {
1548   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1549   else lt1[i]=rs1[i];
1550   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1551   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1552     current->is32&=~(1LL<<rt1[i]);
1553     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1554       // TODO: Could preserve the 32-bit flag if the immediate is zero
1555       alloc_reg64(current,i,rt1[i]);
1556       alloc_reg64(current,i,rs1[i]);
1557     }
1558     clear_const(current,rs1[i]);
1559     clear_const(current,rt1[i]);
1560   }
1561   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1562     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1563     current->is32|=1LL<<rt1[i];
1564     clear_const(current,rs1[i]);
1565     clear_const(current,rt1[i]);
1566   }
1567   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1568     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1569       if(rs1[i]!=rt1[i]) {
1570         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1571         alloc_reg64(current,i,rt1[i]);
1572         current->is32&=~(1LL<<rt1[i]);
1573       }
1574     }
1575     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1576     if(is_const(current,rs1[i])) {
1577       int v=get_const(current,rs1[i]);
1578       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1579       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1580       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1581     }
1582     else clear_const(current,rt1[i]);
1583   }
1584   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1585     if(is_const(current,rs1[i])) {
1586       int v=get_const(current,rs1[i]);
1587       set_const(current,rt1[i],v+imm[i]);
1588     }
1589     else clear_const(current,rt1[i]);
1590     current->is32|=1LL<<rt1[i];
1591   }
1592   else {
1593     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1594     current->is32|=1LL<<rt1[i];
1595   }
1596   dirty_reg(current,rt1[i]);
1597 }
1598
1599 void load_alloc(struct regstat *current,int i)
1600 {
1601   clear_const(current,rt1[i]);
1602   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1603   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1604   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1605   if(rt1[i]) {
1606     alloc_reg(current,i,rt1[i]);
1607     if(get_reg(current->regmap,rt1[i])<0) {
1608       // dummy load, but we still need a register to calculate the address
1609       alloc_reg_temp(current,i,-1);
1610       minimum_free_regs[i]=1;
1611     }
1612     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1613     {
1614       current->is32&=~(1LL<<rt1[i]);
1615       alloc_reg64(current,i,rt1[i]);
1616     }
1617     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1618     {
1619       current->is32&=~(1LL<<rt1[i]);
1620       alloc_reg64(current,i,rt1[i]);
1621       alloc_all(current,i);
1622       alloc_reg64(current,i,FTEMP);
1623       minimum_free_regs[i]=HOST_REGS;
1624     }
1625     else current->is32|=1LL<<rt1[i];
1626     dirty_reg(current,rt1[i]);
1627     // If using TLB, need a register for pointer to the mapping table
1628     if(using_tlb) alloc_reg(current,i,TLREG);
1629     // LWL/LWR need a temporary register for the old value
1630     if(opcode[i]==0x22||opcode[i]==0x26)
1631     {
1632       alloc_reg(current,i,FTEMP);
1633       alloc_reg_temp(current,i,-1);
1634       minimum_free_regs[i]=1;
1635     }
1636   }
1637   else
1638   {
1639     // Load to r0 (dummy load)
1640     // but we still need a register to calculate the address
1641     if(opcode[i]==0x22||opcode[i]==0x26)
1642     {
1643       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1644     }
1645     alloc_reg_temp(current,i,-1);
1646     minimum_free_regs[i]=1;
1647     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1648     {
1649       alloc_all(current,i);
1650       alloc_reg64(current,i,FTEMP);
1651       minimum_free_regs[i]=HOST_REGS;
1652     }
1653   }
1654 }
1655
1656 void store_alloc(struct regstat *current,int i)
1657 {
1658   clear_const(current,rs2[i]);
1659   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1660   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1661   alloc_reg(current,i,rs2[i]);
1662   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1663     alloc_reg64(current,i,rs2[i]);
1664     if(rs2[i]) alloc_reg(current,i,FTEMP);
1665   }
1666   // If using TLB, need a register for pointer to the mapping table
1667   if(using_tlb) alloc_reg(current,i,TLREG);
1668   #if defined(HOST_IMM8)
1669   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1670   else alloc_reg(current,i,INVCP);
1671   #endif
1672   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWL/SDL/SDR
1673     alloc_reg(current,i,FTEMP);
1674   }
1675   // We need a temporary register for address generation
1676   alloc_reg_temp(current,i,-1);
1677   minimum_free_regs[i]=1;
1678 }
1679
1680 void c1ls_alloc(struct regstat *current,int i)
1681 {
1682   //clear_const(current,rs1[i]); // FIXME
1683   clear_const(current,rt1[i]);
1684   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1685   alloc_reg(current,i,CSREG); // Status
1686   alloc_reg(current,i,FTEMP);
1687   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1688     alloc_reg64(current,i,FTEMP);
1689   }
1690   // If using TLB, need a register for pointer to the mapping table
1691   if(using_tlb) alloc_reg(current,i,TLREG);
1692   #if defined(HOST_IMM8)
1693   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1694   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1695     alloc_reg(current,i,INVCP);
1696   #endif
1697   // We need a temporary register for address generation
1698   alloc_reg_temp(current,i,-1);
1699 }
1700
1701 void c2ls_alloc(struct regstat *current,int i)
1702 {
1703   clear_const(current,rt1[i]);
1704   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1705   alloc_reg(current,i,FTEMP);
1706   // If using TLB, need a register for pointer to the mapping table
1707   if(using_tlb) alloc_reg(current,i,TLREG);
1708   #if defined(HOST_IMM8)
1709   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1710   else if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1711     alloc_reg(current,i,INVCP);
1712   #endif
1713   // We need a temporary register for address generation
1714   alloc_reg_temp(current,i,-1);
1715   minimum_free_regs[i]=1;
1716 }
1717
1718 #ifndef multdiv_alloc
1719 void multdiv_alloc(struct regstat *current,int i)
1720 {
1721   //  case 0x18: MULT
1722   //  case 0x19: MULTU
1723   //  case 0x1A: DIV
1724   //  case 0x1B: DIVU
1725   //  case 0x1C: DMULT
1726   //  case 0x1D: DMULTU
1727   //  case 0x1E: DDIV
1728   //  case 0x1F: DDIVU
1729   clear_const(current,rs1[i]);
1730   clear_const(current,rs2[i]);
1731   if(rs1[i]&&rs2[i])
1732   {
1733     if((opcode2[i]&4)==0) // 32-bit
1734     {
1735       current->u&=~(1LL<<HIREG);
1736       current->u&=~(1LL<<LOREG);
1737       alloc_reg(current,i,HIREG);
1738       alloc_reg(current,i,LOREG);
1739       alloc_reg(current,i,rs1[i]);
1740       alloc_reg(current,i,rs2[i]);
1741       current->is32|=1LL<<HIREG;
1742       current->is32|=1LL<<LOREG;
1743       dirty_reg(current,HIREG);
1744       dirty_reg(current,LOREG);
1745     }
1746     else // 64-bit
1747     {
1748       current->u&=~(1LL<<HIREG);
1749       current->u&=~(1LL<<LOREG);
1750       current->uu&=~(1LL<<HIREG);
1751       current->uu&=~(1LL<<LOREG);
1752       alloc_reg64(current,i,HIREG);
1753       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1754       alloc_reg64(current,i,rs1[i]);
1755       alloc_reg64(current,i,rs2[i]);
1756       alloc_all(current,i);
1757       current->is32&=~(1LL<<HIREG);
1758       current->is32&=~(1LL<<LOREG);
1759       dirty_reg(current,HIREG);
1760       dirty_reg(current,LOREG);
1761       minimum_free_regs[i]=HOST_REGS;
1762     }
1763   }
1764   else
1765   {
1766     // Multiply by zero is zero.
1767     // MIPS does not have a divide by zero exception.
1768     // The result is undefined, we return zero.
1769     alloc_reg(current,i,HIREG);
1770     alloc_reg(current,i,LOREG);
1771     current->is32|=1LL<<HIREG;
1772     current->is32|=1LL<<LOREG;
1773     dirty_reg(current,HIREG);
1774     dirty_reg(current,LOREG);
1775   }
1776 }
1777 #endif
1778
1779 void cop0_alloc(struct regstat *current,int i)
1780 {
1781   if(opcode2[i]==0) // MFC0
1782   {
1783     if(rt1[i]) {
1784       clear_const(current,rt1[i]);
1785       alloc_all(current,i);
1786       alloc_reg(current,i,rt1[i]);
1787       current->is32|=1LL<<rt1[i];
1788       dirty_reg(current,rt1[i]);
1789     }
1790   }
1791   else if(opcode2[i]==4) // MTC0
1792   {
1793     if(rs1[i]){
1794       clear_const(current,rs1[i]);
1795       alloc_reg(current,i,rs1[i]);
1796       alloc_all(current,i);
1797     }
1798     else {
1799       alloc_all(current,i); // FIXME: Keep r0
1800       current->u&=~1LL;
1801       alloc_reg(current,i,0);
1802     }
1803   }
1804   else
1805   {
1806     // TLBR/TLBWI/TLBWR/TLBP/ERET
1807     assert(opcode2[i]==0x10);
1808     alloc_all(current,i);
1809   }
1810   minimum_free_regs[i]=HOST_REGS;
1811 }
1812
1813 void cop1_alloc(struct regstat *current,int i)
1814 {
1815   alloc_reg(current,i,CSREG); // Load status
1816   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1817   {
1818     if(rt1[i]){
1819       clear_const(current,rt1[i]);
1820       if(opcode2[i]==1) {
1821         alloc_reg64(current,i,rt1[i]); // DMFC1
1822         current->is32&=~(1LL<<rt1[i]);
1823       }else{
1824         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1825         current->is32|=1LL<<rt1[i];
1826       }
1827       dirty_reg(current,rt1[i]);
1828     }
1829     alloc_reg_temp(current,i,-1);
1830   }
1831   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1832   {
1833     if(rs1[i]){
1834       clear_const(current,rs1[i]);
1835       if(opcode2[i]==5)
1836         alloc_reg64(current,i,rs1[i]); // DMTC1
1837       else
1838         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1839       alloc_reg_temp(current,i,-1);
1840     }
1841     else {
1842       current->u&=~1LL;
1843       alloc_reg(current,i,0);
1844       alloc_reg_temp(current,i,-1);
1845     }
1846   }
1847   minimum_free_regs[i]=1;
1848 }
1849 void fconv_alloc(struct regstat *current,int i)
1850 {
1851   alloc_reg(current,i,CSREG); // Load status
1852   alloc_reg_temp(current,i,-1);
1853   minimum_free_regs[i]=1;
1854 }
1855 void float_alloc(struct regstat *current,int i)
1856 {
1857   alloc_reg(current,i,CSREG); // Load status
1858   alloc_reg_temp(current,i,-1);
1859   minimum_free_regs[i]=1;
1860 }
// Register allocation for a coprocessor 2 operation: a single temporary.
void c2op_alloc(struct regstat *current,int i)
{
  alloc_reg_temp(current, i, -1);
}
1865 void fcomp_alloc(struct regstat *current,int i)
1866 {
1867   alloc_reg(current,i,CSREG); // Load status
1868   alloc_reg(current,i,FSREG); // Load flags
1869   dirty_reg(current,FSREG); // Flag will be modified
1870   alloc_reg_temp(current,i,-1);
1871   minimum_free_regs[i]=1;
1872 }
1873
1874 void syscall_alloc(struct regstat *current,int i)
1875 {
1876   alloc_cc(current,i);
1877   dirty_reg(current,CCREG);
1878   alloc_all(current,i);
1879   minimum_free_regs[i]=HOST_REGS;
1880   current->isconst=0;
1881 }
1882
1883 void delayslot_alloc(struct regstat *current,int i)
1884 {
1885   switch(itype[i]) {
1886     case UJUMP:
1887     case CJUMP:
1888     case SJUMP:
1889     case RJUMP:
1890     case FJUMP:
1891     case SYSCALL:
1892     case HLECALL:
1893     case SPAN:
1894       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1895       printf("Disabled speculative precompilation\n");
1896       stop_after_jal=1;
1897       break;
1898     case IMM16:
1899       imm16_alloc(current,i);
1900       break;
1901     case LOAD:
1902     case LOADLR:
1903       load_alloc(current,i);
1904       break;
1905     case STORE:
1906     case STORELR:
1907       store_alloc(current,i);
1908       break;
1909     case ALU:
1910       alu_alloc(current,i);
1911       break;
1912     case SHIFT:
1913       shift_alloc(current,i);
1914       break;
1915     case MULTDIV:
1916       multdiv_alloc(current,i);
1917       break;
1918     case SHIFTIMM:
1919       shiftimm_alloc(current,i);
1920       break;
1921     case MOV:
1922       mov_alloc(current,i);
1923       break;
1924     case COP0:
1925       cop0_alloc(current,i);
1926       break;
1927     case COP1:
1928     case COP2:
1929       cop1_alloc(current,i);
1930       break;
1931     case C1LS:
1932       c1ls_alloc(current,i);
1933       break;
1934     case C2LS:
1935       c2ls_alloc(current,i);
1936       break;
1937     case FCONV:
1938       fconv_alloc(current,i);
1939       break;
1940     case FLOAT:
1941       float_alloc(current,i);
1942       break;
1943     case FCOMP:
1944       fcomp_alloc(current,i);
1945       break;
1946     case C2OP:
1947       c2op_alloc(current,i);
1948       break;
1949   }
1950 }
1951
1952 // Special case where a branch and delay slot span two pages in virtual memory
1953 static void pagespan_alloc(struct regstat *current,int i)
1954 {
1955   current->isconst=0;
1956   current->wasconst=0;
1957   regs[i].wasconst=0;
1958   minimum_free_regs[i]=HOST_REGS;
1959   alloc_all(current,i);
1960   alloc_cc(current,i);
1961   dirty_reg(current,CCREG);
1962   if(opcode[i]==3) // JAL
1963   {
1964     alloc_reg(current,i,31);
1965     dirty_reg(current,31);
1966   }
1967   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1968   {
1969     alloc_reg(current,i,rs1[i]);
1970     if (rt1[i]!=0) {
1971       alloc_reg(current,i,rt1[i]);
1972       dirty_reg(current,rt1[i]);
1973     }
1974   }
1975   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1976   {
1977     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1978     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1979     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1980     {
1981       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1982       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1983     }
1984   }
1985   else
1986   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1987   {
1988     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1989     if(!((current->is32>>rs1[i])&1))
1990     {
1991       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1992     }
1993   }
1994   else
1995   if(opcode[i]==0x11) // BC1
1996   {
1997     alloc_reg(current,i,FSREG);
1998     alloc_reg(current,i,CSREG);
1999   }
2000   //else ...
2001 }
2002
2003 add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
2004 {
2005   stubs[stubcount][0]=type;
2006   stubs[stubcount][1]=addr;
2007   stubs[stubcount][2]=retaddr;
2008   stubs[stubcount][3]=a;
2009   stubs[stubcount][4]=b;
2010   stubs[stubcount][5]=c;
2011   stubs[stubcount][6]=d;
2012   stubs[stubcount][7]=e;
2013   stubcount++;
2014 }
2015
2016 // Write out a single register
2017 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
2018 {
2019   int hr;
2020   for(hr=0;hr<HOST_REGS;hr++) {
2021     if(hr!=EXCLUDE_REG) {
2022       if((regmap[hr]&63)==r) {
2023         if((dirty>>hr)&1) {
2024           if(regmap[hr]<64) {
2025             emit_storereg(r,hr);
2026 #ifndef FORCE32
2027             if((is32>>regmap[hr])&1) {
2028               emit_sarimm(hr,31,hr);
2029               emit_storereg(r|64,hr);
2030             }
2031 #endif
2032           }else{
2033             emit_storereg(r|64,hr);
2034           }
2035         }
2036       }
2037     }
2038   }
2039 }
2040
2041 int mchecksum()
2042 {
2043   //if(!tracedebug) return 0;
2044   int i;
2045   int sum=0;
2046   for(i=0;i<2097152;i++) {
2047     unsigned int temp=sum;
2048     sum<<=1;
2049     sum|=(~temp)>>31;
2050     sum^=((u_int *)rdram)[i];
2051   }
2052   return sum;
2053 }
2054 int rchecksum()
2055 {
2056   int i;
2057   int sum=0;
2058   for(i=0;i<64;i++)
2059     sum^=((u_int *)reg)[i];
2060   return sum;
2061 }
2062 void rlist()
2063 {
2064   int i;
2065   printf("TRACE: ");
2066   for(i=0;i<32;i++)
2067     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
2068   printf("\n");
2069 #ifndef DISABLE_COP1
2070   printf("TRACE: ");
2071   for(i=0;i<32;i++)
2072     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
2073   printf("\n");
2074 #endif
2075 }
2076
2077 void enabletrace()
2078 {
2079   tracedebug=1;
2080 }
2081
// Debug trace hook.
//
// When Count, interpreted as a signed int, lies in [-2084597794, 0), prints a
// "TRACE:" line with Count, next_interupt and the RAM checksum from
// mchecksum(), then dumps the registers via rlist(). On i386 and ARM builds
// it additionally prints raw words read relative to the address of the
// parameter `i` (i386) or of the local `j` (ARM).
//
// NOTE(review): the (&i)[-1] / (&j)[n] reads depend entirely on the compiler's
// stack-frame layout. (&i)[-1] is presumably the return address under the
// i386 stack calling convention - confirm before relying on the output.
// Do not add or reorder locals here without re-checking those offsets.
//
// The commented-out lines are alternative trace conditions and formats.
2082 void memdebug(int i)
2083 {
2084   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
2085   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
2086   //rlist();
2087   //if(tracedebug) {
2088   //if(Count>=-2084597794) {
     // Only trace inside a fixed window of (signed) cycle-counter values
2089   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
2090   //if(0) {
2091     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
2092     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2093     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2094     rlist();
2095     #ifdef __i386__
       // Word stored immediately below the parameter in memory
2096     printf("TRACE: %x\n",(&i)[-1]);
2097     #endif
2098     #ifdef __arm__
       // j is never assigned; only its address is used as a base for
       // reading nearby stack words
2099     int j;
2100     printf("TRACE: %x \n",(&j)[10]);
2101     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2102     #endif
2103     //fflush(stdout);
2104   }
2105   //printf("TRACE: %x\n",(&i)[-1]);
2106 }
2107
2108 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2109 {
2110   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2111 }
2112
// Emit host code for register-register ALU instruction i, selected by
// opcode2[i]:
//   0x20-0x23  ADD/ADDU/SUB/SUBU      (32-bit)
//   0x2c-0x2f  DADD/DADDU/DSUB/DSUBU  (64-bit, low/high host register pair)
//   0x2a,0x2b  SLT/SLTU
//   0x24-0x27  AND/OR/XOR/NOR
//
// i_regs supplies the guest->host register map (regmap) and the was32 flags
// that choose between 32-bit and 64-bit sequences. The upper half of guest
// register N is looked up as N|64.
//
// Nothing is emitted when rt1[i] is 0 or when the destination has no host
// register (get_reg() returned a negative value). A source operand equal to
// guest register 0 is treated as the constant zero, so each group reduces to
// a move, negate, NOT, zero or all-ones load as appropriate. When a needed
// source is not in a host register it is loaded with emit_loadreg() directly
// into the destination.
2113 void alu_assemble(int i,struct regstat *i_regs)
2114 {
2115   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2116     if(rt1[i]) {
2117       signed char s1,s2,t;
2118       t=get_reg(i_regs->regmap,rt1[i]);
2119       if(t>=0) {
2120         s1=get_reg(i_regs->regmap,rs1[i]);
2121         s2=get_reg(i_regs->regmap,rs2[i]);
2122         if(rs1[i]&&rs2[i]) {
2123           assert(s1>=0);
2124           assert(s2>=0);
               // Bit 1 of opcode2 separates SUB/SUBU (0x22/0x23) from ADD/ADDU
2125           if(opcode2[i]&2) emit_sub(s1,s2,t);
2126           else emit_add(s1,s2,t);
2127         }
2128         else if(rs1[i]) {
               // rs2 is r0: result is just rs1
2129           if(s1>=0) emit_mov(s1,t);
2130           else emit_loadreg(rs1[i],t);
2131         }
2132         else if(rs2[i]) {
               // rs1 is r0: result is rs2, negated for subtract
2133           if(s2>=0) {
2134             if(opcode2[i]&2) emit_neg(s2,t);
2135             else emit_mov(s2,t);
2136           }
2137           else {
2138             emit_loadreg(rs2[i],t);
2139             if(opcode2[i]&2) emit_neg(t,t);
2140           }
2141         }
2142         else emit_zeroreg(t);
2143       }
2144     }
2145   }
2146   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2147     if(rt1[i]) {
2148       signed char s1l,s2l,s1h,s2h,tl,th;
2149       tl=get_reg(i_regs->regmap,rt1[i]);
2150       th=get_reg(i_regs->regmap,rt1[i]|64);
2151       if(tl>=0) {
2152         s1l=get_reg(i_regs->regmap,rs1[i]);
2153         s2l=get_reg(i_regs->regmap,rs2[i]);
2154         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2155         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2156         if(rs1[i]&&rs2[i]) {
2157           assert(s1l>=0);
2158           assert(s2l>=0);
               // Low word first, using the flag-setting variants
2159           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2160           else emit_adds(s1l,s2l,tl);
               // High word is only computed when it has a host register.
               // The else below pairs with whichever `if` the preprocessor
               // keeps.
               // NOTE(review): the subtract path consumes the borrow
               // (emit_sbb/emit_sbc), but the add path calls plain emit_add
               // for the high word, which does not visibly consume the carry
               // produced by emit_adds - confirm against the emitter.
2161           if(th>=0) {
2162             #ifdef INVERTED_CARRY
2163             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2164             #else
2165             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2166             #endif
2167             else emit_add(s1h,s2h,th);
2168           }
2169         }
2170         else if(rs1[i]) {
               // rs2 is r0: copy rs1 (both halves)
2171           if(s1l>=0) emit_mov(s1l,tl);
2172           else emit_loadreg(rs1[i],tl);
2173           if(th>=0) {
2174             if(s1h>=0) emit_mov(s1h,th);
2175             else emit_loadreg(rs1[i]|64,th);
2176           }
2177         }
2178         else if(rs2[i]) {
               // rs1 is r0: copy rs2, or form 0-rs2 across both halves
2179           if(s2l>=0) {
2180             if(opcode2[i]&2) emit_negs(s2l,tl);
2181             else emit_mov(s2l,tl);
2182           }
2183           else {
2184             emit_loadreg(rs2[i],tl);
2185             if(opcode2[i]&2) emit_negs(tl,tl);
2186           }
2187           if(th>=0) {
2188             #ifdef INVERTED_CARRY
2189             if(s2h>=0) emit_mov(s2h,th);
2190             else emit_loadreg(rs2[i]|64,th);
2191             if(opcode2[i]&2) {
2192               emit_adcimm(-1,th); // x86 has inverted carry flag
2193               emit_not(th,th);
2194             }
2195             #else
2196             if(opcode2[i]&2) {
2197               if(s2h>=0) emit_rscimm(s2h,0,th);
2198               else {
2199                 emit_loadreg(rs2[i]|64,th);
2200                 emit_rscimm(th,0,th);
2201               }
2202             }else{
2203               if(s2h>=0) emit_mov(s2h,th);
2204               else emit_loadreg(rs2[i]|64,th);
2205             }
2206             #endif
2207           }
2208         }
2209         else {
2210           emit_zeroreg(tl);
2211           if(th>=0) emit_zeroreg(th);
2212         }
2213       }
2214     }
2215   }
2216   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2217     if(rt1[i]) {
2218       signed char s1l,s1h,s2l,s2h,t;
           // 64-bit compare unless both sources are flagged 32-bit in was32
2219       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2220       {
2221         t=get_reg(i_regs->regmap,rt1[i]);
2222         //assert(t>=0);
2223         if(t>=0) {
2224           s1l=get_reg(i_regs->regmap,rs1[i]);
2225           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2226           s2l=get_reg(i_regs->regmap,rs2[i]);
2227           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2228           if(rs2[i]==0) // rx<r0
2229           {
2230             assert(s1h>=0);
                 // Signed "less than zero" is the sign bit of the high word
2231             if(opcode2[i]==0x2a) // SLT
2232               emit_shrimm(s1h,31,t);
2233             else // SLTU (unsigned can not be less than zero)
2234               emit_zeroreg(t);
2235           }
2236           else if(rs1[i]==0) // r0<rx
2237           {
2238             assert(s2h>=0);
2239             if(opcode2[i]==0x2a) // SLT
2240               emit_set_gz64_32(s2h,s2l,t);
2241             else // SLTU (set if not zero)
2242               emit_set_nz64_32(s2h,s2l,t);
2243           }
2244           else {
2245             assert(s1l>=0);assert(s1h>=0);
2246             assert(s2l>=0);assert(s2h>=0);
2247             if(opcode2[i]==0x2a) // SLT
2248               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2249             else // SLTU
2250               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2251           }
2252         }
2253       } else {
             // Both sources known 32-bit: compare low words only
2254         t=get_reg(i_regs->regmap,rt1[i]);
2255         //assert(t>=0);
2256         if(t>=0) {
2257           s1l=get_reg(i_regs->regmap,rs1[i]);
2258           s2l=get_reg(i_regs->regmap,rs2[i]);
2259           if(rs2[i]==0) // rx<r0
2260           {
2261             assert(s1l>=0);
2262             if(opcode2[i]==0x2a) // SLT
2263               emit_shrimm(s1l,31,t);
2264             else // SLTU (unsigned can not be less than zero)
2265               emit_zeroreg(t);
2266           }
2267           else if(rs1[i]==0) // r0<rx
2268           {
2269             assert(s2l>=0);
2270             if(opcode2[i]==0x2a) // SLT
2271               emit_set_gz32(s2l,t);
2272             else // SLTU (set if not zero)
2273               emit_set_nz32(s2l,t);
2274           }
2275           else{
2276             assert(s1l>=0);assert(s2l>=0);
2277             if(opcode2[i]==0x2a) // SLT
2278               emit_set_if_less32(s1l,s2l,t);
2279             else // SLTU
2280               emit_set_if_carry32(s1l,s2l,t);
2281           }
2282         }
2283       }
2284     }
2285   }
2286   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2287     if(rt1[i]) {
2288       signed char s1l,s1h,s2l,s2h,th,tl;
2289       tl=get_reg(i_regs->regmap,rt1[i]);
2290       th=get_reg(i_regs->regmap,rt1[i]|64);
           // 64-bit path only when a source is not flagged 32-bit AND the
           // destination's upper half has a host register
2291       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2292       {
2293         assert(tl>=0);
2294         if(tl>=0) {
2295           s1l=get_reg(i_regs->regmap,rs1[i]);
2296           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2297           s2l=get_reg(i_regs->regmap,rs2[i]);
2298           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2299           if(rs1[i]&&rs2[i]) {
2300             assert(s1l>=0);assert(s1h>=0);
2301             assert(s2l>=0);assert(s2h>=0);
2302             if(opcode2[i]==0x24) { // AND
2303               emit_and(s1l,s2l,tl);
2304               emit_and(s1h,s2h,th);
2305             } else
2306             if(opcode2[i]==0x25) { // OR
2307               emit_or(s1l,s2l,tl);
2308               emit_or(s1h,s2h,th);
2309             } else
2310             if(opcode2[i]==0x26) { // XOR
2311               emit_xor(s1l,s2l,tl);
2312               emit_xor(s1h,s2h,th);
2313             } else
2314             if(opcode2[i]==0x27) { // NOR
2315               emit_or(s1l,s2l,tl);
2316               emit_or(s1h,s2h,th);
2317               emit_not(tl,tl);
2318               emit_not(th,th);
2319             }
2320           }
2321           else
2322           {
                 // At least one source is r0
2323             if(opcode2[i]==0x24) { // AND
2324               emit_zeroreg(tl);
2325               emit_zeroreg(th);
2326             } else
2327             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2328               if(rs1[i]){
2329                 if(s1l>=0) emit_mov(s1l,tl);
2330                 else emit_loadreg(rs1[i],tl);
2331                 if(s1h>=0) emit_mov(s1h,th);
2332                 else emit_loadreg(rs1[i]|64,th);
2333               }
2334               else
2335               if(rs2[i]){
2336                 if(s2l>=0) emit_mov(s2l,tl);
2337                 else emit_loadreg(rs2[i],tl);
2338                 if(s2h>=0) emit_mov(s2h,th);
2339                 else emit_loadreg(rs2[i]|64,th);
2340               }
2341               else{
2342                 emit_zeroreg(tl);
2343                 emit_zeroreg(th);
2344               }
2345             } else
2346             if(opcode2[i]==0x27) { // NOR
2347               if(rs1[i]){
2348                 if(s1l>=0) emit_not(s1l,tl);
2349                 else{
2350                   emit_loadreg(rs1[i],tl);
2351                   emit_not(tl,tl);
2352                 }
2353                 if(s1h>=0) emit_not(s1h,th);
2354                 else{
2355                   emit_loadreg(rs1[i]|64,th);
2356                   emit_not(th,th);
2357                 }
2358               }
2359               else
2360               if(rs2[i]){
2361                 if(s2l>=0) emit_not(s2l,tl);
2362                 else{
2363                   emit_loadreg(rs2[i],tl);
2364                   emit_not(tl,tl);
2365                 }
2366                 if(s2h>=0) emit_not(s2h,th);
2367                 else{
2368                   emit_loadreg(rs2[i]|64,th);
2369                   emit_not(th,th);
2370                 }
2371               }
2372               else {
                     // NOR of r0 with r0 is all ones
2373                 emit_movimm(-1,tl);
2374                 emit_movimm(-1,th);
2375               }
2376             }
2377           }
2378         }
2379       }
2380       else
2381       {
2382         // 32 bit
2383         if(tl>=0) {
2384           s1l=get_reg(i_regs->regmap,rs1[i]);
2385           s2l=get_reg(i_regs->regmap,rs2[i]);
2386           if(rs1[i]&&rs2[i]) {
2387             assert(s1l>=0);
2388             assert(s2l>=0);
2389             if(opcode2[i]==0x24) { // AND
2390               emit_and(s1l,s2l,tl);
2391             } else
2392             if(opcode2[i]==0x25) { // OR
2393               emit_or(s1l,s2l,tl);
2394             } else
2395             if(opcode2[i]==0x26) { // XOR
2396               emit_xor(s1l,s2l,tl);
2397             } else
2398             if(opcode2[i]==0x27) { // NOR
2399               emit_or(s1l,s2l,tl);
2400               emit_not(tl,tl);
2401             }
2402           }
2403           else
2404           {
                 // At least one source is r0
2405             if(opcode2[i]==0x24) { // AND
2406               emit_zeroreg(tl);
2407             } else
2408             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2409               if(rs1[i]){
2410                 if(s1l>=0) emit_mov(s1l,tl);
2411                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2412               }
2413               else
2414               if(rs2[i]){
2415                 if(s2l>=0) emit_mov(s2l,tl);
2416                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2417               }
2418               else emit_zeroreg(tl);
2419             } else
2420             if(opcode2[i]==0x27) { // NOR
2421               if(rs1[i]){
2422                 if(s1l>=0) emit_not(s1l,tl);
2423                 else {
2424                   emit_loadreg(rs1[i],tl);
2425                   emit_not(tl,tl);
2426                 }
2427               }
2428               else
2429               if(rs2[i]){
2430                 if(s2l>=0) emit_not(s2l,tl);
2431                 else {
2432                   emit_loadreg(rs2[i],tl);
2433                   emit_not(tl,tl);
2434                 }
2435               }
2436               else emit_movimm(-1,tl);
2437             }
2438           }
2439         }
2440       }
2441     }
2442   }
2443 }
2444
// Emit host code for immediate-operand instruction i, selected by opcode[i]:
//   0x0f        LUI
//   0x08,0x09   ADDI/ADDIU
//   0x18,0x19   DADDI/DADDIU
//   0x0a,0x0b   SLTI/SLTIU
//   0x0c-0x0e   ANDI/ORI/XORI
//
// Nothing is emitted when rt1[i] is 0 or the destination has no host
// register. Recurring patterns:
//   - If the destination's bit in i_regs->isconst is set, the LUI, ADDI and
//     ANDI/ORI/XORI paths emit nothing (presumably the constant is
//     materialised elsewhere - confirm against the constant-propagation code).
//   - If the source host register's bit in i_regs->wasconst is set, the
//     operation is folded with constmap[i][s] into a single emit_movimm().
//   - If the source has no host register (s<0), it is loaded into the
//     destination with emit_loadreg() unless regmap_entry shows the
//     destination already held that guest register on entry.
2445 void imm16_assemble(int i,struct regstat *i_regs)
2446 {
2447   if (opcode[i]==0x0f) { // LUI
2448     if(rt1[i]) {
2449       signed char t;
2450       t=get_reg(i_regs->regmap,rt1[i]);
2451       //assert(t>=0);
2452       if(t>=0) {
2453         if(!((i_regs->isconst>>t)&1))
2454           emit_movimm(imm[i]<<16,t);
2455       }
2456     }
2457   }
2458   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2459     if(rt1[i]) {
2460       signed char s,t;
2461       t=get_reg(i_regs->regmap,rt1[i]);
2462       s=get_reg(i_regs->regmap,rs1[i]);
2463       if(rs1[i]) {
2464         //assert(t>=0);
2465         //assert(s>=0);
2466         if(t>=0) {
2467           if(!((i_regs->isconst>>t)&1)) {
2468             if(s<0) {
                   // Source not in a host register: work in the destination
2469               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2470               emit_addimm(t,imm[i],t);
2471             }else{
2472               if(!((i_regs->wasconst>>s)&1))
2473                 emit_addimm(s,imm[i],t);
2474               else
2475                 emit_movimm(constmap[i][s]+imm[i],t);
2476             }
2477           }
2478         }
2479       } else {
             // Source is r0: the result is the immediate itself
2480         if(t>=0) {
2481           if(!((i_regs->isconst>>t)&1))
2482             emit_movimm(imm[i],t);
2483         }
2484       }
2485     }
2486   }
2487   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2488     if(rt1[i]) {
2489       signed char sh,sl,th,tl;
2490       th=get_reg(i_regs->regmap,rt1[i]|64);
2491       tl=get_reg(i_regs->regmap,rt1[i]);
2492       sh=get_reg(i_regs->regmap,rs1[i]|64);
2493       sl=get_reg(i_regs->regmap,rs1[i]);
2494       if(tl>=0) {
2495         if(rs1[i]) {
2496           assert(sh>=0);
2497           assert(sl>=0);
2498           if(th>=0) {
2499             emit_addimm64_32(sh,sl,imm[i],th,tl);
2500           }
2501           else {
                 // Upper half not needed: 32-bit add on the low word only
2502             emit_addimm(sl,imm[i],tl);
2503           }
2504         } else {
2505           emit_movimm(imm[i],tl);
               // Upper half is the sign extension of the immediate
2506           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2507         }
2508       }
2509     }
2510   }
2511   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2512     if(rt1[i]) {
2513       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2514       signed char sh,sl,t;
2515       t=get_reg(i_regs->regmap,rt1[i]);
2516       sh=get_reg(i_regs->regmap,rs1[i]|64);
2517       sl=get_reg(i_regs->regmap,rs1[i]);
2518       //assert(t>=0);
2519       if(t>=0) {
2520         if(rs1[i]>0) {
               // A missing upper half is only acceptable for a 32-bit source
2521           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2522           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2523             if(opcode[i]==0x0a) { // SLTI
2524               if(sl<0) {
2525                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2526                 emit_slti32(t,imm[i],t);
2527               }else{
2528                 emit_slti32(sl,imm[i],t);
2529               }
2530             }
2531             else { // SLTIU
2532               if(sl<0) {
2533                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2534                 emit_sltiu32(t,imm[i],t);
2535               }else{
2536                 emit_sltiu32(sl,imm[i],t);
2537               }
2538             }
2539           }else{ // 64-bit
2540             assert(sl>=0);
2541             if(opcode[i]==0x0a) // SLTI
2542               emit_slti64_32(sh,sl,imm[i],t);
2543             else // SLTIU
2544               emit_sltiu64_32(sh,sl,imm[i],t);
2545           }
2546         }else{
2547           // SLTI(U) with r0 is just stupid,
2548           // nonetheless examples can be found
               // The result is a compile-time constant: 0<imm for SLTI,
               // imm!=0 for SLTIU. The inner if/else below binds to
               // `if(0<imm[i])`; the outer else belongs to the opcode test.
2549           if(opcode[i]==0x0a) // SLTI
2550             if(0<imm[i]) emit_movimm(1,t);
2551             else emit_zeroreg(t);
2552           else // SLTIU
2553           {
2554             if(imm[i]) emit_movimm(1,t);
2555             else emit_zeroreg(t);
2556           }
2557         }
2558       }
2559     }
2560   }
2561   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2562     if(rt1[i]) {
2563       signed char sh,sl,th,tl;
2564       th=get_reg(i_regs->regmap,rt1[i]|64);
2565       tl=get_reg(i_regs->regmap,rt1[i]);
2566       sh=get_reg(i_regs->regmap,rs1[i]|64);
2567       sl=get_reg(i_regs->regmap,rs1[i]);
2568       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2569         if(opcode[i]==0x0c) //ANDI
2570         {
2571           if(rs1[i]) {
2572             if(sl<0) {
2573               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2574               emit_andimm(tl,imm[i],tl);
2575             }else{
2576               if(!((i_regs->wasconst>>sl)&1))
2577                 emit_andimm(sl,imm[i],tl);
2578               else
2579                 emit_movimm(constmap[i][sl]&imm[i],tl);
2580             }
2581           }
2582           else
2583             emit_zeroreg(tl);
               // ANDI always zeroes the destination's upper half here
2584           if(th>=0) emit_zeroreg(th);
2585         }
2586         else
2587         {
               // ORI/XORI: the upper half is copied from the source unchanged
2588           if(rs1[i]) {
2589             if(sl<0) {
2590               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2591             }
2592             if(th>=0) {
2593               if(sh<0) {
2594                 emit_loadreg(rs1[i]|64,th);
2595               }else{
2596                 emit_mov(sh,th);
2597               }
2598             }
                 // Each opcode test below guards the whole if/else that
                 // follows it
2599             if(opcode[i]==0x0d) //ORI
2600             if(sl<0) {
2601               emit_orimm(tl,imm[i],tl);
2602             }else{
2603               if(!((i_regs->wasconst>>sl)&1))
2604                 emit_orimm(sl,imm[i],tl);
2605               else
2606                 emit_movimm(constmap[i][sl]|imm[i],tl);
2607             }
2608             if(opcode[i]==0x0e) //XORI
2609             if(sl<0) {
2610               emit_xorimm(tl,imm[i],tl);
2611             }else{
2612               if(!((i_regs->wasconst>>sl)&1))
2613                 emit_xorimm(sl,imm[i],tl);
2614               else
2615                 emit_movimm(constmap[i][sl]^imm[i],tl);
2616             }
2617           }
2618           else {
                 // Source is r0: OR/XOR with zero yields the immediate
2619             emit_movimm(imm[i],tl);
2620             if(th>=0) emit_zeroreg(th);
2621           }
2622         }
2623       }
2624     }
2625   }
2626 }
2627
// Emit host code for shift-by-immediate instruction i, selected by
// opcode2[i]:
//   <=0x03      SLL/SRL/SRA            (32-bit; only 0, 2 and 3 emit a shift)
//   0x38-0x3b   DSLL/DSRL/DSRA         (64-bit, low/high host register pair)
//   0x3c        DSLL32
//   0x3e        DSRL32
//   0x3f        DSRA32
// The shift amount is taken from imm[i]. Nothing is emitted when rt1[i] is 0
// or the destination's low half has no host register (for DSLL32: when
// neither half has one).
//
// NOTE(review): the *32 variants test imm[i]>32 and then shift by imm[i]&31,
// which suggests imm[i] already includes the +32 bias for those opcodes -
// confirm against the decoder.
2628 void shiftimm_assemble(int i,struct regstat *i_regs)
2629 {
2630   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2631   {
2632     if(rt1[i]) {
2633       signed char s,t;
2634       t=get_reg(i_regs->regmap,rt1[i]);
2635       s=get_reg(i_regs->regmap,rs1[i]);
2636       //assert(t>=0);
2637       if(t>=0){
2638         if(rs1[i]==0)
2639         {
2640           emit_zeroreg(t);
2641         }
2642         else
2643         {
               // Source not in a host register: load it into the destination
               // unless the destination already held it on entry
2644           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2645           if(imm[i]) {
2646             if(opcode2[i]==0) // SLL
2647             {
2648               emit_shlimm(s<0?t:s,imm[i],t);
2649             }
2650             if(opcode2[i]==2) // SRL
2651             {
2652               emit_shrimm(s<0?t:s,imm[i],t);
2653             }
2654             if(opcode2[i]==3) // SRA
2655             {
2656               emit_sarimm(s<0?t:s,imm[i],t);
2657             }
2658           }else{
2659             // Shift by zero
2660             if(s>=0 && s!=t) emit_mov(s,t);
2661           }
2662         }
2663       }
2664       //emit_storereg(rt1[i],t); //DEBUG
2665     }
2666   }
2667   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2668   {
2669     if(rt1[i]) {
2670       signed char sh,sl,th,tl;
2671       th=get_reg(i_regs->regmap,rt1[i]|64);
2672       tl=get_reg(i_regs->regmap,rt1[i]);
2673       sh=get_reg(i_regs->regmap,rs1[i]|64);
2674       sl=get_reg(i_regs->regmap,rs1[i]);
2675       if(tl>=0) {
2676         if(rs1[i]==0)
2677         {
2678           emit_zeroreg(tl);
2679           if(th>=0) emit_zeroreg(th);
2680         }
2681         else
2682         {
2683           assert(sl>=0);
2684           assert(sh>=0);
2685           if(imm[i]) {
                 // Each pair uses a double-register shift for the half that
                 // receives bits from the other, and a plain shift for the
                 // half that does not
2686             if(opcode2[i]==0x38) // DSLL
2687             {
2688               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2689               emit_shlimm(sl,imm[i],tl);
2690             }
2691             if(opcode2[i]==0x3a) // DSRL
2692             {
2693               emit_shrdimm(sl,sh,imm[i],tl);
2694               if(th>=0) emit_shrimm(sh,imm[i],th);
2695             }
2696             if(opcode2[i]==0x3b) // DSRA
2697             {
2698               emit_shrdimm(sl,sh,imm[i],tl);
2699               if(th>=0) emit_sarimm(sh,imm[i],th);
2700             }
2701           }else{
2702             // Shift by zero
2703             if(sl!=tl) emit_mov(sl,tl);
2704             if(th>=0&&sh!=th) emit_mov(sh,th);
2705           }
2706         }
2707       }
2708     }
2709   }
2710   if(opcode2[i]==0x3c) // DSLL32
2711   {
2712     if(rt1[i]) {
2713       signed char sl,tl,th;
2714       tl=get_reg(i_regs->regmap,rt1[i]);
2715       th=get_reg(i_regs->regmap,rt1[i]|64);
2716       sl=get_reg(i_regs->regmap,rs1[i]);
2717       if(th>=0||tl>=0){
2718         assert(tl>=0);
2719         assert(th>=0);
2720         assert(sl>=0);
             // Source low word becomes the result's high word; low word is 0
2721         emit_mov(sl,th);
2722         emit_zeroreg(tl);
2723         if(imm[i]>32)
2724         {
2725           emit_shlimm(th,imm[i]&31,th);
2726         }
2727       }
2728     }
2729   }
2730   if(opcode2[i]==0x3e) // DSRL32
2731   {
2732     if(rt1[i]) {
2733       signed char sh,tl,th;
2734       tl=get_reg(i_regs->regmap,rt1[i]);
2735       th=get_reg(i_regs->regmap,rt1[i]|64);
2736       sh=get_reg(i_regs->regmap,rs1[i]|64);
2737       if(tl>=0){
2738         assert(sh>=0);
             // Source high word becomes the result's low word; high word is 0
2739         emit_mov(sh,tl);
2740         if(th>=0) emit_zeroreg(th);
2741         if(imm[i]>32)
2742         {
2743           emit_shrimm(tl,imm[i]&31,tl);
2744         }
2745       }
2746     }
2747   }
2748   if(opcode2[i]==0x3f) // DSRA32
2749   {
2750     if(rt1[i]) {
2751       signed char sh,tl;
2752       tl=get_reg(i_regs->regmap,rt1[i]);
2753       sh=get_reg(i_regs->regmap,rs1[i]|64);
2754       if(tl>=0){
2755         assert(sh>=0);
             // Only the low word of the result is produced on this path
2756         emit_mov(sh,tl);
2757         if(imm[i]>32)
2758         {
2759           emit_sarimm(tl,imm[i]&31,tl);
2760         }
2761       }
2762     }
2763   }
2764 }
2765
2766 #ifndef shift_assemble
/*
 * Fallback used when the surrounding #ifndef finds no `shift_assemble` macro
 * defined. It emits no code: it reports the missing implementation on stdout
 * and terminates the process with exit status 1. Both arguments are ignored.
 */
void shift_assemble(int i, struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;

  printf("Need shift_assemble for this architecture.\n");
  exit(1);
}
2772 #endif
2773
2774 void load_assemble(int i,struct regstat *i_regs)
2775 {
2776   int s,th,tl,addr,map=-1;
2777   int offset;
2778   int jaddr=0;
2779   int memtarget=0,c=0;
2780   u_int hr,reglist=0;
2781   th=get_reg(i_regs->regmap,rt1[i]|64);
2782   tl=get_reg(i_regs->regmap,rt1[i]);
2783   s=get_reg(i_regs->regmap,rs1[i]);
2784   offset=imm[i];
2785   for(hr=0;hr<HOST_REGS;hr++) {
2786     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2787   }
2788   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2789   if(s>=0) {
2790     c=(i_regs->wasconst>>s)&1;
2791     if (c) {
2792       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2793       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2794     }
2795   }
2796   //printf("load_assemble: c=%d\n",c);
2797   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2798   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2799 #ifdef PCSX
2800   if(tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80)
2801     ||rt1[i]==0) {
2802       // could be FIFO, must perform the read
2803       // ||dummy read
2804       assem_debug("(forced read)\n");
2805       tl=get_reg(i_regs->regmap,-1);
2806       assert(tl>=0);
2807   }
2808 #endif
2809   if(offset||s<0||c) addr=tl;
2810   else addr=s;
2811   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2812  if(tl>=0) {
2813   //printf("load_assemble: c=%d\n",c);
2814   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2815   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2816   reglist&=~(1<<tl);
2817   if(th>=0) reglist&=~(1<<th);
2818   if(!using_tlb) {
2819     if(!c) {
2820       #ifdef RAM_OFFSET
2821       map=get_reg(i_regs->regmap,ROREG);
2822       if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2823       #endif
2824 //#define R29_HACK 1
2825       #ifdef R29_HACK
2826       // Strmnnrmn's speed hack
2827       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2828       #endif
2829       {
2830         #ifdef PCSX
2831         if(sp_in_mirror&&rs1[i]==29) {
2832           emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
2833           emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
2834         }
2835         else
2836         #endif
2837         emit_cmpimm(addr,RAM_SIZE);
2838         jaddr=(int)out;
2839         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2840         // Hint to branch predictor that the branch is unlikely to be taken
2841         if(rs1[i]>=28)
2842           emit_jno_unlikely(0);
2843         else
2844         #endif
2845         emit_jno(0);
2846       }
2847     }
2848   }else{ // using tlb
2849     int x=0;
2850     if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2851     if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2852     map=get_reg(i_regs->regmap,TLREG);
2853     assert(map>=0);
2854     map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2855     do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2856   }
2857   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2858   if (opcode[i]==0x20) { // LB
2859     if(!c||memtarget) {
2860       if(!dummy) {
2861         #ifdef HOST_IMM_ADDR32
2862         if(c)
2863           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2864         else
2865         #endif
2866         {
2867           //emit_xorimm(addr,3,tl);
2868           //gen_tlb_addr_r(tl,map);
2869           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2870           int x=0,a=tl;
2871 #ifdef BIG_ENDIAN_MIPS
2872           if(!c) emit_xorimm(addr,3,tl);
2873           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2874 #else
2875           if(!c) a=addr;
2876 #endif
2877 #ifdef PCSX
2878           if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2879 #endif
2880           emit_movsbl_indexed_tlb(x,a,map,tl);
2881         }
2882       }
2883       if(jaddr)
2884         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2885     }
2886     else
2887       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2888   }
2889   if (opcode[i]==0x21) { // LH
2890     if(!c||memtarget) {
2891       if(!dummy) {
2892         #ifdef HOST_IMM_ADDR32
2893         if(c)
2894           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2895         else
2896         #endif
2897         {
2898           int x=0,a=tl;
2899 #ifdef BIG_ENDIAN_MIPS
2900           if(!c) emit_xorimm(addr,2,tl);
2901           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2902 #else
2903           if(!c) a=addr;
2904 #endif
2905 #ifdef PCSX
2906           if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2907 #endif
2908           //#ifdef
2909           //emit_movswl_indexed_tlb(x,tl,map,tl);
2910           //else
2911           if(map>=0) {
2912             gen_tlb_addr_r(a,map);
2913             emit_movswl_indexed(x,a,tl);
2914           }else{
2915             #ifdef RAM_OFFSET
2916             emit_movswl_indexed(x,a,tl);
2917             #else
2918             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2919             #endif
2920           }
2921         }
2922       }
2923       if(jaddr)
2924         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2925     }
2926     else
2927       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2928   }
2929   if (opcode[i]==0x23) { // LW
2930     if(!c||memtarget) {
2931       if(!dummy) {
2932         int a=addr;
2933 #ifdef PCSX
2934         if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2935 #endif
2936         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2937         #ifdef HOST_IMM_ADDR32
2938         if(c)
2939           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2940         else
2941         #endif
2942         emit_readword_indexed_tlb(0,a,map,tl);
2943       }
2944       if(jaddr)
2945         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2946     }
2947     else
2948       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2949   }
2950   if (opcode[i]==0x24) { // LBU
2951     if(!c||memtarget) {
2952       if(!dummy) {
2953         #ifdef HOST_IMM_ADDR32
2954         if(c)
2955           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2956         else
2957         #endif
2958         {
2959           //emit_xorimm(addr,3,tl);
2960           //gen_tlb_addr_r(tl,map);
2961           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2962           int x=0,a=tl;
2963 #ifdef BIG_ENDIAN_MIPS
2964           if(!c) emit_xorimm(addr,3,tl);
2965           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2966 #else
2967           if(!c) a=addr;
2968 #endif
2969 #ifdef PCSX
2970           if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2971 #endif
2972           emit_movzbl_indexed_tlb(x,a,map,tl);
2973         }
2974       }
2975       if(jaddr)
2976         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2977     }
2978     else
2979       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2980   }
2981   if (opcode[i]==0x25) { // LHU
2982     if(!c||memtarget) {
2983       if(!dummy) {
2984         #ifdef HOST_IMM_ADDR32
2985         if(c)
2986           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2987         else
2988         #endif
2989         {
2990           int x=0,a=tl;
2991 #ifdef BIG_ENDIAN_MIPS
2992           if(!c) emit_xorimm(addr,2,tl);
2993           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2994 #else
2995           if(!c) a=addr;
2996 #endif
2997 #ifdef PCSX
2998           if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2999 #endif
3000           //#ifdef
3001           //emit_movzwl_indexed_tlb(x,tl,map,tl);
3002           //#else
3003           if(map>=0) {
3004             gen_tlb_addr_r(a,map);
3005             emit_movzwl_indexed(x,a,tl);
3006           }else{
3007             #ifdef RAM_OFFSET
3008             emit_movzwl_indexed(x,a,tl);
3009             #else
3010             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
3011             #endif
3012           }
3013         }
3014       }
3015       if(jaddr)
3016         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3017     }
3018     else
3019       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3020   }
3021   if (opcode[i]==0x27) { // LWU
3022     assert(th>=0);
3023     if(!c||memtarget) {
3024       if(!dummy) {
3025         int a=addr;
3026 #ifdef PCSX
3027         if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3028 #endif
3029         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
3030         #ifdef HOST_IMM_ADDR32
3031         if(c)
3032           emit_readword_tlb(constmap[i][s]+offset,map,tl);
3033         else
3034         #endif
3035         emit_readword_indexed_tlb(0,a,map,tl);
3036       }
3037       if(jaddr)
3038         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3039     }
3040     else {
3041       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3042     }
3043     emit_zeroreg(th);
3044   }
3045   if (opcode[i]==0x37) { // LD
3046     if(!c||memtarget) {
3047       if(!dummy) {
3048         int a=addr;
3049 #ifdef PCSX
3050         if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3051 #endif
3052         //gen_tlb_addr_r(tl,map);
3053         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
3054         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
3055         #ifdef HOST_IMM_ADDR32
3056         if(c)
3057           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3058         else
3059         #endif
3060         emit_readdword_indexed_tlb(0,a,map,th,tl);
3061       }
3062       if(jaddr)
3063         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3064     }
3065     else
3066       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3067   }
3068  }
3069   //emit_storereg(rt1[i],tl); // DEBUG
3070   //if(opcode[i]==0x23)
3071   //if(opcode[i]==0x24)
3072   //if(opcode[i]==0x23||opcode[i]==0x24)
3073   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
3074   {
3075     //emit_pusha();
3076     save_regs(0x100f);
3077         emit_readword((int)&last_count,ECX);
3078         #ifdef __i386__
3079         if(get_reg(i_regs->regmap,CCREG)<0)
3080           emit_loadreg(CCREG,HOST_CCREG);
3081         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3082         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3083         emit_writeword(HOST_CCREG,(int)&Count);
3084         #endif
3085         #ifdef __arm__
3086         if(get_reg(i_regs->regmap,CCREG)<0)
3087           emit_loadreg(CCREG,0);
3088         else
3089           emit_mov(HOST_CCREG,0);
3090         emit_add(0,ECX,0);
3091         emit_addimm(0,2*ccadj[i],0);
3092         emit_writeword(0,(int)&Count);
3093         #endif
3094     emit_call((int)memdebug);
3095     //emit_popa();
3096     restore_regs(0x100f);
3097   }/**/
3098 }
3099
#ifndef loadlr_assemble
/*
 * Generic placeholder for loadlr_assemble.  It is compiled only when no
 * macro named loadlr_assemble is defined (presumably by the host-specific
 * assembler header - confirm).  Prints a diagnostic and terminates the
 * process with status 1; both arguments are ignored.
 */
void loadlr_assemble(int i,struct regstat *i_regs)
{
  static const char msg[]="Need loadlr_assemble for this architecture.\n";
  (void)i;
  (void)i_regs;
  printf("%s",msg);
  exit(1);
}
#endif
3107
/*
 * Emit host code for the guest store instruction at index i.
 * Handles opcodes 0x28 (SB), 0x29 (SH), 0x2B (SW) and 0x3F (SD).
 *
 *   i      - instruction index; used to index opcode[], rs1[], rs2[],
 *            imm[], constmap[] and ccadj[]
 *   i_regs - register state for this instruction (regmap, wasconst)
 *
 * Outline: choose the address register, emit a compare against RAM_SIZE
 * plus a conditional branch (or a TLB lookup when using_tlb), emit the
 * store itself, emit a compare against invalid_code (non-TLB path only),
 * and finally register a stub for the branch recorded in jaddr.
 */
3108 void store_assemble(int i,struct regstat *i_regs)
3109 {
3110   int s,th,tl,map=-1;
3111   int addr,temp;
3112   int offset;
3113   int jaddr=0,jaddr2,type;
3114   int memtarget=0,c=0;
3115   int agr=AGEN1+(i&1);
3116   u_int hr,reglist=0;
  // th/tl: host registers mapped to the stored value rs2[i]
  // NOTE(review): the |64 lookup presumably selects the upper 32 bits - confirm
3117   th=get_reg(i_regs->regmap,rs2[i]|64);
3118   tl=get_reg(i_regs->regmap,rs2[i]);
  // s: host register mapped to the base register rs1[i] (negative if none)
3119   s=get_reg(i_regs->regmap,rs1[i]);
  // temp: host register mapped to the address-generation slot chosen by the
  // parity of i, falling back to whatever is mapped to -1
3120   temp=get_reg(i_regs->regmap,agr);
3121   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3122   offset=imm[i];
3123   if(s>=0) {
    // c: the wasconst bit for host register s is set
3124     c=(i_regs->wasconst>>s)&1;
3125     if(c) {
      // memtarget: constant address is below 0x80000000+RAM_SIZE (signed compare)
3126       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3127       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3128     }
3129   }
3130   assert(tl>=0);
3131   assert(temp>=0);
  // reglist: bitmask of host registers that currently have a mapping
3132   for(hr=0;hr<HOST_REGS;hr++) {
3133     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3134   }
  // Drop HOST_CCREG from the list when it holds CCREG
3135   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  // Address lives in temp when there is an offset, no mapped base, or a
  // constant base; otherwise the base register is used directly
3136   if(offset||s<0||c) addr=temp;
3137   else addr=s;
3138   if(!using_tlb) {
3139     if(!c) {
3140       #ifdef PCSX
      // sp_in_mirror case for r29-based stores: clear bits 0x00e00000 of the
      // address into HOST_TEMPREG and compare that value instead
3141       if(sp_in_mirror&&rs1[i]==29) {
3142         emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
3143         emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
3144       }
3145       else
3146       #endif
3147       #ifdef R29_HACK
3148       // Strmnnrmn's speed hack
3149       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3150       #endif
3151       emit_cmpimm(addr,RAM_SIZE);
3152       #ifdef DESTRUCTIVE_SHIFT
3153       if(s==addr) emit_mov(s,temp);
3154       #endif
3155       #ifdef R29_HACK
3156       memtarget=1;
3157       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3158       #endif
3159       {
        // Remember where the conditional branch is emitted; it is handed to
        // add_stub() at the end of this function
3160         jaddr=(int)out;
3161         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
3162         // Hint to branch predictor that the branch is unlikely to be taken
3163         if(rs1[i]>=28)
3164           emit_jno_unlikely(0);
3165         else
3166         #endif
3167         emit_jno(0);
3168       }
3169     }
3170   }else{ // using tlb
3171     int x=0;
3172     if (opcode[i]==0x28) x=3; // SB
3173     if (opcode[i]==0x29) x=2; // SH
3174     map=get_reg(i_regs->regmap,TLREG);
3175     assert(map>=0);
    // NOTE(review): constmap[i][s] is evaluated here even when s<0 - confirm
    // that do_tlb_w ignores the value when c==0
3176     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
3177     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3178   }
3179
  // Each opcode branch below emits the store when the address is not constant
  // or is a constant RAM target, and records the stub type for that width
3180   if (opcode[i]==0x28) { // SB
3181     if(!c||memtarget) {
3182       int x=0,a=temp;
3183 #ifdef BIG_ENDIAN_MIPS
3184       if(!c) emit_xorimm(addr,3,temp);
3185       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3186 #else
3187       if(!c) a=addr;
3188 #endif
3189 #ifdef PCSX
      // Use the masked address computed above for the sp_in_mirror case
3190       if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3191 #endif
3192       //gen_tlb_addr_w(temp,map);
3193       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3194       emit_writebyte_indexed_tlb(tl,x,a,map,a);
3195     }
3196     type=STOREB_STUB;
3197   }
3198   if (opcode[i]==0x29) { // SH
3199     if(!c||memtarget) {
3200       int x=0,a=temp;
3201 #ifdef BIG_ENDIAN_MIPS
3202       if(!c) emit_xorimm(addr,2,temp);
3203       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3204 #else
3205       if(!c) a=addr;
3206 #endif
3207 #ifdef PCSX
3208       if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3209 #endif
3210       //#ifdef
3211       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3212       //#else
3213       if(map>=0) {
3214         gen_tlb_addr_w(a,map);
3215         emit_writehword_indexed(tl,x,a);
3216       }else
3217         emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
3218     }
3219     type=STOREH_STUB;
3220   }
3221   if (opcode[i]==0x2B) { // SW
3222     if(!c||memtarget) {
3223       int a=addr;
3224 #ifdef PCSX
3225       if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3226 #endif
3227       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3228       emit_writeword_indexed_tlb(tl,0,a,map,temp);
3229     }
3230     type=STOREW_STUB;
3231   }
3232   if (opcode[i]==0x3F) { // SD
3233     if(!c||memtarget) {
3234       int a=addr;
3235 #ifdef PCSX
3236       if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3237 #endif
3238       if(rs2[i]) {
3239         assert(th>=0);
3240         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3241         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3242         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
3243       }else{
3244         // Store zero
3245         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3246         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3247         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
3248       }
3249     }
3250     type=STORED_STUB;
3251   }
3252   if(!using_tlb) {
3253     if(!c||memtarget) {
      // Compare against the invalid_code table and call/branch to the
      // invalidation path on "not equal".
      // NOTE(review): presumably this detects stores into pages that hold
      // translated code (table indexed by addr>>12) - confirm
3254       #ifdef DESTRUCTIVE_SHIFT
3255       // The x86 shift operation is 'destructive'; it overwrites the
3256       // source register, so we need to make a copy first and use that.
3257       addr=temp;
3258       #endif
3259       #if defined(HOST_IMM8)
3260       int ir=get_reg(i_regs->regmap,INVCP);
3261       assert(ir>=0);
3262       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3263       #else
3264       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3265       #endif
3266       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3267       emit_callne(invalidate_addr_reg[addr]);
3268       #else
3269       jaddr2=(int)out;
3270       emit_jne(0);
3271       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3272       #endif
3273     }
3274   }
  // A recorded branch gets an out-of-line stub; a constant address that is
  // not a RAM target gets an inline write stub instead
3275   if(jaddr) {
3276     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3277   } else if(c&&!memtarget) {
3278     inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
3279   }
  // Disabled memdebug instrumentation follows
3280   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3281   //if(opcode[i]==0x2B || opcode[i]==0x28)
3282   //if(opcode[i]==0x2B || opcode[i]==0x29)
3283   //if(opcode[i]==0x2B)
3284   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3285   {
3286     //emit_pusha();
3287     save_regs(0x100f);
3288         emit_readword((int)&last_count,ECX);
3289         #ifdef __i386__
3290         if(get_reg(i_regs->regmap,CCREG)<0)
3291           emit_loadreg(CCREG,HOST_CCREG);
3292         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3293         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3294         emit_writeword(HOST_CCREG,(int)&Count);
3295         #endif
3296         #ifdef __arm__
3297         if(get_reg(i_regs->regmap,CCREG)<0)
3298           emit_loadreg(CCREG,0);
3299         else
3300           emit_mov(HOST_CCREG,0);
3301         emit_add(0,ECX,0);
3302         emit_addimm(0,2*ccadj[i],0);
3303         emit_writeword(0,(int)&Count);
3304         #endif
3305     emit_call((int)memdebug);
3306     //emit_popa();
3307     restore_regs(0x100f);
3308   }/**/
3309 }
3310
/*
 * Emit host code for the unaligned-store instructions at index i:
 * opcodes 0x2A (SWL), 0x2E (SWR), 0x2C (SDL) and 0x2D (SDR).
 *
 *   i      - instruction index; used to index opcode[], rs1[], rs2[],
 *            imm[], constmap[] and ccadj[]
 *   i_regs - register state for this instruction (regmap, isconst)
 *
 * The generated code tests bits 1 and 0 of the address held in temp and
 * dispatches to one of four code sequences (marked "0".."3" below), each
 * writing a different combination of bytes/halfwords/words.  The jump
 * targets of the dispatch are patched with set_jump_target() once the
 * destination position is known.
 */
3311 void storelr_assemble(int i,struct regstat *i_regs)
3312 {
3313   int s,th,tl;
3314   int temp;
3315   int temp2;
3316   int offset;
3317   int jaddr=0,jaddr2;
3318   int case1,case2,case3;
3319   int done0,done1,done2;
3320   int memtarget=0,c=0;
3321   int agr=AGEN1+(i&1);
3322   u_int hr,reglist=0;
  // th/tl: host registers mapped to the stored value rs2[i]; s: base rs1[i]
3323   th=get_reg(i_regs->regmap,rs2[i]|64);
3324   tl=get_reg(i_regs->regmap,rs2[i]);
3325   s=get_reg(i_regs->regmap,rs1[i]);
3326   temp=get_reg(i_regs->regmap,agr);
3327   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3328   offset=imm[i];
3329   if(s>=0) {
    // NOTE(review): this reads isconst whereas store_assemble reads
    // wasconst - confirm the difference is intentional
3330     c=(i_regs->isconst>>s)&1;
3331     if(c) {
3332       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3333       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3334     }
3335   }
3336   assert(tl>=0);
  // reglist: bitmask of host registers that currently have a mapping
3337   for(hr=0;hr<HOST_REGS;hr++) {
3338     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3339   }
3340   assert(temp>=0);
3341   if(!using_tlb) {
3342     if(!c) {
      // Non-constant address: compare against RAM_SIZE, make sure the address
      // ends up in temp, and record the conditional branch in jaddr
3343       emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3344       if(!offset&&s!=temp) emit_mov(s,temp);
3345       jaddr=(int)out;
3346       emit_jno(0);
3347     }
3348     else
3349     {
      // Constant address that is not a RAM target (or base is r0):
      // emit an unconditional jump recorded in jaddr
3350       if(!memtarget||!rs1[i]) {
3351         jaddr=(int)out;
3352         emit_jmp(0);
3353       }
3354     }
    // Convert the guest address in temp to a host address; undone again
    // before the invalid_code compare at the end of this function
3355     #ifdef RAM_OFFSET
3356     int map=get_reg(i_regs->regmap,ROREG);
3357     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3358     gen_tlb_addr_w(temp,map);
3359     #else
3360     if((u_int)rdram!=0x80000000) 
3361       emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3362     #endif
3363   }else{ // using tlb
3364     int map=get_reg(i_regs->regmap,TLREG);
3365     assert(map>=0);
3366     map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3367     if(!c&&!offset&&s>=0) emit_mov(s,temp);
3368     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3369     if(!jaddr&&!memtarget) {
3370       jaddr=(int)out;
3371       emit_jmp(0);
3372     }
3373     gen_tlb_addr_w(temp,map);
3374   }
3375
  // temp2 is only assigned (and only used) for the 64-bit variants; when the
  // source is r0 all three value registers alias tl
3376   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3377     temp2=get_reg(i_regs->regmap,FTEMP);
3378     if(!rs2[i]) temp2=th=tl;
3379   }
3380
3381 #ifndef BIG_ENDIAN_MIPS
3382     emit_xorimm(temp,3,temp);
3383 #endif
  // Dispatch on bit 1, then bit 0, of the address in temp
3384   emit_testimm(temp,2);
3385   case2=(int)out;
3386   emit_jne(0);
3387   emit_testimm(temp,1);
3388   case1=(int)out;
3389   emit_jne(0);
3390   // 0
3391   if (opcode[i]==0x2A) { // SWL
3392     emit_writeword_indexed(tl,0,temp);
3393   }
3394   if (opcode[i]==0x2E) { // SWR
3395     emit_writebyte_indexed(tl,3,temp);
3396   }
3397   if (opcode[i]==0x2C) { // SDL
3398     emit_writeword_indexed(th,0,temp);
3399     if(rs2[i]) emit_mov(tl,temp2);
3400   }
3401   if (opcode[i]==0x2D) { // SDR
3402     emit_writebyte_indexed(tl,3,temp);
3403     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3404   }
3405   done0=(int)out;
3406   emit_jmp(0);
3407   // 1
3408   set_jump_target(case1,(int)out);
3409   if (opcode[i]==0x2A) { // SWL
3410     // Write 3 msb into three least significant bytes
    // The rotates total 32 bits, so tl holds its original value afterwards
3411     if(rs2[i]) emit_rorimm(tl,8,tl);
3412     emit_writehword_indexed(tl,-1,temp);
3413     if(rs2[i]) emit_rorimm(tl,16,tl);
3414     emit_writebyte_indexed(tl,1,temp);
3415     if(rs2[i]) emit_rorimm(tl,8,tl);
3416   }
3417   if (opcode[i]==0x2E) { // SWR
3418     // Write two lsb into two most significant bytes
3419     emit_writehword_indexed(tl,1,temp);
3420   }
3421   if (opcode[i]==0x2C) { // SDL
3422     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3423     // Write 3 msb into three least significant bytes
3424     if(rs2[i]) emit_rorimm(th,8,th);
3425     emit_writehword_indexed(th,-1,temp);
3426     if(rs2[i]) emit_rorimm(th,16,th);
3427     emit_writebyte_indexed(th,1,temp);
3428     if(rs2[i]) emit_rorimm(th,8,th);
3429   }
3430   if (opcode[i]==0x2D) { // SDR
3431     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3432     // Write two lsb into two most significant bytes
3433     emit_writehword_indexed(tl,1,temp);
3434   }
3435   done1=(int)out;
3436   emit_jmp(0);
3437   // 2
3438   set_jump_target(case2,(int)out);
3439   emit_testimm(temp,1);
3440   case3=(int)out;
3441   emit_jne(0);
3442   if (opcode[i]==0x2A) { // SWL
3443     // Write two msb into two least significant bytes
3444     if(rs2[i]) emit_rorimm(tl,16,tl);
3445     emit_writehword_indexed(tl,-2,temp);
3446     if(rs2[i]) emit_rorimm(tl,16,tl);
3447   }
3448   if (opcode[i]==0x2E) { // SWR
3449     // Write 3 lsb into three most significant bytes
3450     emit_writebyte_indexed(tl,-1,temp);
3451     if(rs2[i]) emit_rorimm(tl,8,tl);
3452     emit_writehword_indexed(tl,0,temp);
3453     if(rs2[i]) emit_rorimm(tl,24,tl);
3454   }
3455   if (opcode[i]==0x2C) { // SDL
3456     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3457     // Write two msb into two least significant bytes
3458     if(rs2[i]) emit_rorimm(th,16,th);
3459     emit_writehword_indexed(th,-2,temp);
3460     if(rs2[i]) emit_rorimm(th,16,th);
3461   }
3462   if (opcode[i]==0x2D) { // SDR
3463     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3464     // Write 3 lsb into three most significant bytes
3465     emit_writebyte_indexed(tl,-1,temp);
3466     if(rs2[i]) emit_rorimm(tl,8,tl);
3467     emit_writehword_indexed(tl,0,temp);
3468     if(rs2[i]) emit_rorimm(tl,24,tl);
3469   }
3470   done2=(int)out;
3471   emit_jmp(0);
3472   // 3
3473   set_jump_target(case3,(int)out);
3474   if (opcode[i]==0x2A) { // SWL
3475     // Write msb into least significant byte
3476     if(rs2[i]) emit_rorimm(tl,24,tl);
3477     emit_writebyte_indexed(tl,-3,temp);
3478     if(rs2[i]) emit_rorimm(tl,8,tl);
3479   }
3480   if (opcode[i]==0x2E) { // SWR
3481     // Write entire word
3482     emit_writeword_indexed(tl,-3,temp);
3483   }
3484   if (opcode[i]==0x2C) { // SDL
3485     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3486     // Write msb into least significant byte
3487     if(rs2[i]) emit_rorimm(th,24,th);
3488     emit_writebyte_indexed(th,-3,temp);
3489     if(rs2[i]) emit_rorimm(th,8,th);
3490   }
3491   if (opcode[i]==0x2D) { // SDR
3492     if(rs2[i]) emit_mov(th,temp2);
3493     // Write entire word
3494     emit_writeword_indexed(tl,-3,temp);
3495   }
  // Cases 0-2 jump here; case 3 falls through
3496   set_jump_target(done0,(int)out);
3497   set_jump_target(done1,(int)out);
3498   set_jump_target(done2,(int)out);
  // 64-bit variants: conditionally write temp2 to the neighbouring word,
  // selected by bit 2 of the address (done0 is reused as the skip label)
3499   if (opcode[i]==0x2C) { // SDL
3500     emit_testimm(temp,4);
3501     done0=(int)out;
3502     emit_jne(0);
3503     emit_andimm(temp,~3,temp);
3504     emit_writeword_indexed(temp2,4,temp);
3505     set_jump_target(done0,(int)out);
3506   }
3507   if (opcode[i]==0x2D) { // SDR
3508     emit_testimm(temp,4);
3509     done0=(int)out;
3510     emit_jeq(0);
3511     emit_andimm(temp,~3,temp);
3512     emit_writeword_indexed(temp2,-4,temp);
3513     set_jump_target(done0,(int)out);
3514   }
  // Register the out-of-line stub for the branch recorded in jaddr
3515   if(!c||!memtarget)
3516     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3517   if(!using_tlb) {
    // Turn temp back into a guest address, then compare against invalid_code
3518     #ifdef RAM_OFFSET
3519     int map=get_reg(i_regs->regmap,ROREG);
3520     if(map<0) map=HOST_TEMPREG;
3521     gen_orig_addr_w(temp,map);
3522     #else
3523     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3524     #endif
3525     #if defined(HOST_IMM8)
3526     int ir=get_reg(i_regs->regmap,INVCP);
3527     assert(ir>=0);
3528     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3529     #else
3530     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3531     #endif
3532     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3533     emit_callne(invalidate_addr_reg[temp]);
3534     #else
3535     jaddr2=(int)out;
3536     emit_jne(0);
3537     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3538     #endif
3539   }
3540   /*
3541     emit_pusha();
3542     //save_regs(0x100f);
3543         emit_readword((int)&last_count,ECX);
3544         if(get_reg(i_regs->regmap,CCREG)<0)
3545           emit_loadreg(CCREG,HOST_CCREG);
3546         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3547         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3548         emit_writeword(HOST_CCREG,(int)&Count);
3549     emit_call((int)memdebug);
3550     emit_popa();
3551     //restore_regs(0x100f);
3552   /**/
3553 }
3554
/*
 * Emit host code for coprocessor-1 loads and stores at index i:
 * opcodes 0x31 (LWC1), 0x35 (LDC1), 0x39 (SWC1) and 0x3D (SDC1).
 * When DISABLE_COP1 is defined the function only calls cop1_unusable().
 *
 *   i      - instruction index; used to index opcode[], rs1[], imm[],
 *            source[], constmap[] and ccadj[]
 *   i_regs - register state for this instruction (regmap, wasconst)
 *
 * The FP register location is fetched from reg_cop1_simple[] /
 * reg_cop1_double[] (indexed by bits 16..20 of source[i]); stores read the
 * value through that pointer before writing guest memory, loads read guest
 * memory first and write through the pointer at the end.
 */
3555 void c1ls_assemble(int i,struct regstat *i_regs)
3556 {
3557 #ifndef DISABLE_COP1
3558   int s,th,tl;
3559   int temp,ar;
3560   int map=-1;
3561   int offset;
3562   int c=0;
3563   int jaddr,jaddr2=0,jaddr3,type;
3564   int agr=AGEN1+(i&1);
3565   u_int hr,reglist=0;
  // th/tl: host registers mapped to FTEMP, used to carry the transferred value
3566   th=get_reg(i_regs->regmap,FTEMP|64);
3567   tl=get_reg(i_regs->regmap,FTEMP);
3568   s=get_reg(i_regs->regmap,rs1[i]);
3569   temp=get_reg(i_regs->regmap,agr);
3570   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3571   offset=imm[i];
3572   assert(tl>=0);
3573   assert(rs1[i]>0);
3574   assert(temp>=0);
  // reglist: bitmask of host registers that currently have a mapping
3575   for(hr=0;hr<HOST_REGS;hr++) {
3576     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3577   }
3578   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3579   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3580   {
3581     // Loads use a temporary register which we need to save
3582     reglist|=1<<temp;
3583   }
  // ar: register that holds the computed guest address
3584   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3585     ar=temp;
3586   else // LWC1/LDC1
3587     ar=tl;
3588   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3589   //else c=(i_regs->wasconst>>s)&1;
3590   if(s>=0) c=(i_regs->wasconst>>s)&1;
3591   // Check cop1 unusable
  // Emitted only once per run of COP1 instructions: cop1_usable is set
  // after the first check
3592   if(!cop1_usable) {
3593     signed char rs=get_reg(i_regs->regmap,CSREG);
3594     assert(rs>=0);
3595     emit_testimm(rs,0x20000000);
3596     jaddr=(int)out;
3597     emit_jeq(0);
3598     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3599     cop1_usable=1;
3600   }
3601   if (opcode[i]==0x39) { // SWC1 (get float address)
3602     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3603   }
3604   if (opcode[i]==0x3D) { // SDC1 (get double address)
3605     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3606   }
3607   // Generate address + offset
  // The compare is emitted here; the matching conditional branch is emitted
  // further down, after the intervening readword instructions
3608   if(!using_tlb) {
3609     if(!c)
3610       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3611   }
3612   else
3613   {
3614     map=get_reg(i_regs->regmap,TLREG);
3615     assert(map>=0);
3616     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3617       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3618     }
3619     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3620       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3621     }
3622   }
3623   if (opcode[i]==0x39) { // SWC1 (read float)
3624     emit_readword_indexed(0,tl,tl);
3625   }
3626   if (opcode[i]==0x3D) { // SDC1 (read double)
3627     emit_readword_indexed(4,tl,th);
3628     emit_readword_indexed(0,tl,tl);
3629   }
3630   if (opcode[i]==0x31) { // LWC1 (get target address)
3631     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3632   }
3633   if (opcode[i]==0x35) { // LDC1 (get target address)
3634     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3635   }
  // Branch to the slow path: conditional for a non-constant address,
  // unconditional for a constant address at or above 0x80000000+RAM_SIZE
3636   if(!using_tlb) {
3637     if(!c) {
3638       jaddr2=(int)out;
3639       emit_jno(0);
3640     }
3641     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80000000+RAM_SIZE) {
3642       jaddr2=(int)out;
3643       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3644     }
3645     #ifdef DESTRUCTIVE_SHIFT
3646     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3647       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3648     }
3649     #endif
3650   }else{
3651     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3652       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3653     }
3654     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3655       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3656     }
3657   }
  // Guest memory access; type records which stub kind matches the access
3658   if (opcode[i]==0x31) { // LWC1
3659     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3660     //gen_tlb_addr_r(ar,map);
3661     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3662     #ifdef HOST_IMM_ADDR32
3663     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3664     else
3665     #endif
3666     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3667     type=LOADW_STUB;
3668   }
3669   if (opcode[i]==0x35) { // LDC1
3670     assert(th>=0);
3671     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3672     //gen_tlb_addr_r(ar,map);
3673     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3674     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3675     #ifdef HOST_IMM_ADDR32
3676     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3677     else
3678     #endif
3679     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3680     type=LOADD_STUB;
3681   }
3682   if (opcode[i]==0x39) { // SWC1
3683     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3684     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3685     type=STOREW_STUB;
3686   }
3687   if (opcode[i]==0x3D) { // SDC1
3688     assert(th>=0);
3689     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3690     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3691     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3692     type=STORED_STUB;
3693   }
3694   if(!using_tlb) {
    // Stores only: compare against invalid_code and call/branch to the
    // invalidation path on "not equal"
3695     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3696       #ifndef DESTRUCTIVE_SHIFT
3697       temp=offset||c||s<0?ar:s;
3698       #endif
3699       #if defined(HOST_IMM8)
3700       int ir=get_reg(i_regs->regmap,INVCP);
3701       assert(ir>=0);
3702       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3703       #else
3704       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3705       #endif
3706       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3707       emit_callne(invalidate_addr_reg[temp]);
3708       #else
3709       jaddr3=(int)out;
3710       emit_jne(0);
3711       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3712       #endif
3713     }
3714   }
3715   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
  // Loads: write the value through the FP register pointer loaded into temp
3716   if (opcode[i]==0x31) { // LWC1 (write float)
3717     emit_writeword_indexed(tl,0,temp);
3718   }
3719   if (opcode[i]==0x35) { // LDC1 (write double)
3720     emit_writeword_indexed(th,4,temp);
3721     emit_writeword_indexed(tl,0,temp);
3722   }
3723   //if(opcode[i]==0x39)
3724   /*if(opcode[i]==0x39||opcode[i]==0x31)
3725   {
3726     emit_pusha();
3727         emit_readword((int)&last_count,ECX);
3728         if(get_reg(i_regs->regmap,CCREG)<0)
3729           emit_loadreg(CCREG,HOST_CCREG);
3730         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3731         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3732         emit_writeword(HOST_CCREG,(int)&Count);
3733     emit_call((int)memdebug);
3734     emit_popa();
3735   }/**/
3736 #else
3737   cop1_unusable(i, i_regs);
3738 #endif
3739 }
3740
/*
 * Emit host code for coprocessor-2 loads and stores at index i:
 * opcodes 0x32 (LWC2) and 0x3a (SWC2).  Only the non-TLB path is
 * supported (asserted below).
 *
 *   i      - instruction index; used to index opcode[], rs1[], imm[],
 *            source[], constmap[] and ccadj[]
 *   i_regs - register state for this instruction (regmap, wasconst)
 *
 * The coprocessor register number is taken from bits 16..20 of source[i].
 */
3741 void c2ls_assemble(int i,struct regstat *i_regs)
3742 {
3743   int s,tl;
3744   int ar;
3745   int offset;
3746   int memtarget=0,c=0;
3747   int jaddr,jaddr2=0,jaddr3,type;
3748   int agr=AGEN1+(i&1);
3749   u_int hr,reglist=0;
3750   u_int copr=(source[i]>>16)&0x1f;
  // s: host register mapped to the base rs1[i]; tl: host register mapped to
  // FTEMP, used to carry the transferred word
3751   s=get_reg(i_regs->regmap,rs1[i]);
3752   tl=get_reg(i_regs->regmap,FTEMP);
3753   offset=imm[i];
3754   assert(rs1[i]>0);
3755   assert(tl>=0);
3756   assert(!using_tlb);
3757
  // reglist: bitmask of host registers that currently have a mapping,
  // minus HOST_CCREG when it holds CCREG
3758   for(hr=0;hr<HOST_REGS;hr++) {
3759     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3760   }
3761   if(i_regs->regmap[HOST_CCREG]==CCREG)
3762     reglist&=~(1<<HOST_CCREG);
3763
3764   // get the address
3765   if (opcode[i]==0x3a) { // SWC2
3766     ar=get_reg(i_regs->regmap,agr);
3767     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3768     reglist|=1<<ar;
3769   } else { // LWC2
3770     ar=tl;
3771   }
3772   if(s>=0) c=(i_regs->wasconst>>s)&1;
  // c is only set when s>=0, so the && short-circuit keeps constmap[i][s]
  // from being read with a negative s here
3773   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
  // With no offset and a non-constant mapped base, address the base directly
3774   if (!offset&&!c&&s>=0) ar=s;
3775   assert(ar>=0);
3776
3777   if (opcode[i]==0x3a) { // SWC2
    // NOTE(review): presumably fetches COP2 data register copr into tl,
    // using HOST_TEMPREG as scratch - confirm against cop2_get_dreg
3778     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3779     type=STOREW_STUB;
3780   }
3781   else
3782     type=LOADW_STUB;
3783
  // Constant address that is not a RAM target: always take the stub
3784   if(c&&!memtarget) {
3785     jaddr2=(int)out;
3786     emit_jmp(0); // inline_readstub/inline_writestub?
3787   }
3788   else {
3789     if(!c) {
      // Non-constant address: compare against RAM_SIZE and record the
      // conditional branch in jaddr2
3790       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3791       jaddr2=(int)out;
3792       emit_jno(0);
3793     }
3794     if (opcode[i]==0x32) { // LWC2
3795       #ifdef HOST_IMM_ADDR32
3796       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3797       else
3798       #endif
3799       emit_readword_indexed(0,ar,tl);
3800     }
3801     if (opcode[i]==0x3a) { // SWC2
3802       #ifdef DESTRUCTIVE_SHIFT
3803       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3804       #endif
3805       emit_writeword_indexed(tl,0,ar);
3806     }
3807   }
3808   if(jaddr2)
3809     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3810   if (opcode[i]==0x3a) { // SWC2
    // Compare against invalid_code and call/branch to the invalidation
    // path on "not equal"
3811 #if defined(HOST_IMM8)
3812     int ir=get_reg(i_regs->regmap,INVCP);
3813     assert(ir>=0);
3814     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3815 #else
3816     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3817 #endif
3818     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3819     emit_callne(invalidate_addr_reg[ar]);
3820     #else
3821     jaddr3=(int)out;
3822     emit_jne(0);
3823     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3824     #endif
3825   }
3826   if (opcode[i]==0x32) { // LWC2
    // NOTE(review): presumably stores tl into COP2 data register copr - confirm
3827     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3828   }
3829 }
3830
#ifndef multdiv_assemble
/*
 * Fallback used when the host backend does not supply its own
 * multdiv_assemble: print a diagnostic and terminate with status 1.
 * Both arguments are ignored.
 */
void multdiv_assemble(int i, struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  printf("Need multdiv_assemble for this architecture.\n");
  exit(1);
}
#endif
3838
3839 void mov_assemble(int i,struct regstat *i_regs)
3840 {
3841   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3842   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3843   if(rt1[i]) {
3844     signed char sh,sl,th,tl;
3845     th=get_reg(i_regs->regmap,rt1[i]|64);
3846     tl=get_reg(i_regs->regmap,rt1[i]);
3847     //assert(tl>=0);
3848     if(tl>=0) {
3849       sh=get_reg(i_regs->regmap,rs1[i]|64);
3850       sl=get_reg(i_regs->regmap,rs1[i]);
3851       if(sl>=0) emit_mov(sl,tl);
3852       else emit_loadreg(rs1[i],tl);
3853       if(th>=0) {
3854         if(sh>=0) emit_mov(sh,th);
3855         else emit_loadreg(rs1[i]|64,th);
3856       }
3857     }
3858   }
3859 }
3860
#ifndef fconv_assemble
/*
 * Fallback used when the host backend does not supply its own
 * fconv_assemble: print a diagnostic and terminate with status 1.
 * Both arguments are ignored.
 */
void fconv_assemble(int i, struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  printf("Need fconv_assemble for this architecture.\n");
  exit(1);
}
#endif
3868
#if 0
/*
 * Disabled fallback for float_assemble (compiled out by the #if 0):
 * would print a diagnostic and terminate with status 1.
 */
void float_assemble(int i, struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  printf("Need float_assemble for this architecture.\n");
  exit(1);
}
#endif
3876
3877 void syscall_assemble(int i,struct regstat *i_regs)
3878 {
3879   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3880   assert(ccreg==HOST_CCREG);
3881   assert(!is_delayslot);
3882   emit_movimm(start+i*4,EAX); // Get PC
3883   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3884   emit_jmp((int)jump_syscall_hle); // XXX
3885 }
3886
3887 void hlecall_assemble(int i,struct regstat *i_regs)
3888 {
3889   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3890   assert(ccreg==HOST_CCREG);
3891   assert(!is_delayslot);
3892   emit_movimm(start+i*4+4,0); // Get PC
3893   emit_movimm((int)psxHLEt[source[i]&7],1);
3894   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // XXX
3895   emit_jmp((int)jump_hlecall);
3896 }
3897
3898 void intcall_assemble(int i,struct regstat *i_regs)
3899 {
3900   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3901   assert(ccreg==HOST_CCREG);
3902   assert(!is_delayslot);
3903   emit_movimm(start+i*4,0); // Get PC
3904   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG);
3905   emit_jmp((int)jump_intcall);
3906 }
3907
3908 void ds_assemble(int i,struct regstat *i_regs)
3909 {
3910   is_delayslot=1;
3911   switch(itype[i]) {
3912     case ALU:
3913       alu_assemble(i,i_regs);break;
3914     case IMM16:
3915       imm16_assemble(i,i_regs);break;
3916     case SHIFT:
3917       shift_assemble(i,i_regs);break;
3918     case SHIFTIMM:
3919       shiftimm_assemble(i,i_regs);break;
3920     case LOAD:
3921       load_assemble(i,i_regs);break;
3922     case LOADLR:
3923       loadlr_assemble(i,i_regs);break;
3924     case STORE:
3925       store_assemble(i,i_regs);break;
3926     case STORELR:
3927       storelr_assemble(i,i_regs);break;
3928     case COP0:
3929       cop0_assemble(i,i_regs);break;
3930     case COP1:
3931       cop1_assemble(i,i_regs);break;
3932     case C1LS:
3933       c1ls_assemble(i,i_regs);break;
3934     case COP2:
3935       cop2_assemble(i,i_regs);break;
3936     case C2LS:
3937       c2ls_assemble(i,i_regs);break;
3938     case C2OP:
3939       c2op_assemble(i,i_regs);break;
3940     case FCONV:
3941       fconv_assemble(i,i_regs);break;
3942     case FLOAT:
3943       float_assemble(i,i_regs);break;
3944     case FCOMP:
3945       fcomp_assemble(i,i_regs);break;
3946     case MULTDIV:
3947       multdiv_assemble(i,i_regs);break;
3948     case MOV:
3949       mov_assemble(i,i_regs);break;
3950     case SYSCALL:
3951     case HLECALL:
3952     case INTCALL:
3953     case SPAN:
3954     case UJUMP:
3955     case RJUMP:
3956     case CJUMP:
3957     case SJUMP:
3958     case FJUMP:
3959       printf("Jump in the delay slot.  This is probably a bug.\n");
3960   }
3961   is_delayslot=0;
3962 }
3963
3964 // Is the branch target a valid internal jump?
3965 int internal_branch(uint64_t i_is32,int addr)
3966 {
3967   if(addr&1) return 0; // Indirect (register) jump
3968   if(addr>=start && addr<start+slen*4-4)
3969   {
3970     int t=(addr-start)>>2;
3971     // Delay slots are not valid branch targets
3972     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3973     // 64 -> 32 bit transition requires a recompile
3974     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3975     {
3976       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3977       else printf("optimizable: yes\n");
3978     }*/
3979     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3980 #ifndef FORCE32
3981     if(requires_32bit[t]&~i_is32) return 0;
3982     else
3983 #endif
3984       return 1;
3985   }
3986   return 0;
3987 }
3988
3989 #ifndef wb_invalidate
3990 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3991   uint64_t u,uint64_t uu)
3992 {
3993   int hr;
3994   for(hr=0;hr<HOST_REGS;hr++) {
3995     if(hr!=EXCLUDE_REG) {
3996       if(pre[hr]!=entry[hr]) {
3997         if(pre[hr]>=0) {
3998           if((dirty>>hr)&1) {
3999             if(get_reg(entry,pre[hr])<0) {
4000               if(pre[hr]<64) {
4001                 if(!((u>>pre[hr])&1)) {
4002                   emit_storereg(pre[hr],hr);
4003                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
4004                     emit_sarimm(hr,31,hr);
4005                     emit_storereg(pre[hr]|64,hr);
4006                   }
4007                 }
4008               }else{
4009                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
4010                   emit_storereg(pre[hr],hr);
4011                 }
4012               }
4013             }
4014           }
4015         }
4016       }
4017     }
4018   }
4019   // Move from one register to another (no writeback)
4020   for(hr=0;hr<HOST_REGS;hr++) {
4021     if(hr!=EXCLUDE_REG) {
4022       if(pre[hr]!=entry[hr]) {
4023         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
4024           int nr;
4025           if((nr=get_reg(entry,pre[hr]))>=0) {
4026             emit_mov(hr,nr);
4027           }
4028         }
4029       }
4030     }
4031   }
4032 }
4033 #endif
4034
4035 // Load the specified registers
4036 // This only loads the registers given as arguments because
4037 // we don't want to load things that will be overwritten
4038 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
4039 {
4040   int hr;
4041   // Load 32-bit regs
4042   for(hr=0;hr<HOST_REGS;hr++) {
4043     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4044       if(entry[hr]!=regmap[hr]) {
4045         if(regmap[hr]==rs1||regmap[hr]==rs2)
4046         {
4047           if(regmap[hr]==0) {
4048             emit_zeroreg(hr);
4049           }
4050           else
4051           {
4052             emit_loadreg(regmap[hr],hr);
4053           }
4054         }
4055       }
4056     }
4057   }
4058   //Load 64-bit regs
4059   for(hr=0;hr<HOST_REGS;hr++) {
4060     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4061       if(entry[hr]!=regmap[hr]) {
4062         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
4063         {
4064           assert(regmap[hr]!=64);
4065           if((is32>>(regmap[hr]&63))&1) {
4066             int lr=get_reg(regmap,regmap[hr]-64);
4067             if(lr>=0)
4068               emit_sarimm(lr,31,hr);
4069             else
4070               emit_loadreg(regmap[hr],hr);
4071           }
4072           else
4073           {
4074             emit_loadreg(regmap[hr],hr);
4075           }
4076         }
4077       }
4078     }
4079   }
4080 }
4081
4082 // Load registers prior to the start of a loop
4083 // so that they are not loaded within the loop
4084 static void loop_preload(signed char pre[],signed char entry[])
4085 {
4086   int hr;
4087   for(hr=0;hr<HOST_REGS;hr++) {
4088     if(hr!=EXCLUDE_REG) {
4089       if(pre[hr]!=entry[hr]) {
4090         if(entry[hr]>=0) {
4091           if(get_reg(pre,entry[hr])<0) {
4092             assem_debug("loop preload:\n");
4093             //printf("loop preload: %d\n",hr);
4094             if(entry[hr]==0) {
4095               emit_zeroreg(hr);
4096             }
4097             else if(entry[hr]<TEMPREG)
4098             {
4099               emit_loadreg(entry[hr],hr);
4100             }
4101             else if(entry[hr]-64<TEMPREG)
4102             {
4103               emit_loadreg(entry[hr],hr);
4104             }
4105           }
4106         }
4107       }
4108     }
4109   }
4110 }
4111
4112 // Generate address for load/store instruction
4113 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
4114 void address_generation(int i,struct regstat *i_regs,signed char entry[])
4115 {
4116   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
4117     int ra;
4118     int agr=AGEN1+(i&1);
4119     int mgr=MGEN1+(i&1);
4120     if(itype[i]==LOAD) {
4121       ra=get_reg(i_regs->regmap,rt1[i]);
4122       if(ra<0) ra=get_reg(i_regs->regmap,-1); 
4123       assert(ra>=0);
4124     }
4125     if(itype[i]==LOADLR) {
4126       ra=get_reg(i_regs->regmap,FTEMP);
4127     }
4128     if(itype[i]==STORE||itype[i]==STORELR) {
4129       ra=get_reg(i_regs->regmap,agr);
4130       if(ra<0) ra=get_reg(i_regs->regmap,-1);
4131     }
4132     if(itype[i]==C1LS||itype[i]==C2LS) {
4133       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
4134         ra=get_reg(i_regs->regmap,FTEMP);
4135       else { // SWC1/SDC1/SWC2/SDC2
4136         ra=get_reg(i_regs->regmap,agr);
4137         if(ra<0) ra=get_reg(i_regs->regmap,-1);
4138       }
4139     }
4140     int rs=get_reg(i_regs->regmap,rs1[i]);
4141     int rm=get_reg(i_regs->regmap,TLREG);
4142     if(ra>=0) {
4143       int offset=imm[i];
4144       int c=(i_regs->wasconst>>rs)&1;
4145       if(rs1[i]==0) {
4146         // Using r0 as a base address
4147         /*if(rm>=0) {
4148           if(!entry||entry[rm]!=mgr) {
4149             generate_map_const(offset,rm);
4150           } // else did it in the previous cycle
4151         }*/
4152         if(!entry||entry[ra]!=agr) {
4153           if (opcode[i]==0x22||opcode[i]==0x26) {
4154             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4155           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4156             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4157           }else{
4158             emit_movimm(offset,ra);
4159           }
4160         } // else did it in the previous cycle
4161       }
4162       else if(rs<0) {
4163         if(!entry||entry[ra]!=rs1[i])
4164           emit_loadreg(rs1[i],ra);
4165         //if(!entry||entry[ra]!=rs1[i])
4166         //  printf("poor load scheduling!\n");
4167       }
4168       else if(c) {
4169         if(rm>=0) {
4170           if(!entry||entry[rm]!=mgr) {
4171             if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a) {
4172               // Stores to memory go thru the mapper to detect self-modifying
4173               // code, loads don't.
4174               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
4175                  (unsigned int)(constmap[i][rs]+offset)<0x80000000+RAM_SIZE )
4176                 generate_map_const(constmap[i][rs]+offset,rm);
4177             }else{
4178               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
4179                 generate_map_const(constmap[i][rs]+offset,rm);
4180             }
4181           }
4182         }
4183         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
4184           if(!entry||entry[ra]!=agr) {
4185             if (opcode[i]==0x22||opcode[i]==0x26) {
4186               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4187             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4188               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4189             }else{
4190               #ifdef HOST_IMM_ADDR32
4191               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4192                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
4193               #endif
4194               emit_movimm(constmap[i][rs]+offset,ra);
4195             }
4196           } // else did it in the previous cycle
4197         } // else load_consts already did it
4198       }
4199       if(offset&&!c&&rs1[i]) {
4200         if(rs>=0) {
4201           emit_addimm(rs,offset,ra);
4202         }else{
4203           emit_addimm(ra,offset,ra);
4204         }
4205       }
4206     }
4207   }
4208   // Preload constants for next instruction
4209   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
4210     int agr,ra;
4211     #ifndef HOST_IMM_ADDR32
4212     // Mapper entry
4213     agr=MGEN1+((i+1)&1);
4214     ra=get_reg(i_regs->regmap,agr);
4215     if(ra>=0) {
4216       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4217       int offset=imm[i+1];
4218       int c=(regs[i+1].wasconst>>rs)&1;
4219       if(c) {
4220         if(itype[i+1]==STORE||itype[i+1]==STORELR
4221            ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1, SWC2/SDC2
4222           // Stores to memory go thru the mapper to detect self-modifying
4223           // code, loads don't.
4224           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
4225              (unsigned int)(constmap[i+1][rs]+offset)<0x80000000+RAM_SIZE )
4226             generate_map_const(constmap[i+1][rs]+offset,ra);
4227         }else{
4228           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
4229             generate_map_const(constmap[i+1][rs]+offset,ra);
4230         }
4231       }
4232       /*else if(rs1[i]==0) {
4233         generate_map_const(offset,ra);
4234       }*/
4235     }
4236     #endif
4237     // Actual address
4238     agr=AGEN1+((i+1)&1);
4239     ra=get_reg(i_regs->regmap,agr);
4240     if(ra>=0) {
4241       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4242       int offset=imm[i+1];
4243       int c=(regs[i+1].wasconst>>rs)&1;
4244       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
4245         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4246           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4247         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4248           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4249         }else{
4250           #ifdef HOST_IMM_ADDR32
4251           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4252              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
4253           #endif
4254           emit_movimm(constmap[i+1][rs]+offset,ra);
4255         }
4256       }
4257       else if(rs1[i+1]==0) {
4258         // Using r0 as a base address
4259         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4260           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4261         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4262           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4263         }else{
4264           emit_movimm(offset,ra);
4265         }
4266       }
4267     }
4268   }
4269 }
4270
4271 int get_final_value(int hr, int i, int *value)
4272 {
4273   int reg=regs[i].regmap[hr];
4274   while(i<slen-1) {
4275     if(regs[i+1].regmap[hr]!=reg) break;
4276     if(!((regs[i+1].isconst>>hr)&1)) break;
4277     if(bt[i+1]) break;
4278     i++;
4279   }
4280   if(i<slen-1) {
4281     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
4282       *value=constmap[i][hr];
4283       return 1;
4284     }
4285     if(!bt[i+1]) {
4286       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
4287         // Load in delay slot, out-of-order execution
4288         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
4289         {
4290           #ifdef HOST_IMM_ADDR32
4291           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
4292           #endif
4293           // Precompute load address
4294           *value=constmap[i][hr]+imm[i+2];
4295           return 1;
4296         }
4297       }
4298       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
4299       {
4300         #ifdef HOST_IMM_ADDR32
4301         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
4302         #endif
4303         // Precompute load address
4304         *value=constmap[i][hr]+imm[i+1];
4305         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
4306         return 1;
4307       }
4308     }
4309   }
4310   *value=constmap[i][hr];
4311   //printf("c=%x\n",(int)constmap[i][hr]);
4312   if(i==slen-1) return 1;
4313   if(reg<64) {
4314     return !((unneeded_reg[i+1]>>reg)&1);
4315   }else{
4316     return !((unneeded_reg_upper[i+1]>>reg)&1);
4317   }
4318 }
4319
4320 // Load registers with known constants
4321 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
4322 {
4323   int hr;
4324   // Load 32-bit regs
4325   for(hr=0;hr<HOST_REGS;hr++) {
4326     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4327       //if(entry[hr]!=regmap[hr]) {
4328       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4329         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4330           int value;
4331           if(get_final_value(hr,i,&value)) {
4332             if(value==0) {
4333               emit_zeroreg(hr);
4334             }
4335             else {
4336               emit_movimm(value,hr);
4337             }
4338           }
4339         }
4340       }
4341     }
4342   }
4343   // Load 64-bit regs
4344   for(hr=0;hr<HOST_REGS;hr++) {
4345     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4346       //if(entry[hr]!=regmap[hr]) {
4347       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4348         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4349           if((is32>>(regmap[hr]&63))&1) {
4350             int lr=get_reg(regmap,regmap[hr]-64);
4351             assert(lr>=0);
4352             emit_sarimm(lr,31,hr);
4353           }
4354           else
4355           {
4356             int value;
4357             if(get_final_value(hr,i,&value)) {
4358               if(value==0) {
4359                 emit_zeroreg(hr);
4360               }
4361               else {
4362                 emit_movimm(value,hr);
4363               }
4364             }
4365           }
4366         }
4367       }
4368     }
4369   }
4370 }
4371 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4372 {
4373   int hr;
4374   // Load 32-bit regs
4375   for(hr=0;hr<HOST_REGS;hr++) {
4376     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4377       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4378         int value=constmap[i][hr];
4379         if(value==0) {
4380           emit_zeroreg(hr);
4381         }
4382         else {
4383           emit_movimm(value,hr);
4384         }
4385       }
4386     }
4387   }
4388   // Load 64-bit regs
4389   for(hr=0;hr<HOST_REGS;hr++) {
4390     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4391       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4392         if((is32>>(regmap[hr]&63))&1) {
4393           int lr=get_reg(regmap,regmap[hr]-64);
4394           assert(lr>=0);
4395           emit_sarimm(lr,31,hr);
4396         }
4397         else
4398         {
4399           int value=constmap[i][hr];
4400           if(value==0) {
4401             emit_zeroreg(hr);
4402           }
4403           else {
4404             emit_movimm(value,hr);
4405           }
4406         }
4407       }
4408     }
4409   }
4410 }
4411
4412 // Write out all dirty registers (except cycle count)
4413 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4414 {
4415   int hr;
4416   for(hr=0;hr<HOST_REGS;hr++) {
4417     if(hr!=EXCLUDE_REG) {
4418       if(i_regmap[hr]>0) {
4419         if(i_regmap[hr]!=CCREG) {
4420           if((i_dirty>>hr)&1) {
4421             if(i_regmap[hr]<64) {
4422               emit_storereg(i_regmap[hr],hr);
4423 #ifndef FORCE32
4424               if( ((i_is32>>i_regmap[hr])&1) ) {
4425                 #ifdef DESTRUCTIVE_WRITEBACK
4426                 emit_sarimm(hr,31,hr);
4427                 emit_storereg(i_regmap[hr]|64,hr);
4428                 #else
4429                 emit_sarimm(hr,31,HOST_TEMPREG);
4430                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4431                 #endif
4432               }
4433 #endif
4434             }else{
4435               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4436                 emit_storereg(i_regmap[hr],hr);
4437               }
4438             }
4439           }
4440         }
4441       }
4442     }
4443   }
4444 }
4445 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4446 // This writes the registers not written by store_regs_bt
4447 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4448 {
4449   int hr;
4450   int t=(addr-start)>>2;
4451   for(hr=0;hr<HOST_REGS;hr++) {
4452     if(hr!=EXCLUDE_REG) {
4453       if(i_regmap[hr]>0) {
4454         if(i_regmap[hr]!=CCREG) {
4455           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4456             if((i_dirty>>hr)&1) {
4457               if(i_regmap[hr]<64) {
4458                 emit_storereg(i_regmap[hr],hr);
4459 #ifndef FORCE32
4460                 if( ((i_is32>>i_regmap[hr])&1) ) {
4461                   #ifdef DESTRUCTIVE_WRITEBACK
4462                   emit_sarimm(hr,31,hr);
4463                   emit_storereg(i_regmap[hr]|64,hr);
4464                   #else
4465                   emit_sarimm(hr,31,HOST_TEMPREG);
4466                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4467                   #endif
4468                 }
4469 #endif
4470               }else{
4471                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4472                   emit_storereg(i_regmap[hr],hr);
4473                 }
4474               }
4475             }
4476           }
4477         }
4478       }
4479     }
4480   }
4481 }
4482
4483 // Load all registers (except cycle count)
4484 void load_all_regs(signed char i_regmap[])
4485 {
4486   int hr;
4487   for(hr=0;hr<HOST_REGS;hr++) {
4488     if(hr!=EXCLUDE_REG) {
4489       if(i_regmap[hr]==0) {
4490         emit_zeroreg(hr);
4491       }
4492       else
4493       if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4494       {
4495         emit_loadreg(i_regmap[hr],hr);
4496       }
4497     }
4498   }
4499 }
4500
4501 // Load all current registers also needed by next instruction
4502 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4503 {
4504   int hr;
4505   for(hr=0;hr<HOST_REGS;hr++) {
4506     if(hr!=EXCLUDE_REG) {
4507       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4508         if(i_regmap[hr]==0) {
4509           emit_zeroreg(hr);
4510         }
4511         else
4512         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4513         {
4514           emit_loadreg(i_regmap[hr],hr);
4515         }
4516       }
4517     }
4518   }
4519 }
4520
4521 // Load all regs, storing cycle count if necessary
4522 void load_regs_entry(int t)
4523 {
4524   int hr;
4525   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG);
4526   else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG);
4527   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4528     emit_storereg(CCREG,HOST_CCREG);
4529   }
4530   // Load 32-bit regs
4531   for(hr=0;hr<HOST_REGS;hr++) {
4532     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4533       if(regs[t].regmap_entry[hr]==0) {
4534         emit_zeroreg(hr);
4535       }
4536       else if(regs[t].regmap_entry[hr]!=CCREG)
4537       {
4538         emit_loadreg(regs[t].regmap_entry[hr],hr);
4539       }
4540     }
4541   }
4542   // Load 64-bit regs
4543   for(hr=0;hr<HOST_REGS;hr++) {
4544     if(regs[t].regmap_entry[hr]>=64) {
4545       assert(regs[t].regmap_entry[hr]!=64);
4546       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4547         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4548         if(lr<0) {
4549           emit_loadreg(regs[t].regmap_entry[hr],hr);
4550         }
4551         else
4552         {
4553           emit_sarimm(lr,31,hr);
4554         }
4555       }
4556       else
4557       {
4558         emit_loadreg(regs[t].regmap_entry[hr],hr);
4559       }
4560     }
4561   }
4562 }
4563
4564 // Store dirty registers prior to branch
4565 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4566 {
4567   if(internal_branch(i_is32,addr))
4568   {
4569     int t=(addr-start)>>2;
4570     int hr;
4571     for(hr=0;hr<HOST_REGS;hr++) {
4572       if(hr!=EXCLUDE_REG) {
4573         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4574           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4575             if((i_dirty>>hr)&1) {
4576               if(i_regmap[hr]<64) {
4577                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4578                   emit_storereg(i_regmap[hr],hr);
4579                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4580                     #ifdef DESTRUCTIVE_WRITEBACK
4581                     emit_sarimm(hr,31,hr);
4582                     emit_storereg(i_regmap[hr]|64,hr);
4583                     #else
4584                     emit_sarimm(hr,31,HOST_TEMPREG);
4585                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4586                     #endif
4587                   }
4588                 }
4589               }else{
4590                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4591                   emit_storereg(i_regmap[hr],hr);
4592                 }
4593               }
4594             }
4595           }
4596         }
4597       }
4598     }
4599   }
4600   else
4601   {
4602     // Branch out of this block, write out all dirty regs
4603     wb_dirtys(i_regmap,i_is32,i_dirty);
4604   }
4605 }
4606
4607 // Load all needed registers for branch target
4608 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4609 {
4610   //if(addr>=start && addr<(start+slen*4))
4611   if(internal_branch(i_is32,addr))
4612   {
4613     int t=(addr-start)>>2;
4614     int hr;
4615     // Store the cycle count before loading something else
4616     if(i_regmap[HOST_CCREG]!=CCREG) {
4617       assert(i_regmap[HOST_CCREG]==-1);
4618     }
4619     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4620       emit_storereg(CCREG,HOST_CCREG);
4621     }
4622     // Load 32-bit regs
4623     for(hr=0;hr<HOST_REGS;hr++) {
4624       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4625         #ifdef DESTRUCTIVE_WRITEBACK
4626         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4627         #else
4628         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4629         #endif
4630           if(regs[t].regmap_entry[hr]==0) {
4631             emit_zeroreg(hr);
4632           }
4633           else if(regs[t].regmap_entry[hr]!=CCREG)
4634           {
4635             emit_loadreg(regs[t].regmap_entry[hr],hr);
4636           }
4637         }
4638       }
4639     }
4640     //Load 64-bit regs
4641     for(hr=0;hr<HOST_REGS;hr++) {
4642       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64) {
4643         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4644           assert(regs[t].regmap_entry[hr]!=64);
4645           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4646             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4647             if(lr<0) {
4648               emit_loadreg(regs[t].regmap_entry[hr],hr);
4649             }
4650             else
4651             {
4652               emit_sarimm(lr,31,hr);
4653             }
4654           }
4655           else
4656           {
4657             emit_loadreg(regs[t].regmap_entry[hr],hr);
4658           }
4659         }
4660         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4661           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4662           assert(lr>=0);
4663           emit_sarimm(lr,31,hr);
4664         }
4665       }
4666     }
4667   }
4668 }
4669
4670 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4671 {
4672   if(addr>=start && addr<start+slen*4-4)
4673   {
4674     int t=(addr-start)>>2;
4675     int hr;
4676     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4677     for(hr=0;hr<HOST_REGS;hr++)
4678     {
4679       if(hr!=EXCLUDE_REG)
4680       {
4681         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4682         {
4683           if(regs[t].regmap_entry[hr]!=-1)
4684           {
4685             return 0;
4686           }
4687           else 
4688           if((i_dirty>>hr)&1)
4689           {
4690             if(i_regmap[hr]<64)
4691             {
4692               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4693                 return 0;
4694             }
4695             else
4696             {
4697               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4698                 return 0;
4699             }
4700           }
4701         }
4702         else // Same register but is it 32-bit or dirty?
4703         if(i_regmap[hr]>=0)
4704         {
4705           if(!((regs[t].dirty>>hr)&1))
4706           {
4707             if((i_dirty>>hr)&1)
4708             {
4709               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4710               {
4711                 //printf("%x: dirty no match\n",addr);
4712                 return 0;
4713               }
4714             }
4715           }
4716           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4717           {
4718             //printf("%x: is32 no match\n",addr);
4719             return 0;
4720           }
4721         }
4722       }
4723     }
4724     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4725 #ifndef FORCE32
4726     if(requires_32bit[t]&~i_is32) return 0;
4727 #endif
4728     // Delay slots are not valid branch targets
4729     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4730     // Delay slots require additional processing, so do not match
4731     if(is_ds[t]) return 0;
4732   }
4733   else
4734   {
4735     int hr;
4736     for(hr=0;hr<HOST_REGS;hr++)
4737     {
4738       if(hr!=EXCLUDE_REG)
4739       {
4740         if(i_regmap[hr]>=0)
4741         {
4742           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4743           {
4744             if((i_dirty>>hr)&1)
4745             {
4746               return 0;
4747             }
4748           }
4749         }
4750       }
4751     }
4752   }
4753   return 1;
4754 }
4755
4756 // Used when a branch jumps into the delay slot of another branch
     // i is the index of the branching instruction.  Its target ba[i] is the
     // delay-slot instruction (index t), which is assembled here on its own;
     // execution then continues at ba[i]+4, which is asserted to be an
     // internal branch target.
4757 void ds_assemble_entry(int i)
4758 {
4759   int t=(ba[i]-start)>>2;
       // Record the current output position for t if none was recorded yet
4760   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4761   assem_debug("Assemble delay slot at %x\n",ba[i]);
4762   assem_debug("<->\n");
       // CCREG sits in HOST_CCREG on entry but not in the instruction's own
       // mapping: hand it to wb_register first (presumably a write-back)
4763   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4764     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4765   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4766   address_generation(t,&regs[t],regs[t].regmap_entry);
       // Stores, plus opcodes 0x39/0x3d and 0x3a/0x3e (the 0x3b mask ignores
       // bit 2), additionally request INVCP
4767   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4768     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4769   cop1_usable=0;
4770   is_delayslot=0;
       // Dispatch on the instruction type to the matching assembler
4771   switch(itype[t]) {
4772     case ALU:
4773       alu_assemble(t,&regs[t]);break;
4774     case IMM16:
4775       imm16_assemble(t,&regs[t]);break;
4776     case SHIFT:
4777       shift_assemble(t,&regs[t]);break;
4778     case SHIFTIMM:
4779       shiftimm_assemble(t,&regs[t]);break;
4780     case LOAD:
4781       load_assemble(t,&regs[t]);break;
4782     case LOADLR:
4783       loadlr_assemble(t,&regs[t]);break;
4784     case STORE:
4785       store_assemble(t,&regs[t]);break;
4786     case STORELR:
4787       storelr_assemble(t,&regs[t]);break;
4788     case COP0:
4789       cop0_assemble(t,&regs[t]);break;
4790     case COP1:
4791       cop1_assemble(t,&regs[t]);break;
4792     case C1LS:
4793       c1ls_assemble(t,&regs[t]);break;
4794     case COP2:
4795       cop2_assemble(t,&regs[t]);break;
4796     case C2LS:
4797       c2ls_assemble(t,&regs[t]);break;
4798     case C2OP:
4799       c2op_assemble(t,&regs[t]);break;
4800     case FCONV:
4801       fconv_assemble(t,&regs[t]);break;
4802     case FLOAT:
4803       float_assemble(t,&regs[t]);break;
4804     case FCOMP:
4805       fcomp_assemble(t,&regs[t]);break;
4806     case MULTDIV:
4807       multdiv_assemble(t,&regs[t]);break;
4808     case MOV:
4809       mov_assemble(t,&regs[t]);break;
         // The remaining types only print the diagnostic below; nothing is
         // assembled for them
4810     case SYSCALL:
4811     case HLECALL:
4812     case INTCALL:
4813     case SPAN:
4814     case UJUMP:
4815     case RJUMP:
4816     case CJUMP:
4817     case SJUMP:
4818     case FJUMP:
4819       printf("Jump in the delay slot.  This is probably a bug.\n");
4820   }
       // Reconcile the register state with the continuation address ba[i]+4
       // (store_regs_bt/load_regs_bt are defined elsewhere in the file)
4821   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4822   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4823   if(internal_branch(regs[t].is32,ba[i]+4))
4824     assem_debug("branch: internal\n");
4825   else
4826     assem_debug("branch: external\n");
       // A continuation that is not an internal branch is not handled here
4827   assert(internal_branch(regs[t].is32,ba[i]+4));
       // Queue the jump emitted next for linking to ba[i]+4; the 0 passed to
       // emit_jmp is presumably a placeholder that is patched later
4828   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4829   emit_jmp(0);
4830 }
4831
4832 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4833 {
       // Emits the cycle-count test for the branch at index i and queues a
       // CC_STUB that is entered through the jump emitted here.
       //   i_regmap    not referenced in this function
       //   adj         out: 0 for a non-internal target, -1 when the target
       //               is a delay slot, otherwise ccadj[] of the target
       //   addr,taken  forwarded unchanged to add_stub
       //   invert      nonzero selects the add-and-test form below
4834   int count;
4835   int jaddr;
4836   int idle=0;
4837   if(itype[i]==RJUMP)
4838   {
         // NOTE: the if/else that follows assigns *adj again in every case
4839     *adj=0;
4840   }
4841   //if(ba[i]>=start && ba[i]<(start+slen*4))
4842   if(internal_branch(branch_regs[i].is32,ba[i]))
4843   {
4844     int t=(ba[i]-start)>>2;
4845     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4846     else *adj=ccadj[t];
4847   }
4848   else
4849   {
4850     *adj=0;
4851   }
4852   count=ccadj[i];
       // Taken branch whose target is itself and whose delay-slot word is 0
4853   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4854     // Idle loop
4855     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
         // The stub returns to this point (passed as idle to add_stub below)
4856     idle=(int)out;
4857     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4858     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4859     jaddr=(int)out;
4860     emit_jmp(0);
4861   }
4862   else if(*adj==0||invert) {
         // Add this branch's cycles to the counter and test the sign
4863     emit_addimm_and_set_flags(CLOCK_DIVIDER*(count+2),HOST_CCREG);
4864     jaddr=(int)out;
4865     emit_jns(0);
4866   }
4867   else
4868   {
         // Compare only: the counter is left unchanged here, so the stub is
         // given count+2 as its extra cycle count (see add_stub below).
         // Scaled by CLOCK_DIVIDER, matching the add above and the stub's
         // own CLOCK_DIVIDER*extra adjustment, instead of a hard-coded 2.
4869     emit_cmpimm(HOST_CCREG,-CLOCK_DIVIDER*(count+2));
4870     jaddr=(int)out;
4871     emit_jns(0);
4872   }
       // jaddr is the jump into the stub; the stub returns to idle if set,
       // otherwise to the current output position
4873   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4874 }
4875
4876 void do_ccstub(int n)
4877 {
       // Emits the out-of-line code for stub n: hands dirty registers to the
       // wb_* helpers, stores the PC to resume at into pcaddr, calls
       // cc_interrupt, reloads registers and jumps back to stubs[n][2].
       // Stub fields as used below:
       //   [1] jump instruction that is pointed at this stub
       //   [2] address the stub jumps back to
       //   [3] extra cycle count, scaled by CLOCK_DIVIDER around the call
       //   [4] index of the branch instruction
       //   [5] PC to save, or -1 if it depends on the branch outcome
       //   [6] TAKEN / NOTTAKEN / NULLDS (anything else reloads everything)
4878   literal_pool(256);
4879   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4880   set_jump_target(stubs[n][1],(int)out);
4881   int i=stubs[n][4];
4882   if(stubs[n][6]==NULLDS) {
4883     // Delay slot instruction is nullified ("likely" branch)
4884     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4885   }
4886   else if(stubs[n][6]!=TAKEN) {
4887     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4888   }
4889   else {
         // Taken branch: only internal targets get a (selective) write-back
4890     if(internal_branch(branch_regs[i].is32,ba[i]))
4891       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4892   }
4893   if(stubs[n][5]!=-1)
4894   {
4895     // Save PC as return address
4896     emit_movimm(stubs[n][5],EAX);
4897     emit_writeword(EAX,(int)&pcaddr);
4898   }
4899   else
4900   {
4901     // Return address depends on which way the branch goes
4902     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4903     {
           // Host registers holding the low (l) and upper (h, |64) halves of
           // the two source operands; get_reg results <0 are treated as
           // "not available" below
4904       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4905       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4906       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4907       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
           // If one operand is register 0, compare the other one against
           // zero: it becomes s1 and s2 is dropped
4908       if(rs1[i]==0)
4909       {
4910         s1l=s2l;s1h=s2h;
4911         s2l=s2h=-1;
4912       }
4913       else if(rs2[i]==0)
4914       {
4915         s2l=s2h=-1;
4916       }
           // Both operands flagged in is32: the upper halves are not compared
4917       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4918         s1h=s2h=-1;
4919       }
4920       assert(s1l>=0);
4921       #ifdef DESTRUCTIVE_WRITEBACK
4922       if(rs1[i]) {
4923         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4924           emit_loadreg(rs1[i],s1l);
4925       } 
4926       else {
4927         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4928           emit_loadreg(rs2[i],s1l);
4929       }
4930       if(s2l>=0)
4931         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4932           emit_loadreg(rs2[i],s2l);
4933       #endif
           // Pick scratch host registers: not EXCLUDE_REG, not HOST_CCREG and
           // not mapped to either source operand.  First match -> addr,
           // next match -> alt.
           // NOTE(review): unlike ntaddr below, nothing asserts that addr and
           // alt were actually found — confirm enough registers always exist
4934       int hr=0;
4935       int addr,alt,ntaddr;
4936       while(hr<HOST_REGS)
4937       {
4938         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4939            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4940            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4941         {
4942           addr=hr++;break;
4943         }
4944         hr++;
4945       }
4946       while(hr<HOST_REGS)
4947       {
4948         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4949            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4950            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4951         {
4952           alt=hr++;break;
4953         }
4954         hr++;
4955       }
           // The 0x2E mask ignores bits 0 and 4, so this matches opcodes
           // 6, 7, 0x16 and 0x17
4956       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4957       {
4958         while(hr<HOST_REGS)
4959         {
4960           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4961              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4962              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4963           {
4964             ntaddr=hr;break;
4965           }
4966           hr++;
4967         }
4968         assert(hr<HOST_REGS);
4969       }
           // Each case below leaves either ba[i] (branch target) or
           // start+i*4+8 (address after the delay slot) in addr, selected by
           // the emitted compare.  The 0x2f mask ignores bit 4, so opcodes
           // 0x14..0x17 take the same paths as 4..7.
4970       if((opcode[i]&0x2f)==4) // BEQ
4971       {
4972         #ifdef HAVE_CMOV_IMM
4973         if(s1h<0) {
4974           if(s2l>=0) emit_cmp(s1l,s2l);
4975           else emit_test(s1l,s1l);
4976           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4977         }
4978         else
4979         #endif
4980         {
4981           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4982           if(s1h>=0) {
4983             if(s2h>=0) emit_cmp(s1h,s2h);
4984             else emit_test(s1h,s1h);
4985             emit_cmovne_reg(alt,addr);
4986           }
4987           if(s2l>=0) emit_cmp(s1l,s2l);
4988           else emit_test(s1l,s1l);
4989           emit_cmovne_reg(alt,addr);
4990         }
4991       }
4992       if((opcode[i]&0x2f)==5) // BNE
4993       {
4994         #ifdef HAVE_CMOV_IMM
4995         if(s1h<0) {
4996           if(s2l>=0) emit_cmp(s1l,s2l);
4997           else emit_test(s1l,s1l);
4998           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4999         }
5000         else
5001         #endif
5002         {
5003           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5004           if(s1h>=0) {
5005             if(s2h>=0) emit_cmp(s1h,s2h);
5006             else emit_test(s1h,s1h);
5007             emit_cmovne_reg(alt,addr);
5008           }
5009           if(s2l>=0) emit_cmp(s1l,s2l);
5010           else emit_test(s1l,s1l);
5011           emit_cmovne_reg(alt,addr);
5012         }
5013       }
5014       if((opcode[i]&0x2f)==6) // BLEZ
5015       {
5016         //emit_movimm(ba[i],alt);
5017         //emit_movimm(start+i*4+8,addr);
5018         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5019         emit_cmpimm(s1l,1);
5020         if(s1h>=0) emit_mov(addr,ntaddr);
5021         emit_cmovl_reg(alt,addr);
5022         if(s1h>=0) {
5023           emit_test(s1h,s1h);
5024           emit_cmovne_reg(ntaddr,addr);
5025           emit_cmovs_reg(alt,addr);
5026         }
5027       }
5028       if((opcode[i]&0x2f)==7) // BGTZ
5029       {
5030         //emit_movimm(ba[i],addr);
5031         //emit_movimm(start+i*4+8,ntaddr);
5032         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5033         emit_cmpimm(s1l,1);
5034         if(s1h>=0) emit_mov(addr,alt);
5035         emit_cmovl_reg(ntaddr,addr);
5036         if(s1h>=0) {
5037           emit_test(s1h,s1h);
5038           emit_cmovne_reg(alt,addr);
5039           emit_cmovs_reg(ntaddr,addr);
5040         }
5041       }
5042       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
5043       {
5044         //emit_movimm(ba[i],alt);
5045         //emit_movimm(start+i*4+8,addr);
5046         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5047         if(s1h>=0) emit_test(s1h,s1h);
5048         else emit_test(s1l,s1l);
5049         emit_cmovs_reg(alt,addr);
5050       }
5051       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
5052       {
5053         //emit_movimm(ba[i],addr);
5054         //emit_movimm(start+i*4+8,alt);
5055         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5056         if(s1h>=0) emit_test(s1h,s1h);
5057         else emit_test(s1l,s1l);
5058         emit_cmovs_reg(alt,addr);
5059       }
5060       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
             // Bit 16 of the instruction word selects BC1T over BC1F; both
             // test bit 0x800000 of the value held in s1l
5061         if(source[i]&0x10000) // BC1T
5062         {
5063           //emit_movimm(ba[i],alt);
5064           //emit_movimm(start+i*4+8,addr);
5065           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5066           emit_testimm(s1l,0x800000);
5067           emit_cmovne_reg(alt,addr);
5068         }
5069         else // BC1F
5070         {
5071           //emit_movimm(ba[i],addr);
5072           //emit_movimm(start+i*4+8,alt);
5073           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5074           emit_testimm(s1l,0x800000);
5075           emit_cmovne_reg(alt,addr);
5076         }
5077       }
           // Store the selected address as the PC to resume at
5078       emit_writeword(addr,(int)&pcaddr);
5079     }
5080     else
5081     if(itype[i]==RJUMP)
5082     {
           // Register jump: the target is the register value itself; when the
           // delay slot writes that register, the RTEMP mapping is used instead
5083       int r=get_reg(branch_regs[i].regmap,rs1[i]);
5084       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5085         r=get_reg(branch_regs[i].regmap,RTEMP);
5086       }
5087       emit_writeword(r,(int)&pcaddr);
5088     }
5089     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
5090   }
5091   // Update cycle count
5092   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
       // The extra cycles are added only for the duration of the call and
       // removed again afterwards
5093   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
5094   emit_call((int)cc_interrupt);
5095   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
       // Reload registers according to where execution resumes
5096   if(stubs[n][6]==TAKEN) {
5097     if(internal_branch(branch_regs[i].is32,ba[i]))
5098       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
5099     else if(itype[i]==RJUMP) {
           // Restore the jump-target register: from pcaddr if RTEMP is
           // mapped, otherwise by reloading rs1
5100       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
5101         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
5102       else
5103         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
5104     }
5105   }else if(stubs[n][6]==NOTTAKEN) {
5106     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
5107     else load_all_regs(branch_regs[i].regmap);
5108   }else if(stubs[n][6]==NULLDS) {
5109     // Delay slot instruction is nullified ("likely" branch)
5110     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
5111     else load_all_regs(regs[i].regmap);
5112   }else{
5113     load_all_regs(branch_regs[i].regmap);
5114   }
5115   emit_jmp(stubs[n][2]); // return address
5116   
5117   /* This works but uses a lot of memory...
5118   emit_readword((int)&last_count,ECX);
5119   emit_add(HOST_CCREG,ECX,EAX);
5120   emit_writeword(EAX,(int)&Count);
5121   emit_call((int)gen_interupt);
5122   emit_readword((int)&Count,HOST_CCREG);
5123   emit_readword((int)&next_interupt,EAX);
5124   emit_readword((int)&pending_exception,EBX);
5125   emit_writeword(EAX,(int)&last_count);
5126   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
5127   emit_test(EBX,EBX);
5128   int jne_instr=(int)out;
5129   emit_jne(0);
5130   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
5131   load_all_regs(branch_regs[i].regmap);
5132   emit_jmp(stubs[n][2]); // return address
5133   set_jump_target(jne_instr,(int)out);
5134   emit_readword((int)&pcaddr,EAX);
5135   // Call get_addr_ht instead of doing the hash table here.
5136   // This code is executed infrequently and takes up a lot of space
5137   // so smaller is better.
5138   emit_storereg(CCREG,HOST_CCREG);
5139   emit_pushreg(EAX);
5140   emit_call((int)get_addr_ht);
5141   emit_loadreg(CCREG,HOST_CCREG);
5142   emit_addimm(ESP,4,ESP);
5143   emit_jmpreg(EAX);*/
5144 }
5145
5146 int add_to_linker(int addr,int target,int ext)
5147 {
       // Appends one (addr,target,ext) entry to link_addr and advances
       // linkcount.  Callers in this file pass the output position of a jump
       // as addr, its destination address as target, and an internal/external
       // flag as ext.
       // The return type is spelled out because implicit int is not valid
       // since C99.  It stays int rather than void so that call sites earlier
       // in the file, which may rely on the implicit declaration, remain
       // compatible; the value carries no meaning and is always 0.
5148   link_addr[linkcount][0]=addr;
5149   link_addr[linkcount][1]=target;
5150   link_addr[linkcount][2]=ext;  
5151   linkcount++;
       return 0;
5152 }
5153
5154 void ujump_assemble(int i,struct regstat *i_regs)
5155 {
       // Assembles the unconditional jump at index i together with its delay
       // slot (i+1): sets the link register when rt1[i]==31, assembles the
       // delay slot, emits the cycle-count check through do_cc and finally
       // the jump to ba[i] (or the delay-slot entry code if ba[i] is a delay
       // slot inside this block).
5156   signed char *i_regmap=i_regs->regmap;
5157   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5158   address_generation(i+1,i_regs,regs[i].regmap_entry);
5159   #ifdef REG_PREFETCH
5160   int temp=get_reg(branch_regs[i].regmap,PTEMP);
5161   if(rt1[i]==31&&temp>=0) 
5162   {
5163     int return_address=start+i*4+8;
         // NOTE(review): this tests >0 while the other get_reg results in
         // this function are tested with >=0 — confirm that host register 0
         // is meant to be excluded here
5164     if(get_reg(branch_regs[i].regmap,31)>0) 
5165     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5166   }
5167   #endif
5168   if(rt1[i]==31) {
         // The jump writes register 31: load it with the address of the
         // instruction after the delay slot
5169     int rt;
5170     unsigned int return_address;
5171     rt=get_reg(branch_regs[i].regmap,31);
5172     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5173     //assert(rt>=0);
5174     return_address=start+i*4+8;
         // Nothing is emitted when register 31 has no host register (rt<0)
5175     if(rt>=0) {
5176       #ifdef USE_MINI_HT
           // Mini hash table path: only for an internal return address and
           // a delay slot that does not itself write register 31
5177       if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
5178         int temp=-1; // note: must be ds-safe
5179         #ifdef HOST_TEMPREG
5180         temp=HOST_TEMPREG;
5181         #endif
5182         if(temp>=0) do_miniht_insert(return_address,rt,temp);
5183         else emit_movimm(return_address,rt);
5184       }
5185       else
5186       #endif
5187       {
5188         #ifdef REG_PREFETCH
5189         if(temp>=0) 
5190         {
5191           if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5192         }
5193         #endif
5194         emit_movimm(return_address,rt); // PC into link register
5195         #ifdef IMM_PREFETCH
5196         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5197         #endif
5198       }
5199     }
5200   }
       // Assemble the delay-slot instruction
5201   ds_assemble(i+1,i_regs);
       // Unneeded masks for wb_invalidate: start from the branch's own masks
       // and additionally set bit 0 and the bit for rt1[i]
5202   uint64_t bc_unneeded=branch_regs[i].u;
5203   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5204   bc_unneeded|=1|(1LL<<rt1[i]);
5205   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5206   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5207                 bc_unneeded,bc_unneeded_upper);
5208   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5209   int cc,adj;
5210   cc=get_reg(branch_regs[i].regmap,CCREG);
       // The cycle counter must be in its fixed host register at the branch
5211   assert(cc==HOST_CCREG);
5212   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5213   #ifdef REG_PREFETCH
5214   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5215   #endif
5216   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
       // adj nonzero: add this branch's cycles, less adj, to the counter here
5217   if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5218   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5219   if(internal_branch(branch_regs[i].is32,ba[i]))
5220     assem_debug("branch: internal\n");
5221   else
5222     assem_debug("branch: external\n");
       // Target is a delay slot inside this block: assemble a private copy
       // of it instead of jumping there directly
5223   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
5224     ds_assemble_entry(i);
5225   }
5226   else {
         // Queue the jump emitted next for linking to ba[i]
5227     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
5228     emit_jmp(0);
5229   }
5230 }
5231
5232 void rjump_assemble(int i,struct regstat *i_regs)
5233 {
       // Assembles the register-indirect jump at index i together with its
       // delay slot (i+1): copies the target register to RTEMP when the delay
       // slot overwrites it, assembles the delay slot, sets the link register
       // rt1[i] if it is nonzero, adds the cycle count with a CC_STUB hooked
       // on the sign test, then jumps through jump_vaddr_reg[rs] (or through
       // the mini hash table when rs1[i]==31 under USE_MINI_HT).
5234   signed char *i_regmap=i_regs->regmap;
5235   int temp;
5236   int rs,cc,adj;
5237   rs=get_reg(branch_regs[i].regmap,rs1[i]);
5238   assert(rs>=0);
5239   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5240     // Delay slot abuse, make a copy of the branch address register
5241     temp=get_reg(branch_regs[i].regmap,RTEMP);
5242     assert(temp>=0);
5243     assert(regs[i].regmap[temp]==RTEMP);
5244     emit_mov(rs,temp);
5245     rs=temp;
5246   }
5247   address_generation(i+1,i_regs,regs[i].regmap_entry);
5248   #ifdef REG_PREFETCH
5249   if(rt1[i]==31) 
5250   {
5251     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
5252       int return_address=start+i*4+8;
5253       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5254     }
5255   }
5256   #endif
5257   #ifdef USE_MINI_HT
5258   if(rs1[i]==31) {
5259     int rh=get_reg(regs[i].regmap,RHASH);
5260     if(rh>=0) do_preload_rhash(rh);
5261   }
5262   #endif
       // Assemble the delay-slot instruction
5263   ds_assemble(i+1,i_regs);
       // Unneeded masks for wb_invalidate: bit 0 and the bit for rt1[i] are
       // set; the bit for the jump-target register rs1[i] is cleared in the
       // lower mask
5264   uint64_t bc_unneeded=branch_regs[i].u;
5265   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5266   bc_unneeded|=1|(1LL<<rt1[i]);
5267   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5268   bc_unneeded&=~(1LL<<rs1[i]);
5269   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5270                 bc_unneeded,bc_unneeded_upper);
5271   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
5272   if(rt1[i]!=0) {
         // The jump writes a link register: load it with the address of the
         // instruction after the delay slot.  The delay slot must not write
         // the same register.
5273     int rt,return_address;
5274     assert(rt1[i+1]!=rt1[i]);
5275     assert(rt2[i+1]!=rt1[i]);
5276     rt=get_reg(branch_regs[i].regmap,rt1[i]);
5277     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5278     assert(rt>=0);
5279     return_address=start+i*4+8;
5280     #ifdef REG_PREFETCH
         // NOTE(review): temp is only assigned in the delay-slot-abuse branch
         // above or when rt1[i]==31.  For any other nonzero rt1[i] it looks
         // uninitialized here, and in the delay-slot-abuse case it holds the
         // RTEMP copy of the jump target — confirm before enabling
         // REG_PREFETCH.
5281     if(temp>=0) 
5282     {
5283       if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5284     }
5285     #endif
5286     emit_movimm(return_address,rt); // PC into link register
5287     #ifdef IMM_PREFETCH
5288     emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5289     #endif
5290   }
5291   cc=get_reg(branch_regs[i].regmap,CCREG);
       // The cycle counter must be in its fixed host register at the branch
5292   assert(cc==HOST_CCREG);
5293   #ifdef USE_MINI_HT
5294   int rh=get_reg(branch_regs[i].regmap,RHASH);
5295   int ht=get_reg(branch_regs[i].regmap,RHTBL);
5296   if(rs1[i]==31) {
5297     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
5298     do_preload_rhtbl(ht);
5299     do_rhash(rs,rh);
5300   }
5301   #endif
       // Target address is not known at assembly time, hence -1
5302   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5303   #ifdef DESTRUCTIVE_WRITEBACK
5304   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
5305     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
5306       emit_loadreg(rs1[i],rs);
5307     }
5308   }
5309   #endif
5310   #ifdef REG_PREFETCH
5311   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5312   #endif
5313   #ifdef USE_MINI_HT
5314   if(rs1[i]==31) {
5315     do_miniht_load(ht,rh);
5316   }
5317   #endif
5318   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
5319   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
5320   //assert(adj==0);
       // Add this branch's cycles and hook a CC_STUB on the sign test.  The
       // stub gets -1 as its PC (so do_ccstub stores the register value) and
       // returns to jump_vaddr_reg[rs].
5321   emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5322   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
5323   emit_jns(0);
5324   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5325   #ifdef USE_MINI_HT
5326   if(rs1[i]==31) {
5327     do_miniht_jump(rs,rh,ht);
5328   }
5329   else
5330   #endif
5331   {
5332     //if(rs!=EAX) emit_mov(rs,EAX);
5333     //emit_jmp((int)jump_vaddr_eax);
5334     emit_jmp(jump_vaddr_reg[rs]);
5335   }
5336   /* Check hash table
5337   temp=!rs;
5338   emit_mov(rs,temp);
5339   emit_shrimm(rs,16,rs);
5340   emit_xor(temp,rs,rs);
5341   emit_movzwl_reg(rs,rs);
5342   emit_shlimm(rs,4,rs);
5343   emit_cmpmem_indexed((int)hash_table,rs,temp);
5344   emit_jne((int)out+14);
5345   emit_readword_indexed((int)hash_table+4,rs,rs);
5346   emit_jmpreg(rs);
5347   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
5348   emit_addimm_no_flags(8,rs);
5349   emit_jeq((int)out-17);
5350   // No hit on hash table, call compiler
5351   emit_pushreg(temp);
5352 //DEBUG >
5353 #ifdef DEBUG_CYCLE_COUNT
5354   emit_readword((int)&last_count,ECX);
5355   emit_add(HOST_CCREG,ECX,HOST_CCREG);
5356   emit_readword((int)&next_interupt,ECX);
5357   emit_writeword(HOST_CCREG,(int)&Count);
5358   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
5359   emit_writeword(ECX,(int)&last_count);
5360 #endif
5361 //DEBUG <
5362   emit_storereg(CCREG,HOST_CCREG);
5363   emit_call((int)get_addr);
5364   emit_loadreg(CCREG,HOST_CCREG);
5365   emit_addimm(ESP,4,ESP);
5366   emit_jmpreg(EAX);*/
5367   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
       // Emits a mov 13,13 (presumably padding) when the output position is
       // not 8-byte aligned, the jump does not link to register 31, and this
       // is not the end of the block
5368   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5369   #endif
5370 }
5371
5372 void cjump_assemble(int i,struct regstat *i_regs)
5373 {
5374   signed char *i_regmap=i_regs->regmap;
5375   int cc;
5376   int match;
5377   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5378   assem_debug("match=%d\n",match);
5379   int s1h,s1l,s2h,s2l;
5380   int prev_cop1_usable=cop1_usable;
5381   int unconditional=0,nop=0;
5382   int only32=0;
5383   int invert=0;
5384   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5385   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5386   if(!match) invert=1;
5387   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5388   if(i>(ba[i]-start)>>2) invert=1;
5389   #endif
5390   
5391   if(ooo[i]) {
5392     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5393     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5394     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5395     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5396   }
5397   else {
5398     s1l=get_reg(i_regmap,rs1[i]);
5399     s1h=get_reg(i_regmap,rs1[i]|64);
5400     s2l=get_reg(i_regmap,rs2[i]);
5401     s2h=get_reg(i_regmap,rs2[i]|64);
5402   }
5403   if(rs1[i]==0&&rs2[i]==0)
5404   {
5405     if(opcode[i]&1) nop=1;
5406     else unconditional=1;
5407     //assert(opcode[i]!=5);
5408     //assert(opcode[i]!=7);
5409     //assert(opcode[i]!=0x15);
5410     //assert(opcode[i]!=0x17);
5411   }
5412   else if(rs1[i]==0)
5413   {
5414     s1l=s2l;s1h=s2h;
5415     s2l=s2h=-1;
5416     only32=(regs[i].was32>>rs2[i])&1;
5417   }
5418   else if(rs2[i]==0)
5419   {
5420     s2l=s2h=-1;
5421     only32=(regs[i].was32>>rs1[i])&1;
5422   }
5423   else {
5424     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5425   }
5426
5427   if(ooo[i]) {
5428     // Out of order execution (delay slot first)
5429     //printf("OOOE\n");
5430     address_generation(i+1,i_regs,regs[i].regmap_entry);
5431     ds_assemble(i+1,i_regs);
5432     int adj;
5433     uint64_t bc_unneeded=branch_regs[i].u;
5434     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5435     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5436     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5437     bc_unneeded|=1;
5438     bc_unneeded_upper|=1;
5439     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5440                   bc_unneeded,bc_unneeded_upper);
5441     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5442     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5443     cc=get_reg(branch_regs[i].regmap,CCREG);
5444     assert(cc==HOST_CCREG);
5445     if(unconditional) 
5446       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5447     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5448     //assem_debug("cycle count (adj)\n");
5449     if(unconditional) {
5450       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5451       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5452         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5453         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5454         if(internal)
5455           assem_debug("branch: internal\n");
5456         else
5457           assem_debug("branch: external\n");
5458         if(internal&&is_ds[(ba[i]-start)>>2]) {
5459           ds_assemble_entry(i);
5460         }
5461         else {
5462           add_to_linker((int)out,ba[i],internal);
5463           emit_jmp(0);
5464         }
5465         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5466         if(((u_int)out)&7) emit_addnop(0);
5467         #endif
5468       }
5469     }
5470     else if(nop) {
5471       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5472       int jaddr=(int)out;
5473       emit_jns(0);
5474       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5475     }
5476     else {
5477       int taken=0,nottaken=0,nottaken1=0;
5478       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5479       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5480       if(!only32)
5481       {
5482         assert(s1h>=0);
5483         if(opcode[i]==4) // BEQ
5484         {
5485           if(s2h>=0) emit_cmp(s1h,s2h);
5486           else emit_test(s1h,s1h);
5487           nottaken1=(int)out;
5488           emit_jne(1);
5489         }
5490         if(opcode[i]==5) // BNE
5491         {
5492           if(s2h>=0) emit_cmp(s1h,s2h);
5493           else emit_test(s1h,s1h);
5494           if(invert) taken=(int)out;
5495           else add_to_linker((int)out,ba[i],internal);
5496           emit_jne(0);
5497         }
5498         if(opcode[i]==6) // BLEZ
5499         {
5500           emit_test(s1h,s1h);
5501           if(invert) taken=(int)out;
5502           else add_to_linker((int)out,ba[i],internal);
5503           emit_js(0);
5504           nottaken1=(int)out;
5505           emit_jne(1);
5506         }
5507         if(opcode[i]==7) // BGTZ
5508         {
5509           emit_test(s1h,s1h);
5510           nottaken1=(int)out;
5511           emit_js(1);
5512           if(invert) taken=(int)out;
5513           else add_to_linker((int)out,ba[i],internal);
5514           emit_jne(0);
5515         }
5516       } // if(!only32)
5517           
5518       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5519       assert(s1l>=0);
5520       if(opcode[i]==4) // BEQ
5521       {
5522         if(s2l>=0) emit_cmp(s1l,s2l);
5523         else emit_test(s1l,s1l);
5524         if(invert){
5525           nottaken=(int)out;
5526           emit_jne(1);
5527         }else{
5528           add_to_linker((int)out,ba[i],internal);
5529           emit_jeq(0);
5530         }
5531       }
5532       if(opcode[i]==5) // BNE
5533       {
5534         if(s2l>=0) emit_cmp(s1l,s2l);
5535         else emit_test(s1l,s1l);
5536         if(invert){
5537           nottaken=(int)out;
5538           emit_jeq(1);
5539         }else{
5540           add_to_linker((int)out,ba[i],internal);
5541           emit_jne(0);
5542         }
5543       }
5544       if(opcode[i]==6) // BLEZ
5545       {
5546         emit_cmpimm(s1l,1);
5547         if(invert){
5548           nottaken=(int)out;
5549           emit_jge(1);
5550         }else{
5551           add_to_linker((int)out,ba[i],internal);
5552           emit_jl(0);
5553         }
5554       }
5555       if(opcode[i]==7) // BGTZ
5556       {
5557         emit_cmpimm(s1l,1);
5558         if(invert){
5559           nottaken=(int)out;
5560           emit_jl(1);
5561         }else{
5562           add_to_linker((int)out,ba[i],internal);
5563           emit_jge(0);
5564         }
5565       }
5566       if(invert) {
5567         if(taken) set_jump_target(taken,(int)out);
5568         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5569         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5570           if(adj) {
5571             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5572             add_to_linker((int)out,ba[i],internal);
5573           }else{
5574             emit_addnop(13);
5575             add_to_linker((int)out,ba[i],internal*2);
5576           }
5577           emit_jmp(0);
5578         }else
5579         #endif
5580         {
5581           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5582           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5583           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5584           if(internal)
5585             assem_debug("branch: internal\n");
5586           else
5587             assem_debug("branch: external\n");
5588           if(internal&&is_ds[(ba[i]-start)>>2]) {
5589             ds_assemble_entry(i);
5590           }
5591           else {
5592             add_to_linker((int)out,ba[i],internal);
5593             emit_jmp(0);
5594           }
5595         }
5596         set_jump_target(nottaken,(int)out);
5597       }
5598
5599       if(nottaken1) set_jump_target(nottaken1,(int)out);
5600       if(adj) {
5601         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5602       }
5603     } // (!unconditional)
5604   } // if(ooo)
5605   else
5606   {
5607     // In-order execution (branch first)
5608     //if(likely[i]) printf("IOL\n");
5609     //else
5610     //printf("IOE\n");
5611     int taken=0,nottaken=0,nottaken1=0;
5612     if(!unconditional&&!nop) {
5613       if(!only32)
5614       {
5615         assert(s1h>=0);
5616         if((opcode[i]&0x2f)==4) // BEQ
5617         {
5618           if(s2h>=0) emit_cmp(s1h,s2h);
5619           else emit_test(s1h,s1h);
5620           nottaken1=(int)out;
5621           emit_jne(2);
5622         }
5623         if((opcode[i]&0x2f)==5) // BNE
5624         {
5625           if(s2h>=0) emit_cmp(s1h,s2h);
5626           else emit_test(s1h,s1h);
5627           taken=(int)out;
5628           emit_jne(1);
5629         }
5630         if((opcode[i]&0x2f)==6) // BLEZ
5631         {
5632           emit_test(s1h,s1h);
5633           taken=(int)out;
5634           emit_js(1);
5635           nottaken1=(int)out;
5636           emit_jne(2);
5637         }
5638         if((opcode[i]&0x2f)==7) // BGTZ
5639         {
5640           emit_test(s1h,s1h);
5641           nottaken1=(int)out;
5642           emit_js(2);
5643           taken=(int)out;
5644           emit_jne(1);
5645         }
5646       } // if(!only32)
5647           
5648       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5649       assert(s1l>=0);
5650       if((opcode[i]&0x2f)==4) // BEQ
5651       {
5652         if(s2l>=0) emit_cmp(s1l,s2l);
5653         else emit_test(s1l,s1l);
5654         nottaken=(int)out;
5655         emit_jne(2);
5656       }
5657       if((opcode[i]&0x2f)==5) // BNE
5658       {
5659         if(s2l>=0) emit_cmp(s1l,s2l);
5660         else emit_test(s1l,s1l);
5661         nottaken=(int)out;
5662         emit_jeq(2);
5663       }
5664       if((opcode[i]&0x2f)==6) // BLEZ
5665       {
5666         emit_cmpimm(s1l,1);
5667         nottaken=(int)out;
5668         emit_jge(2);
5669       }
5670       if((opcode[i]&0x2f)==7) // BGTZ
5671       {
5672         emit_cmpimm(s1l,1);
5673         nottaken=(int)out;
5674         emit_jl(2);
5675       }
5676     } // if(!unconditional)
5677     int adj;
5678     uint64_t ds_unneeded=branch_regs[i].u;
5679     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5680     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5681     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5682     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5683     ds_unneeded|=1;
5684     ds_unneeded_upper|=1;
5685     // branch taken
5686     if(!nop) {
5687       if(taken) set_jump_target(taken,(int)out);
5688       assem_debug("1:\n");
5689       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5690                     ds_unneeded,ds_unneeded_upper);
5691       // load regs
5692       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5693       address_generation(i+1,&branch_regs[i],0);
5694       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5695       ds_assemble(i+1,&branch_regs[i]);
5696       cc=get_reg(branch_regs[i].regmap,CCREG);
5697       if(cc==-1) {
5698         emit_loadreg(CCREG,cc=HOST_CCREG);
5699         // CHECK: Is the following instruction (fall thru) allocated ok?
5700       }
5701       assert(cc==HOST_CCREG);
5702       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5703       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5704       assem_debug("cycle count (adj)\n");
5705       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5706       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5707       if(internal)
5708         assem_debug("branch: internal\n");
5709       else
5710         assem_debug("branch: external\n");
5711       if(internal&&is_ds[(ba[i]-start)>>2]) {
5712         ds_assemble_entry(i);
5713       }
5714       else {
5715         add_to_linker((int)out,ba[i],internal);
5716         emit_jmp(0);
5717       }
5718     }
5719     // branch not taken
5720     cop1_usable=prev_cop1_usable;
5721     if(!unconditional) {
5722       if(nottaken1) set_jump_target(nottaken1,(int)out);
5723       set_jump_target(nottaken,(int)out);
5724       assem_debug("2:\n");
5725       if(!likely[i]) {
5726         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5727                       ds_unneeded,ds_unneeded_upper);
5728         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5729         address_generation(i+1,&branch_regs[i],0);
5730         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5731         ds_assemble(i+1,&branch_regs[i]);
5732       }
5733       cc=get_reg(branch_regs[i].regmap,CCREG);
5734       if(cc==-1&&!likely[i]) {
5735         // Cycle count isn't in a register, temporarily load it then write it out
5736         emit_loadreg(CCREG,HOST_CCREG);
5737         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5738         int jaddr=(int)out;
5739         emit_jns(0);
5740         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5741         emit_storereg(CCREG,HOST_CCREG);
5742       }
5743       else{
5744         cc=get_reg(i_regmap,CCREG);
5745         assert(cc==HOST_CCREG);
5746         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5747         int jaddr=(int)out;
5748         emit_jns(0);
5749         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5750       }
5751     }
5752   }
5753 }
5754
// Assemble the branch at instruction index i whose condition is a sign test
// of rs1[i] selected by opcode2[i] (BLTZ/BGEZ and their link/likely variants,
// per the inline comments), together with its delay slot at i+1.
//   i      - index of the branch within the block being compiled
//   i_regs - register state at the branch; its regmap is used to locate host
//            registers on the in-order path
// When ooo[i] is set the delay slot is assembled before the test; otherwise
// the test is emitted first and the delay slot is assembled separately on the
// taken and not-taken paths.
5755 void sjump_assemble(int i,struct regstat *i_regs)
5756 {
5757   signed char *i_regmap=i_regs->regmap;
5758   int cc;
5759   int match;
5760   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5761   assem_debug("smatch=%d\n",match);
5762   int s1h,s1l;
5763   int prev_cop1_usable=cop1_usable;
5764   int unconditional=0,nevertaken=0;
5765   int only32=0;
5766   int invert=0;
5767   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5768   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
       // invert: emit the test with the opposite sense so that it jumps over a
       // taken path placed right after it (used when match_bt() reports a
       // mismatch, so register state has to be fixed up before the jump).
5769   if(!match) invert=1;
5770   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
       // Backward branches (target index below i) are always inverted here.
5771   if(i>(ba[i]-start)>>2) invert=1;
5772   #endif
5773
5774   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5775   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5776
       // Host registers holding the low half of rs1[i] and (|64) its upper half.
       // The out-of-order path looks them up in the post-delay-slot mapping.
5777   if(ooo[i]) {
5778     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5779     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5780   }
5781   else {
5782     s1l=get_reg(i_regmap,rs1[i]);
5783     s1h=get_reg(i_regmap,rs1[i]|64);
5784   }
5785   if(rs1[i]==0)
5786   {
         // Testing r0: odd opcode2 values (the BGEZ family) are always taken,
         // even ones (the BLTZ family) never are.
5787     if(opcode2[i]&1) unconditional=1;
5788     else nevertaken=1;
5789     // These are never taken (r0 is never less than zero)
5790     //assert(opcode2[i]!=0);
5791     //assert(opcode2[i]!=2);
5792     //assert(opcode2[i]!=0x10);
5793     //assert(opcode2[i]!=0x12);
5794   }
5795   else {
         // Bit rs1[i] of was32: when set, only the low half (s1l) is tested below.
5796     only32=(regs[i].was32>>rs1[i])&1;
5797   }
5798
5799   if(ooo[i]) {
5800     // Out of order execution (delay slot first)
5801     //printf("OOOE\n");
5802     address_generation(i+1,i_regs,regs[i].regmap_entry);
5803     ds_assemble(i+1,i_regs);
5804     int adj;
         // Start from the branch's unneeded masks, then clear the bits of the
         // registers the branch itself reads; bit 0 (r0) is always set.
5805     uint64_t bc_unneeded=branch_regs[i].u;
5806     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5807     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5808     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5809     bc_unneeded|=1;
5810     bc_unneeded_upper|=1;
5811     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5812                   bc_unneeded,bc_unneeded_upper);
5813     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5814     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5815     if(rt1[i]==31) {
5816       int rt,return_address;
5817       rt=get_reg(branch_regs[i].regmap,31);
5818       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5819       if(rt>=0) {
5820         // Save the PC even if the branch is not taken
5821         return_address=start+i*4+8;
5822         emit_movimm(return_address,rt); // PC into link register
5823         #ifdef IMM_PREFETCH
5824         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5825         #endif
5826       }
5827     }
5828     cc=get_reg(branch_regs[i].regmap,CCREG);
5829     assert(cc==HOST_CCREG);
5830     if(unconditional) 
5831       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5832     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5833     assem_debug("cycle count (adj)\n");
5834     if(unconditional) {
5835       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
           // Nothing more is emitted for a branch to itself whose delay slot
           // word is zero (the "idle loop" case logged above).
           // NOTE(review): presumably do_cc() has already emitted the idle-loop
           // handling in that case - confirm.
5836       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5837         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5838         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5839         if(internal)
5840           assem_debug("branch: internal\n");
5841         else
5842           assem_debug("branch: external\n");
5843         if(internal&&is_ds[(ba[i]-start)>>2]) {
5844           ds_assemble_entry(i);
5845         }
5846         else {
5847           add_to_linker((int)out,ba[i],internal);
5848           emit_jmp(0);
5849         }
5850         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5851         if(((u_int)out)&7) emit_addnop(0);
5852         #endif
5853       }
5854     }
5855     else if(nevertaken) {
           // Only the cycle accounting remains: add the cycles and branch to a
           // CC_STUB when the result is non-negative.
5856       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5857       int jaddr=(int)out;
5858       emit_jns(0);
5859       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5860     }
5861     else {
5862       int nottaken=0;
5863       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5864       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
           // The sign is taken from the upper half (s1h) unless only32 is set,
           // in which case the low half (s1l) is tested instead.
5865       if(!only32)
5866       {
5867         assert(s1h>=0);
5868         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5869         {
5870           emit_test(s1h,s1h);
5871           if(invert){
5872             nottaken=(int)out;
5873             emit_jns(1);
5874           }else{
5875             add_to_linker((int)out,ba[i],internal);
5876             emit_js(0);
5877           }
5878         }
5879         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5880         {
5881           emit_test(s1h,s1h);
5882           if(invert){
5883             nottaken=(int)out;
5884             emit_js(1);
5885           }else{
5886             add_to_linker((int)out,ba[i],internal);
5887             emit_jns(0);
5888           }
5889         }
5890       } // if(!only32)
5891       else
5892       {
5893         assert(s1l>=0);
5894         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5895         {
5896           emit_test(s1l,s1l);
5897           if(invert){
5898             nottaken=(int)out;
5899             emit_jns(1);
5900           }else{
5901             add_to_linker((int)out,ba[i],internal);
5902             emit_js(0);
5903           }
5904         }
5905         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5906         {
5907           emit_test(s1l,s1l);
5908           if(invert){
5909             nottaken=(int)out;
5910             emit_js(1);
5911           }else{
5912             add_to_linker((int)out,ba[i],internal);
5913             emit_jns(0);
5914           }
5915         }
5916       } // else (only32)
5917           
           // Inverted form: the taken path is emitted here and the inverted test
           // above is patched to land just after it.
5918       if(invert) {
5919         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5920         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5921           if(adj) {
5922             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5923             add_to_linker((int)out,ba[i],internal);
5924           }else{
5925             emit_addnop(13);
5926             add_to_linker((int)out,ba[i],internal*2);
5927           }
5928           emit_jmp(0);
5929         }else
5930         #endif
5931         {
5932           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5933           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5934           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5935           if(internal)
5936             assem_debug("branch: internal\n");
5937           else
5938             assem_debug("branch: external\n");
5939           if(internal&&is_ds[(ba[i]-start)>>2]) {
5940             ds_assemble_entry(i);
5941           }
5942           else {
5943             add_to_linker((int)out,ba[i],internal);
5944             emit_jmp(0);
5945           }
5946         }
5947         set_jump_target(nottaken,(int)out);
5948       }
5949
5950       if(adj) {
5951         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5952       }
5953     } // (!unconditional)
5954   } // if(ooo)
5955   else
5956   {
5957     // In-order execution (branch first)
5958     //printf("IOE\n");
5959     int nottaken=0;
         // NOTE(review): the link register is written before the sign test
         // below; if rs1[i] could also be r31 the test might see the new value -
         // confirm this combination cannot reach this path.
5960     if(rt1[i]==31) {
5961       int rt,return_address;
5962       rt=get_reg(branch_regs[i].regmap,31);
5963       if(rt>=0) {
5964         // Save the PC even if the branch is not taken
5965         return_address=start+i*4+8;
5966         emit_movimm(return_address,rt); // PC into link register
5967         #ifdef IMM_PREFETCH
5968         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5969         #endif
5970       }
5971     }
5972     if(!unconditional) {
5973       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
           // Each test jumps to the not-taken path when the condition fails;
           // the location is recorded in nottaken and patched further down.
5974       if(!only32)
5975       {
5976         assert(s1h>=0);
5977         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5978         {
5979           emit_test(s1h,s1h);
5980           nottaken=(int)out;
5981           emit_jns(1);
5982         }
5983         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5984         {
5985           emit_test(s1h,s1h);
5986           nottaken=(int)out;
5987           emit_js(1);
5988         }
5989       } // if(!only32)
5990       else
5991       {
5992         assert(s1l>=0);
5993         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5994         {
5995           emit_test(s1l,s1l);
5996           nottaken=(int)out;
5997           emit_jns(1);
5998         }
5999         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6000         {
6001           emit_test(s1l,s1l);
6002           nottaken=(int)out;
6003           emit_js(1);
6004         }
6005       }
6006     } // if(!unconditional)
6007     int adj;
         // Start from the branch's unneeded masks and clear the bits of the
         // registers the delay slot (i+1) reads; bit 0 (r0) is always set.
6008     uint64_t ds_unneeded=branch_regs[i].u;
6009     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6010     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6011     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6012     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6013     ds_unneeded|=1;
6014     ds_unneeded_upper|=1;
6015     // branch taken
6016     if(!nevertaken) {
6017       //assem_debug("1:\n");
6018       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6019                     ds_unneeded,ds_unneeded_upper);
6020       // load regs
6021       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6022       address_generation(i+1,&branch_regs[i],0);
6023       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6024       ds_assemble(i+1,&branch_regs[i]);
6025       cc=get_reg(branch_regs[i].regmap,CCREG);
6026       if(cc==-1) {
6027         emit_loadreg(CCREG,cc=HOST_CCREG);
6028         // CHECK: Is the following instruction (fall thru) allocated ok?
6029       }
6030       assert(cc==HOST_CCREG);
6031       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6032       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6033       assem_debug("cycle count (adj)\n");
6034       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6035       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6036       if(internal)
6037         assem_debug("branch: internal\n");
6038       else
6039         assem_debug("branch: external\n");
6040       if(internal&&is_ds[(ba[i]-start)>>2]) {
6041         ds_assemble_entry(i);
6042       }
6043       else {
6044         add_to_linker((int)out,ba[i],internal);
6045         emit_jmp(0);
6046       }
6047     }
6048     // branch not taken
         // The taken path above may have changed cop1_usable while assembling
         // the delay slot; restore the value saved on entry.
6049     cop1_usable=prev_cop1_usable;
6050     if(!unconditional) {
6051       set_jump_target(nottaken,(int)out);
6052       assem_debug("1:\n");
           // For likely branches the delay slot is not assembled on this path.
6053       if(!likely[i]) {
6054         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6055                       ds_unneeded,ds_unneeded_upper);
6056         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6057         address_generation(i+1,&branch_regs[i],0);
6058         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6059         ds_assemble(i+1,&branch_regs[i]);
6060       }
6061       cc=get_reg(branch_regs[i].regmap,CCREG);
6062       if(cc==-1&&!likely[i]) {
6063         // Cycle count isn't in a register, temporarily load it then write it out
6064         emit_loadreg(CCREG,HOST_CCREG);
6065         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6066         int jaddr=(int)out;
6067         emit_jns(0);
6068         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6069         emit_storereg(CCREG,HOST_CCREG);
6070       }
6071       else{
6072         cc=get_reg(i_regmap,CCREG);
6073         assert(cc==HOST_CCREG);
6074         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
6075         int jaddr=(int)out;
6076         emit_jns(0);
6077         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6078       }
6079     }
6080   }
6081 }
6082
// Assemble the branch at instruction index i that tests bit 0x800000 of the
// value mapped to FSREG (BC1T when bit 0x10000 of source[i] is set, BC1F
// otherwise, per the inline comments), together with its delay slot at i+1.
//   i      - index of the branch within the block being compiled
//   i_regs - register state at the branch; also handed to the FP_STUB
// When ooo[i] is set the delay slot is assembled before the test; otherwise
// the test is emitted first and the delay slot is assembled separately on the
// taken and not-taken paths.
6083 void fjump_assemble(int i,struct regstat *i_regs)
6084 {
6085   signed char *i_regmap=i_regs->regmap;
6086   int cc;
6087   int match;
6088   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6089   assem_debug("fmatch=%d\n",match);
6090   int fs,cs;
6091   int eaddr;
6092   int invert=0;
6093   int internal=internal_branch(branch_regs[i].is32,ba[i]);
6094   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
       // invert: emit the test with the opposite sense so that it jumps over a
       // taken path placed right after it.
6095   if(!match) invert=1;
6096   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6097   if(i>(ba[i]-start)>>2) invert=1;
6098   #endif
6099
6100   if(ooo[i]) {
6101     fs=get_reg(branch_regs[i].regmap,FSREG);
6102     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
6103   }
6104   else {
6105     fs=get_reg(i_regmap,FSREG);
6106   }
6107
6108   // Check cop1 unusable
       // Emitted once per run of code where cop1_usable is still clear: test bit
       // 0x20000000 of the value mapped to CSREG and go to an FP_STUB when it is
       // zero (presumably the coprocessor-1-usable bit of Status - confirm).
6109   if(!cop1_usable) {
6110     cs=get_reg(i_regmap,CSREG);
6111     assert(cs>=0);
6112     emit_testimm(cs,0x20000000);
6113     eaddr=(int)out;
6114     emit_jeq(0);
6115     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
6116     cop1_usable=1;
6117   }
6118
6119   if(ooo[i]) {
6120     // Out of order execution (delay slot first)
6121     //printf("OOOE\n");
6122     ds_assemble(i+1,i_regs);
6123     int adj;
         // Start from the branch's unneeded masks, then clear the bits of the
         // registers the branch itself reads; bit 0 (r0) is always set.
6124     uint64_t bc_unneeded=branch_regs[i].u;
6125     uint64_t bc_unneeded_upper=branch_regs[i].uu;
6126     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6127     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
6128     bc_unneeded|=1;
6129     bc_unneeded_upper|=1;
6130     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6131                   bc_unneeded,bc_unneeded_upper);
6132     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
6133     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6134     cc=get_reg(branch_regs[i].regmap,CCREG);
6135     assert(cc==HOST_CCREG);
6136     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
6137     assem_debug("cycle count (adj)\n");
6138     if(1) {
6139       int nottaken=0;
6140       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6141       if(1) {
6142         assert(fs>=0);
6143         emit_testimm(fs,0x800000);
6144         if(source[i]&0x10000) // BC1T
6145         {
6146           if(invert){
6147             nottaken=(int)out;
6148             emit_jeq(1);
6149           }else{
6150             add_to_linker((int)out,ba[i],internal);
6151             emit_jne(0);
6152           }
6153         }
6154         else // BC1F
6155           if(invert){
6156             nottaken=(int)out;
6157             emit_jne(1);
6158           }else{
6159             add_to_linker((int)out,ba[i],internal);
6160             emit_jeq(0);
6161           }
             // (the braces below form an empty block; the BC1F statement above
             // is the whole else branch)
6162         {
6163         }
6164       } // if(1)
6165           
           // Inverted form: the taken path is emitted here and the inverted test
           // above is patched to land just after it.
6166       if(invert) {
6167         if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
6168         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6169         else if(match) emit_addnop(13);
6170         #endif
6171         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6172         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6173         if(internal)
6174           assem_debug("branch: internal\n");
6175         else
6176           assem_debug("branch: external\n");
6177         if(internal&&is_ds[(ba[i]-start)>>2]) {
6178           ds_assemble_entry(i);
6179         }
6180         else {
6181           add_to_linker((int)out,ba[i],internal);
6182           emit_jmp(0);
6183         }
6184         set_jump_target(nottaken,(int)out);
6185       }
6186
6187       if(adj) {
6188         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
6189       }
6190     } // if(1)
6191   } // if(ooo)
6192   else
6193   {
6194     // In-order execution (branch first)
6195     //printf("IOE\n");
6196     int nottaken=0;
6197     if(1) {
6198       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6199       if(1) {
6200         assert(fs>=0);
             // Jump to the not-taken path when the condition fails; the location
             // is recorded in nottaken and patched further down.
6201         emit_testimm(fs,0x800000);
6202         if(source[i]&0x10000) // BC1T
6203         {
6204           nottaken=(int)out;
6205           emit_jeq(1);
6206         }
6207         else // BC1F
6208         {
6209           nottaken=(int)out;
6210           emit_jne(1);
6211         }
6212       }
6213     } // if(1)
6214     int adj;
         // Start from the branch's unneeded masks and clear the bits of the
         // registers the delay slot (i+1) reads; bit 0 (r0) is always set.
6215     uint64_t ds_unneeded=branch_regs[i].u;
6216     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6217     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6218     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6219     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6220     ds_unneeded|=1;
6221     ds_unneeded_upper|=1;
6222     // branch taken
6223     //assem_debug("1:\n");
6224     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6225                   ds_unneeded,ds_unneeded_upper);
6226     // load regs
6227     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6228     address_generation(i+1,&branch_regs[i],0);
6229     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6230     ds_assemble(i+1,&branch_regs[i]);
6231     cc=get_reg(branch_regs[i].regmap,CCREG);
6232     if(cc==-1) {
6233       emit_loadreg(CCREG,cc=HOST_CCREG);
6234       // CHECK: Is the following instruction (fall thru) allocated ok?
6235     }
6236     assert(cc==HOST_CCREG);
6237     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6238     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6239     assem_debug("cycle count (adj)\n");
6240     if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6241     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6242     if(internal)
6243       assem_debug("branch: internal\n");
6244     else
6245       assem_debug("branch: external\n");
6246     if(internal&&is_ds[(ba[i]-start)>>2]) {
6247       ds_assemble_entry(i);
6248     }
6249     else {
6250       add_to_linker((int)out,ba[i],internal);
6251       emit_jmp(0);
6252     }
6253
6254     // branch not taken
6255     if(1) { // <- FIXME (don't need this)
6256       set_jump_target(nottaken,(int)out);
6257       assem_debug("1:\n");
           // For likely branches the delay slot is not assembled on this path.
6258       if(!likely[i]) {
6259         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6260                       ds_unneeded,ds_unneeded_upper);
6261         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6262         address_generation(i+1,&branch_regs[i],0);
6263         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6264         ds_assemble(i+1,&branch_regs[i]);
6265       }
6266       cc=get_reg(branch_regs[i].regmap,CCREG);
6267       if(cc==-1&&!likely[i]) {
6268         // Cycle count isn't in a register, temporarily load it then write it out
6269         emit_loadreg(CCREG,HOST_CCREG);
6270         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6271         int jaddr=(int)out;
6272         emit_jns(0);
6273         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6274         emit_storereg(CCREG,HOST_CCREG);
6275       }
6276       else{
6277         cc=get_reg(i_regmap,CCREG);
6278         assert(cc==HOST_CCREG);
6279         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
6280         int jaddr=(int)out;
6281         emit_jns(0);
6282         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6283       }
6284     }
6285   }
6286 }
6287
// Assemble the branch or jump at instruction index i without assembling its
// delay slot inline (the name suggests the delay slot lies past the end of
// this block's page - confirm against the caller).
// Instead of a direct branch, the destination (ba[i] when taken, start+i*4+8
// when not) is computed into HOST_BTREG, dirty registers are written back,
// and control jumps to the code for address start+i*4+5: the delay-slot
// address with the low bit set.  pagespan_ds registers its entry under
// start+1, so the +1 appears to mark a delay-slot entry point.
// For likely branches the not-taken path jumps straight to start+i*4+8.
//   i      - index of the branch within the block being compiled
//   i_regs - register state at the branch
6288 static void pagespan_assemble(int i,struct regstat *i_regs)
6289 {
       // Host registers for both source operands: low halves and (|64) upper halves.
6290   int s1l=get_reg(i_regs->regmap,rs1[i]);
6291   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
6292   int s2l=get_reg(i_regs->regmap,rs2[i]);
6293   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
6294   void *nt_branch=NULL;
6295   int taken=0;
6296   int nottaken=0;
6297   int unconditional=0;
       // When one operand is r0, keep the other one in s1 and mark s2 as absent
       // (-1) so the comparisons below use emit_test instead of emit_cmp.
6298   if(rs1[i]==0)
6299   {
6300     s1l=s2l;s1h=s2h;
6301     s2l=s2h=-1;
6302   }
6303   else if(rs2[i]==0)
6304   {
6305     s2l=s2h=-1;
6306   }
       // Both operands flagged in is32: skip the upper-half comparisons.
6307   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
6308     s1h=s2h=-1;
6309   }
       // Pick scratch host registers that do not hold rs1/rs2: addr (HOST_BTREG
       // itself when it is unmapped), then alt, then ntaddr for BLEZ/BGTZ.  The
       // searches share hr, so each one continues where the previous stopped.
       // NOTE(review): addr/alt/ntaddr stay uninitialized if a search finds
       // nothing; the assert on hr below is the only check - confirm it covers
       // every case.
6310   int hr=0;
6311   int addr,alt,ntaddr;
6312   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
6313   else {
6314     while(hr<HOST_REGS)
6315     {
6316       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
6317          (i_regs->regmap[hr]&63)!=rs1[i] &&
6318          (i_regs->regmap[hr]&63)!=rs2[i] )
6319       {
6320         addr=hr++;break;
6321       }
6322       hr++;
6323     }
6324   }
6325   while(hr<HOST_REGS)
6326   {
6327     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6328        (i_regs->regmap[hr]&63)!=rs1[i] &&
6329        (i_regs->regmap[hr]&63)!=rs2[i] )
6330     {
6331       alt=hr++;break;
6332     }
6333     hr++;
6334   }
6335   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
6336   {
6337     while(hr<HOST_REGS)
6338     {
6339       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6340          (i_regs->regmap[hr]&63)!=rs1[i] &&
6341          (i_regs->regmap[hr]&63)!=rs2[i] )
6342       {
6343         ntaddr=hr;break;
6344       }
6345       hr++;
6346     }
6347   }
6348   assert(hr<HOST_REGS);
6349   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6350     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6351   }
       // Charge the cycles for the branch up front.
6352   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6353   if(opcode[i]==2) // J
6354   {
6355     unconditional=1;
6356   }
6357   if(opcode[i]==3) // JAL
6358   {
6359     // TODO: mini_ht
         // NOTE(review): rt is used without checking for -1 - confirm r31 is
         // always mapped here.
6360     int rt=get_reg(i_regs->regmap,31);
6361     emit_movimm(start+i*4+8,rt);
6362     unconditional=1;
6363   }
6364   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6365   {
         // Register jump: the destination is the value of rs1.
6366     emit_mov(s1l,addr);
6367     if(opcode2[i]==9) // JALR
6368     {
6369       int rt=get_reg(i_regs->regmap,rt1[i]);
6370       emit_movimm(start+i*4+8,rt);
6371     }
6372   }
       // Non-likely conditional branches: load both candidate destinations and
       // select one into addr with conditional moves, so no host branch is
       // emitted for the condition itself.
6373   if((opcode[i]&0x3f)==4) // BEQ
6374   {
6375     if(rs1[i]==rs2[i])
6376     {
6377       unconditional=1;
6378     }
6379     else
6380     #ifdef HAVE_CMOV_IMM
6381     if(s1h<0) {
6382       if(s2l>=0) emit_cmp(s1l,s2l);
6383       else emit_test(s1l,s1l);
6384       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6385     }
6386     else
6387     #endif
6388     {
6389       assert(s1l>=0);
6390       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6391       if(s1h>=0) {
6392         if(s2h>=0) emit_cmp(s1h,s2h);
6393         else emit_test(s1h,s1h);
6394         emit_cmovne_reg(alt,addr);
6395       }
6396       if(s2l>=0) emit_cmp(s1l,s2l);
6397       else emit_test(s1l,s1l);
6398       emit_cmovne_reg(alt,addr);
6399     }
6400   }
6401   if((opcode[i]&0x3f)==5) // BNE
6402   {
6403     #ifdef HAVE_CMOV_IMM
6404     if(s1h<0) {
6405       if(s2l>=0) emit_cmp(s1l,s2l);
6406       else emit_test(s1l,s1l);
6407       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6408     }
6409     else
6410     #endif
6411     {
6412       assert(s1l>=0);
6413       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6414       if(s1h>=0) {
6415         if(s2h>=0) emit_cmp(s1h,s2h);
6416         else emit_test(s1h,s1h);
6417         emit_cmovne_reg(alt,addr);
6418       }
6419       if(s2l>=0) emit_cmp(s1l,s2l);
6420       else emit_test(s1l,s1l);
6421       emit_cmovne_reg(alt,addr);
6422     }
6423   }
       // Likely variants use a real host branch to the not-taken path instead;
       // nottaken records the jump to patch once that path is emitted below.
6424   if((opcode[i]&0x3f)==0x14) // BEQL
6425   {
6426     if(s1h>=0) {
6427       if(s2h>=0) emit_cmp(s1h,s2h);
6428       else emit_test(s1h,s1h);
6429       nottaken=(int)out;
6430       emit_jne(0);
6431     }
6432     if(s2l>=0) emit_cmp(s1l,s2l);
6433     else emit_test(s1l,s1l);
6434     if(nottaken) set_jump_target(nottaken,(int)out);
6435     nottaken=(int)out;
6436     emit_jne(0);
6437   }
6438   if((opcode[i]&0x3f)==0x15) // BNEL
6439   {
6440     if(s1h>=0) {
6441       if(s2h>=0) emit_cmp(s1h,s2h);
6442       else emit_test(s1h,s1h);
6443       taken=(int)out;
6444       emit_jne(0);
6445     }
6446     if(s2l>=0) emit_cmp(s1l,s2l);
6447     else emit_test(s1l,s1l);
6448     nottaken=(int)out;
6449     emit_jeq(0);
6450     if(taken) set_jump_target(taken,(int)out);
6451   }
6452   if((opcode[i]&0x3f)==6) // BLEZ
6453   {
6454     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6455     emit_cmpimm(s1l,1);
6456     if(s1h>=0) emit_mov(addr,ntaddr);
6457     emit_cmovl_reg(alt,addr);
6458     if(s1h>=0) {
6459       emit_test(s1h,s1h);
6460       emit_cmovne_reg(ntaddr,addr);
6461       emit_cmovs_reg(alt,addr);
6462     }
6463   }
6464   if((opcode[i]&0x3f)==7) // BGTZ
6465   {
6466     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6467     emit_cmpimm(s1l,1);
6468     if(s1h>=0) emit_mov(addr,alt);
6469     emit_cmovl_reg(ntaddr,addr);
6470     if(s1h>=0) {
6471       emit_test(s1h,s1h);
6472       emit_cmovne_reg(alt,addr);
6473       emit_cmovs_reg(ntaddr,addr);
6474     }
6475   }
       // The following opcodes are not handled here; the asserts fire if one
       // is reached.
6476   if((opcode[i]&0x3f)==0x16) // BLEZL
6477   {
6478     assert((opcode[i]&0x3f)!=0x16);
6479   }
6480   if((opcode[i]&0x3f)==0x17) // BGTZL
6481   {
6482     assert((opcode[i]&0x3f)!=0x17);
6483   }
6484   assert(opcode[i]!=1); // BLTZ/BGEZ
6485
6486   //FIXME: Check CSREG
       // Here s1l is the host register for rs1[i].
       // NOTE(review): presumably rs1[i] is set to the FP status register for
       // these opcodes - confirm where rs1 is filled in.
6487   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6488     if((source[i]&0x30000)==0) // BC1F
6489     {
6490       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6491       emit_testimm(s1l,0x800000);
6492       emit_cmovne_reg(alt,addr);
6493     }
6494     if((source[i]&0x30000)==0x10000) // BC1T
6495     {
6496       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6497       emit_testimm(s1l,0x800000);
6498       emit_cmovne_reg(alt,addr);
6499     }
6500     if((source[i]&0x30000)==0x20000) // BC1FL
6501     {
6502       emit_testimm(s1l,0x800000);
6503       nottaken=(int)out;
6504       emit_jne(0);
6505     }
6506     if((source[i]&0x30000)==0x30000) // BC1TL
6507     {
6508       emit_testimm(s1l,0x800000);
6509       nottaken=(int)out;
6510       emit_jeq(0);
6511     }
6512   }
6513
6514   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6515   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
       // Likely and unconditional cases reach this point only when taken, so
       // the destination is the constant ba[i]; otherwise move the selected
       // address into HOST_BTREG unless it was computed there already.
6516   if(likely[i]||unconditional)
6517   {
6518     emit_movimm(ba[i],HOST_BTREG);
6519   }
6520   else if(addr!=HOST_BTREG)
6521   {
6522     emit_mov(addr,HOST_BTREG);
6523   }
       // Jump to the delay-slot entry (start+i*4+5).  The jump goes directly to
       // already-compiled code when check_addr() finds some, and to the
       // external-jump stub emitted right after it otherwise.
6524   void *branch_addr=out;
6525   emit_jmp(0);
6526   int target_addr=start+i*4+5;
6527   void *stub=out;
6528   void *compiled_target_addr=check_addr(target_addr);
6529   emit_extjump_ds((int)branch_addr,target_addr);
6530   if(compiled_target_addr) {
6531     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6532     add_link(target_addr,stub);
6533   }
6534   else set_jump_target((int)branch_addr,(int)stub);
6535   if(likely[i]) {
6536     // Not-taken path
         // Same jump/stub sequence as above, but targeting start+i*4+8.  The
         // locals below shadow the outer ones of the same name.
6537     set_jump_target((int)nottaken,(int)out);
6538     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6539     void *branch_addr=out;
6540     emit_jmp(0);
6541     int target_addr=start+i*4+8;
6542     void *stub=out;
6543     void *compiled_target_addr=check_addr(target_addr);
6544     emit_extjump_ds((int)branch_addr,target_addr);
6545     if(compiled_target_addr) {
6546       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6547       add_link(target_addr,stub);
6548     }
6549     else set_jump_target((int)branch_addr,(int)stub);
6550   }
6551 }
6552
6553 // Assemble the delay slot for the above
6554 static void pagespan_ds()
6555 {
6556   assem_debug("initial delay slot:\n");
6557   u_int vaddr=start+1;
6558   u_int page=get_page(vaddr);
6559   u_int vpage=get_vpage(vaddr);
6560   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6561   do_dirty_stub_ds();
6562   ll_add(jump_in+page,vaddr,(void *)out);
6563   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6564   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6565     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6566   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6567     emit_writeword(HOST_BTREG,(int)&branch_target);
6568   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6569   address_generation(0,&regs[0],regs[0].regmap_entry);
6570   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6571     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6572   cop1_usable=0;
6573   is_delayslot=0;
6574   switch(itype[0]) {
6575     case ALU:
6576       alu_assemble(0,&regs[0]);break;
6577     case IMM16:
6578       imm16_assemble(0,&regs[0]);break;
6579     case SHIFT:
6580       shift_assemble(0,&regs[0]);break;
6581     case SHIFTIMM:
6582       shiftimm_assemble(0,&regs[0]);break;
6583     case LOAD:
6584       load_assemble(0,&regs[0]);break;
6585     case LOADLR:
6586       loadlr_assemble(0,&regs[0]);break;
6587     case STORE:
6588       store_assemble(0,&regs[0]);break;
6589     case STORELR:
6590       storelr_assemble(0,&regs[0]);break;
6591     case COP0:
6592       cop0_assemble(0,&regs[0]);break;
6593     case COP1:
6594       cop1_assemble(0,&regs[0]);break;
6595     case C1LS:
6596       c1ls_assemble(0,&regs[0]);break;
6597     case COP2:
6598       cop2_assemble(0,&regs[0]);break;
6599     case C2LS:
6600       c2ls_assemble(0,&regs[0]);break;
6601     case C2OP:
6602       c2op_assemble(0,&regs[0]);break;
6603     case FCONV:
6604       fconv_assemble(0,&regs[0]);break;
6605     case FLOAT:
6606       float_assemble(0,&regs[0]);break;
6607     case FCOMP:
6608       fcomp_assemble(0,&regs[0]);break;
6609     case MULTDIV:
6610       multdiv_assemble(0,&regs[0]);break;
6611     case MOV:
6612       mov_assemble(0,&regs[0]);break;
6613     case SYSCALL:
6614     case HLECALL:
6615     case INTCALL:
6616     case SPAN:
6617     case UJUMP:
6618     case RJUMP:
6619     case CJUMP:
6620     case SJUMP:
6621     case FJUMP:
6622       printf("Jump in the delay slot.  This is probably a bug.\n");
6623   }
6624   int btaddr=get_reg(regs[0].regmap,BTREG);
6625   if(btaddr<0) {
6626     btaddr=get_reg(regs[0].regmap,-1);
6627     emit_readword((int)&branch_target,btaddr);
6628   }
6629   assert(btaddr!=HOST_CCREG);
6630   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6631 #ifdef HOST_IMM8
6632   emit_movimm(start+4,HOST_TEMPREG);
6633   emit_cmp(btaddr,HOST_TEMPREG);
6634 #else
6635   emit_cmpimm(btaddr,start+4);
6636 #endif
6637   int branch=(int)out;
6638   emit_jeq(0);
6639   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6640   emit_jmp(jump_vaddr_reg[btaddr]);
6641   set_jump_target(branch,(int)out);
6642   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6643   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6644 }
6645
6646 // Basic liveness analysis for MIPS registers
6647 void unneeded_registers(int istart,int iend,int r)
6648 {
6649   int i;
6650   uint64_t u,uu,b,bu;
6651   uint64_t temp_u,temp_uu;
6652   uint64_t tdep;
6653   if(iend==slen-1) {
6654     u=1;uu=1;
6655   }else{
6656     u=unneeded_reg[iend+1];
6657     uu=unneeded_reg_upper[iend+1];
6658     u=1;uu=1;
6659   }
6660   for (i=iend;i>=istart;i--)
6661   {
6662     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6663     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6664     {
6665       // If subroutine call, flag return address as a possible branch target
6666       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6667       
6668       if(ba[i]<start || ba[i]>=(start+slen*4))
6669       {
6670         // Branch out of this block, flush all regs
6671         u=1;
6672         uu=1;
6673         /* Hexagon hack 
6674         if(itype[i]==UJUMP&&rt1[i]==31)
6675         {
6676           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6677         }
6678         if(itype[i]==RJUMP&&rs1[i]==31)
6679         {
6680           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6681         }
6682         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6683           if(itype[i]==UJUMP&&rt1[i]==31)
6684           {
6685             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6686             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6687           }
6688           if(itype[i]==RJUMP&&rs1[i]==31)
6689           {
6690             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6691             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6692           }
6693         }*/
6694         branch_unneeded_reg[i]=u;
6695         branch_unneeded_reg_upper[i]=uu;
6696         // Merge in delay slot
6697         tdep=(~uu>>rt1[i+1])&1;
6698         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6699         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6700         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6701         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6702         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6703         u|=1;uu|=1;
6704         // If branch is "likely" (and conditional)
6705         // then we skip the delay slot on the fall-thru path
6706         if(likely[i]) {
6707           if(i<slen-1) {
6708             u&=unneeded_reg[i+2];
6709             uu&=unneeded_reg_upper[i+2];
6710           }
6711           else
6712           {
6713             u=1;
6714             uu=1;
6715           }
6716         }
6717       }
6718       else
6719       {
6720         // Internal branch, flag target
6721         bt[(ba[i]-start)>>2]=1;
6722         if(ba[i]<=start+i*4) {
6723           // Backward branch
6724           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6725           {
6726             // Unconditional branch
6727             temp_u=1;temp_uu=1;
6728           } else {
6729             // Conditional branch (not taken case)
6730             temp_u=unneeded_reg[i+2];
6731             temp_uu=unneeded_reg_upper[i+2];
6732           }
6733           // Merge in delay slot
6734           tdep=(~temp_uu>>rt1[i+1])&1;
6735           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6736           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6737           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6738           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6739           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6740           temp_u|=1;temp_uu|=1;
6741           // If branch is "likely" (and conditional)
6742           // then we skip the delay slot on the fall-thru path
6743           if(likely[i]) {
6744             if(i<slen-1) {
6745               temp_u&=unneeded_reg[i+2];
6746               temp_uu&=unneeded_reg_upper[i+2];
6747             }
6748             else
6749             {
6750               temp_u=1;
6751               temp_uu=1;
6752             }
6753           }
6754           tdep=(~temp_uu>>rt1[i])&1;
6755           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6756           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6757           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6758           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6759           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6760           temp_u|=1;temp_uu|=1;
6761           unneeded_reg[i]=temp_u;
6762           unneeded_reg_upper[i]=temp_uu;
6763           // Only go three levels deep.  This recursion can take an
6764           // excessive amount of time if there are a lot of nested loops.
6765           if(r<2) {
6766             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6767           }else{
6768             unneeded_reg[(ba[i]-start)>>2]=1;
6769             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6770           }
6771         } /*else*/ if(1) {
6772           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6773           {
6774             // Unconditional branch
6775             u=unneeded_reg[(ba[i]-start)>>2];
6776             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6777             branch_unneeded_reg[i]=u;
6778             branch_unneeded_reg_upper[i]=uu;
6779         //u=1;
6780         //uu=1;
6781         //branch_unneeded_reg[i]=u;
6782         //branch_unneeded_reg_upper[i]=uu;
6783             // Merge in delay slot
6784             tdep=(~uu>>rt1[i+1])&1;
6785             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6786             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6787             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6788             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6789             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6790             u|=1;uu|=1;
6791           } else {
6792             // Conditional branch
6793             b=unneeded_reg[(ba[i]-start)>>2];
6794             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6795             branch_unneeded_reg[i]=b;
6796             branch_unneeded_reg_upper[i]=bu;
6797         //b=1;
6798         //bu=1;
6799         //branch_unneeded_reg[i]=b;
6800         //branch_unneeded_reg_upper[i]=bu;
6801             // Branch delay slot
6802             tdep=(~uu>>rt1[i+1])&1;
6803             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6804             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6805             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6806             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6807             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6808             b|=1;bu|=1;
6809             // If branch is "likely" then we skip the
6810             // delay slot on the fall-thru path
6811             if(likely[i]) {
6812               u=b;
6813               uu=bu;
6814               if(i<slen-1) {
6815                 u&=unneeded_reg[i+2];
6816                 uu&=unneeded_reg_upper[i+2];
6817         //u=1;
6818         //uu=1;
6819               }
6820             } else {
6821               u&=b;
6822               uu&=bu;
6823         //u=1;
6824         //uu=1;
6825             }
6826             if(i<slen-1) {
6827               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6828               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6829         //branch_unneeded_reg[i]=1;
6830         //branch_unneeded_reg_upper[i]=1;
6831             } else {
6832               branch_unneeded_reg[i]=1;
6833               branch_unneeded_reg_upper[i]=1;
6834             }
6835           }
6836         }
6837       }
6838     }
6839     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6840     {
6841       // SYSCALL instruction (software interrupt)
6842       u=1;
6843       uu=1;
6844     }
6845     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6846     {
6847       // ERET instruction (return from interrupt)
6848       u=1;
6849       uu=1;
6850     }
6851     //u=uu=1; // DEBUG
6852     tdep=(~uu>>rt1[i])&1;
6853     // Written registers are unneeded
6854     u|=1LL<<rt1[i];
6855     u|=1LL<<rt2[i];
6856     uu|=1LL<<rt1[i];
6857     uu|=1LL<<rt2[i];
6858     // Accessed registers are needed
6859     u&=~(1LL<<rs1[i]);
6860     u&=~(1LL<<rs2[i]);
6861     uu&=~(1LL<<us1[i]);
6862     uu&=~(1LL<<us2[i]);
6863     // Source-target dependencies
6864     uu&=~(tdep<<dep1[i]);
6865     uu&=~(tdep<<dep2[i]);
6866     // R0 is always unneeded
6867     u|=1;uu|=1;
6868     // Save it
6869     unneeded_reg[i]=u;
6870     unneeded_reg_upper[i]=uu;
6871     /*
6872     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6873     printf("U:");
6874     int r;
6875     for(r=1;r<=CCREG;r++) {
6876       if((unneeded_reg[i]>>r)&1) {
6877         if(r==HIREG) printf(" HI");
6878         else if(r==LOREG) printf(" LO");
6879         else printf(" r%d",r);
6880       }
6881     }
6882     printf(" UU:");
6883     for(r=1;r<=CCREG;r++) {
6884       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6885         if(r==HIREG) printf(" HI");
6886         else if(r==LOREG) printf(" LO");
6887         else printf(" r%d",r);
6888       }
6889     }
6890     printf("\n");*/
6891   }
6892 #ifdef FORCE32
6893   for (i=iend;i>=istart;i--)
6894   {
6895     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6896   }
6897 #endif
6898 }
6899
6900 // Identify registers which are likely to contain 32-bit values
6901 // This is used to predict whether any branches will jump to a
6902 // location with 64-bit values in registers.
6903 static void provisional_32bit()
6904 {
6905   int i,j;
6906   uint64_t is32=1;
6907   uint64_t lastbranch=1;
6908   
6909   for(i=0;i<slen;i++)
6910   {
6911     if(i>0) {
6912       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
6913         if(i>1) is32=lastbranch;
6914         else is32=1;
6915       }
6916     }
6917     if(i>1)
6918     {
6919       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
6920         if(likely[i-2]) {
6921           if(i>2) is32=lastbranch;
6922           else is32=1;
6923         }
6924       }
6925       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
6926       {
6927         if(rs1[i-2]==0||rs2[i-2]==0)
6928         {
6929           if(rs1[i-2]) {
6930             is32|=1LL<<rs1[i-2];
6931           }
6932           if(rs2[i-2]) {
6933             is32|=1LL<<rs2[i-2];
6934           }
6935         }
6936       }
6937     }
6938     // If something jumps here with 64-bit values
6939     // then promote those registers to 64 bits
6940     if(bt[i])
6941     {
6942       uint64_t temp_is32=is32;
6943       for(j=i-1;j>=0;j--)
6944       {
6945         if(ba[j]==start+i*4) 
6946           //temp_is32&=branch_regs[j].is32;
6947           temp_is32&=p32[j];
6948       }
6949       for(j=i;j<slen;j++)
6950       {
6951         if(ba[j]==start+i*4) 
6952           temp_is32=1;
6953       }
6954       is32=temp_is32;
6955     }
6956     int type=itype[i];
6957     int op=opcode[i];
6958     int op2=opcode2[i];
6959     int rt=rt1[i];
6960     int s1=rs1[i];
6961     int s2=rs2[i];
6962     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
6963       // Branches don't write registers, consider the delay slot instead.
6964       type=itype[i+1];
6965       op=opcode[i+1];
6966       op2=opcode2[i+1];
6967       rt=rt1[i+1];
6968       s1=rs1[i+1];
6969       s2=rs2[i+1];
6970       lastbranch=is32;
6971     }
6972     switch(type) {
6973       case LOAD:
6974         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
6975            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
6976           is32&=~(1LL<<rt);
6977         else
6978           is32|=1LL<<rt;
6979         break;
6980       case STORE:
6981       case STORELR:
6982         break;
6983       case LOADLR:
6984         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
6985         if(op==0x22) is32|=1LL<<rt; // LWL
6986         break;
6987       case IMM16:
6988         if (op==0x08||op==0x09|| // ADDI/ADDIU
6989             op==0x0a||op==0x0b|| // SLTI/SLTIU
6990             op==0x0c|| // ANDI
6991             op==0x0f)  // LUI
6992         {
6993           is32|=1LL<<rt;
6994         }
6995         if(op==0x18||op==0x19) { // DADDI/DADDIU
6996           is32&=~(1LL<<rt);
6997           //if(imm[i]==0)
6998           //  is32|=((is32>>s1)&1LL)<<rt;
6999         }
7000         if(op==0x0d||op==0x0e) { // ORI/XORI
7001           uint64_t sr=((is32>>s1)&1LL);
7002           is32&=~(1LL<<rt);
7003           is32|=sr<<rt;
7004         }
7005         break;
7006       case UJUMP:
7007         break;
7008       case RJUMP:
7009         break;
7010       case CJUMP:
7011         break;
7012       case SJUMP:
7013         break;
7014       case FJUMP:
7015         break;
7016       case ALU:
7017         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
7018           is32|=1LL<<rt;
7019         }
7020         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7021           is32|=1LL<<rt;
7022         }
7023         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7024           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
7025           is32&=~(1LL<<rt);
7026           is32|=sr<<rt;
7027         }
7028         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
7029           if(s1==0&&s2==0) {
7030             is32|=1LL<<rt;
7031           }
7032           else if(s2==0) {
7033             uint64_t sr=((is32>>s1)&1LL);
7034             is32&=~(1LL<<rt);
7035             is32|=sr<<rt;
7036           }
7037           else if(s1==0) {
7038             uint64_t sr=((is32>>s2)&1LL);
7039             is32&=~(1LL<<rt);
7040             is32|=sr<<rt;
7041           }
7042           else {
7043             is32&=~(1LL<<rt);
7044           }
7045         }
7046         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
7047           if(s1==0&&s2==0) {
7048             is32|=1LL<<rt;
7049           }
7050           else if(s2==0) {
7051             uint64_t sr=((is32>>s1)&1LL);
7052             is32&=~(1LL<<rt);
7053             is32|=sr<<rt;
7054           }
7055           else {
7056             is32&=~(1LL<<rt);
7057           }
7058         }
7059         break;
7060       case MULTDIV:
7061         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7062           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
7063         }
7064         else {
7065           is32|=(1LL<<HIREG)|(1LL<<LOREG);
7066         }
7067         break;
7068       case MOV:
7069         {
7070           uint64_t sr=((is32>>s1)&1LL);
7071           is32&=~(1LL<<rt);
7072           is32|=sr<<rt;
7073         }
7074         break;
7075       case SHIFT:
7076         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
7077         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
7078         break;
7079       case SHIFTIMM:
7080         is32|=1LL<<rt;
7081         // DSLL/DSRL/DSRA/DSLL32/DSRL32 but not DSRA32 have 64-bit result
7082         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
7083         break;
7084       case COP0:
7085         if(op2==0) is32|=1LL<<rt; // MFC0
7086         break;
7087       case COP1:
7088       case COP2:
7089         if(op2==0) is32|=1LL<<rt; // MFC1
7090         if(op2==1) is32&=~(1LL<<rt); // DMFC1
7091         if(op2==2) is32|=1LL<<rt; // CFC1
7092         break;
7093       case C1LS:
7094       case C2LS:
7095         break;
7096       case FLOAT:
7097       case FCONV:
7098         break;
7099       case FCOMP:
7100         break;
7101       case C2OP:
7102       case SYSCALL:
7103       case HLECALL:
7104         break;
7105       default:
7106         break;
7107     }
7108     is32|=1;
7109     p32[i]=is32;
7110
7111     if(i>0)
7112     {
7113       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
7114       {
7115         if(rt1[i-1]==31) // JAL/JALR
7116         {
7117           // Subroutine call will return here, don't alloc any registers
7118           is32=1;
7119         }
7120         else if(i+1<slen)
7121         {
7122           // Internal branch will jump here, match registers to caller
7123           is32=0x3FFFFFFFFLL;
7124         }
7125       }
7126     }
7127   }
7128 }
7129
7130 // Identify registers which may be assumed to contain 32-bit values
7131 // and where optimizations will rely on this.
7132 // This is used to determine whether backward branches can safely
7133 // jump to a location with 64-bit values in registers.
7134 static void provisional_r32()
7135 {
7136   u_int r32=0;
7137   int i;
7138   
7139   for (i=slen-1;i>=0;i--)
7140   {
7141     int hr;
7142     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7143     {
7144       if(ba[i]<start || ba[i]>=(start+slen*4))
7145       {
7146         // Branch out of this block, don't need anything
7147         r32=0;
7148       }
7149       else
7150       {
7151         // Internal branch
7152         // Need whatever matches the target
7153         // (and doesn't get overwritten by the delay slot instruction)
7154         r32=0;
7155         int t=(ba[i]-start)>>2;
7156         if(ba[i]>start+i*4) {
7157           // Forward branch
7158           //if(!(requires_32bit[t]&~regs[i].was32))
7159           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7160           if(!(pr32[t]&~regs[i].was32))
7161             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7162         }else{
7163           // Backward branch
7164           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
7165             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7166         }
7167       }
7168       // Conditional branch may need registers for following instructions
7169       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7170       {
7171         if(i<slen-2) {
7172           //r32|=requires_32bit[i+2];
7173           r32|=pr32[i+2];
7174           r32&=regs[i].was32;
7175           // Mark this address as a branch target since it may be called
7176           // upon return from interrupt
7177           //bt[i+2]=1;
7178         }
7179       }
7180       // Merge in delay slot
7181       if(!likely[i]) {
7182         // These are overwritten unless the branch is "likely"
7183         // and the delay slot is nullified if not taken
7184         r32&=~(1LL<<rt1[i+1]);
7185         r32&=~(1LL<<rt2[i+1]);
7186       }
7187       // Assume these are needed (delay slot)
7188       if(us1[i+1]>0)
7189       {
7190         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
7191       }
7192       if(us2[i+1]>0)
7193       {
7194         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
7195       }
7196       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
7197       {
7198         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
7199       }
7200       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
7201       {
7202         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
7203       }
7204     }
7205     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7206     {
7207       // SYSCALL instruction (software interrupt)
7208       r32=0;
7209     }
7210     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7211     {
7212       // ERET instruction (return from interrupt)
7213       r32=0;
7214     }
7215     // Check 32 bits
7216     r32&=~(1LL<<rt1[i]);
7217     r32&=~(1LL<<rt2[i]);
7218     if(us1[i]>0)
7219     {
7220       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
7221     }
7222     if(us2[i]>0)
7223     {
7224       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
7225     }
7226     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
7227     {
7228       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
7229     }
7230     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
7231     {
7232       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
7233     }
7234     //requires_32bit[i]=r32;
7235     pr32[i]=r32;
7236     
7237     // Dirty registers which are 32-bit, require 32-bit input
7238     // as they will be written as 32-bit values
7239     for(hr=0;hr<HOST_REGS;hr++)
7240     {
7241       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
7242         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
7243           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
7244           pr32[i]|=1LL<<regs[i].regmap_entry[hr];
7245           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
7246         }
7247       }
7248     }
7249   }
7250 }
7251
7252 // Write back dirty registers as soon as we will no longer modify them,
7253 // so that we don't end up with lots of writes at the branches.
7254 void clean_registers(int istart,int iend,int wr)
7255 {
7256   int i;
7257   int r;
7258   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
7259   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
7260   if(iend==slen-1) {
7261     will_dirty_i=will_dirty_next=0;
7262     wont_dirty_i=wont_dirty_next=0;
7263   }else{
7264     will_dirty_i=will_dirty_next=will_dirty[iend+1];
7265     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
7266   }
7267   for (i=iend;i>=istart;i--)
7268   {
7269     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7270     {
7271       if(ba[i]<start || ba[i]>=(start+slen*4))
7272       {
7273         // Branch out of this block, flush all regs
7274         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7275         {
7276           // Unconditional branch
7277           will_dirty_i=0;
7278           wont_dirty_i=0;
7279           // Merge in delay slot (will dirty)
7280           for(r=0;r<HOST_REGS;r++) {
7281             if(r!=EXCLUDE_REG) {
7282               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7283               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7284               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7285               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7286               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7287               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7288               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7289               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7290               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7291               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7292               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7293               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7294               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7295               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7296             }
7297           }
7298         }
7299         else
7300         {
7301           // Conditional branch
7302           will_dirty_i=0;
7303           wont_dirty_i=wont_dirty_next;
7304           // Merge in delay slot (will dirty)
7305           for(r=0;r<HOST_REGS;r++) {
7306             if(r!=EXCLUDE_REG) {
7307               if(!likely[i]) {
7308                 // Might not dirty if likely branch is not taken
7309                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7310                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7311                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7312                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7313                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7314                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
7315                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7316                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7317                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7318                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7319                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7320                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7321                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7322                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7323               }
7324             }
7325           }
7326         }
7327         // Merge in delay slot (wont dirty)
7328         for(r=0;r<HOST_REGS;r++) {
7329           if(r!=EXCLUDE_REG) {
7330             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7331             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7332             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7333             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7334             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7335             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7336             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7337             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7338             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7339             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7340           }
7341         }
7342         if(wr) {
7343           #ifndef DESTRUCTIVE_WRITEBACK
7344           branch_regs[i].dirty&=wont_dirty_i;
7345           #endif
7346           branch_regs[i].dirty|=will_dirty_i;
7347         }
7348       }
7349       else
7350       {
7351         // Internal branch
7352         if(ba[i]<=start+i*4) {
7353           // Backward branch
7354           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7355           {
7356             // Unconditional branch
7357             temp_will_dirty=0;
7358             temp_wont_dirty=0;
7359             // Merge in delay slot (will dirty)
7360             for(r=0;r<HOST_REGS;r++) {
7361               if(r!=EXCLUDE_REG) {
7362                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7363                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7364                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7365                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7366                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7367                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7368                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7369                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7370                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7371                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7372                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7373                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7374                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7375                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7376               }
7377             }
7378           } else {
7379             // Conditional branch (not taken case)
7380             temp_will_dirty=will_dirty_next;
7381             temp_wont_dirty=wont_dirty_next;
7382             // Merge in delay slot (will dirty)
7383             for(r=0;r<HOST_REGS;r++) {
7384               if(r!=EXCLUDE_REG) {
7385                 if(!likely[i]) {
7386                   // Will not dirty if likely branch is not taken
7387                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7388                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7389                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7390                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7391                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7392                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7393                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7394                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7395                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7396                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7397                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7398                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7399                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7400                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7401                 }
7402               }
7403             }
7404           }
7405           // Merge in delay slot (wont dirty)
7406           for(r=0;r<HOST_REGS;r++) {
7407             if(r!=EXCLUDE_REG) {
7408               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7409               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7410               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7411               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7412               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7413               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7414               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7415               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7416               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7417               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7418             }
7419           }
7420           // Deal with changed mappings
7421           if(i<iend) {
7422             for(r=0;r<HOST_REGS;r++) {
7423               if(r!=EXCLUDE_REG) {
7424                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7425                   temp_will_dirty&=~(1<<r);
7426                   temp_wont_dirty&=~(1<<r);
7427                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7428                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7429                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7430                   } else {
7431                     temp_will_dirty|=1<<r;
7432                     temp_wont_dirty|=1<<r;
7433                   }
7434                 }
7435               }
7436             }
7437           }
7438           if(wr) {
7439             will_dirty[i]=temp_will_dirty;
7440             wont_dirty[i]=temp_wont_dirty;
7441             clean_registers((ba[i]-start)>>2,i-1,0);
7442           }else{
7443             // Limit recursion.  It can take an excessive amount
7444             // of time if there are a lot of nested loops.
7445             will_dirty[(ba[i]-start)>>2]=0;
7446             wont_dirty[(ba[i]-start)>>2]=-1;
7447           }
7448         }
7449         /*else*/ if(1)
7450         {
7451           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7452           {
7453             // Unconditional branch
7454             will_dirty_i=0;
7455             wont_dirty_i=0;
7456           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7457             for(r=0;r<HOST_REGS;r++) {
7458               if(r!=EXCLUDE_REG) {
7459                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7460                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7461                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7462                 }
7463               }
7464             }
7465           //}
7466             // Merge in delay slot
7467             for(r=0;r<HOST_REGS;r++) {
7468               if(r!=EXCLUDE_REG) {
7469                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7470                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7471                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7472                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7473                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7474                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7475                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7476                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7477                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7478                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7479                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7480                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7481                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7482                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7483               }
7484             }
7485           } else {
7486             // Conditional branch
7487             will_dirty_i=will_dirty_next;
7488             wont_dirty_i=wont_dirty_next;
7489           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7490             for(r=0;r<HOST_REGS;r++) {
7491               if(r!=EXCLUDE_REG) {
7492                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7493                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7494                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7495                 }
7496                 else
7497                 {
7498                   will_dirty_i&=~(1<<r);
7499                 }
7500                 // Treat delay slot as part of branch too
7501                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7502                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7503                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7504                 }
7505                 else
7506                 {
7507                   will_dirty[i+1]&=~(1<<r);
7508                 }*/
7509               }
7510             }
7511           //}
7512             // Merge in delay slot
7513             for(r=0;r<HOST_REGS;r++) {
7514               if(r!=EXCLUDE_REG) {
7515                 if(!likely[i]) {
7516                   // Might not dirty if likely branch is not taken
7517                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7518                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7519                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7520                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7521                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7522                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7523                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7524                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7525                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7526                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7527                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7528                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7529                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7530                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7531                 }
7532               }
7533             }
7534           }
7535           // Merge in delay slot
7536           for(r=0;r<HOST_REGS;r++) {
7537             if(r!=EXCLUDE_REG) {
7538               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7539               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7540               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7541               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7542               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7543               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7544               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7545               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7546               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7547               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7548             }
7549           }
7550           if(wr) {
7551             #ifndef DESTRUCTIVE_WRITEBACK
7552             branch_regs[i].dirty&=wont_dirty_i;
7553             #endif
7554             branch_regs[i].dirty|=will_dirty_i;
7555           }
7556         }
7557       }
7558     }
7559     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7560     {
7561       // SYSCALL instruction (software interrupt)
7562       will_dirty_i=0;
7563       wont_dirty_i=0;
7564     }
7565     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7566     {
7567       // ERET instruction (return from interrupt)
7568       will_dirty_i=0;
7569       wont_dirty_i=0;
7570     }
7571     will_dirty_next=will_dirty_i;
7572     wont_dirty_next=wont_dirty_i;
7573     for(r=0;r<HOST_REGS;r++) {
7574       if(r!=EXCLUDE_REG) {
7575         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7576         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7577         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7578         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7579         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7580         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7581         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7582         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7583         if(i>istart) {
7584           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7585           {
7586             // Don't store a register immediately after writing it,
7587             // may prevent dual-issue.
7588             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7589             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7590           }
7591         }
7592       }
7593     }
7594     // Save it
7595     will_dirty[i]=will_dirty_i;
7596     wont_dirty[i]=wont_dirty_i;
7597     // Mark registers that won't be dirtied as not dirty
7598     if(wr) {
7599       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7600       for(r=0;r<HOST_REGS;r++) {
7601         if((will_dirty_i>>r)&1) {
7602           printf(" r%d",r);
7603         }
7604       }
7605       printf("\n");*/
7606
7607       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7608         regs[i].dirty|=will_dirty_i;
7609         #ifndef DESTRUCTIVE_WRITEBACK
7610         regs[i].dirty&=wont_dirty_i;
7611         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7612         {
7613           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7614             for(r=0;r<HOST_REGS;r++) {
7615               if(r!=EXCLUDE_REG) {
7616                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7617                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7618                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);/*assert(!((wont_dirty_i>>r)&1));*/}
7619               }
7620             }
7621           }
7622         }
7623         else
7624         {
7625           if(i<iend) {
7626             for(r=0;r<HOST_REGS;r++) {
7627               if(r!=EXCLUDE_REG) {
7628                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7629                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7630                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);/*assert(!((wont_dirty_i>>r)&1));*/}
7631               }
7632             }
7633           }
7634         }
7635         #endif
7636       //}
7637     }
7638     // Deal with changed mappings
7639     temp_will_dirty=will_dirty_i;
7640     temp_wont_dirty=wont_dirty_i;
7641     for(r=0;r<HOST_REGS;r++) {
7642       if(r!=EXCLUDE_REG) {
7643         int nr;
7644         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7645           if(wr) {
7646             #ifndef DESTRUCTIVE_WRITEBACK
7647             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7648             #endif
7649             regs[i].wasdirty|=will_dirty_i&(1<<r);
7650           }
7651         }
7652         else if((nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7653           // Register moved to a different register
7654           will_dirty_i&=~(1<<r);
7655           wont_dirty_i&=~(1<<r);
7656           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7657           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7658           if(wr) {
7659             #ifndef DESTRUCTIVE_WRITEBACK
7660             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7661             #endif
7662             regs[i].wasdirty|=will_dirty_i&(1<<r);
7663           }
7664         }
7665         else {
7666           will_dirty_i&=~(1<<r);
7667           wont_dirty_i&=~(1<<r);
7668           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7669             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7670             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7671           } else {
7672             wont_dirty_i|=1<<r;
7673             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);/*assert(!((will_dirty>>r)&1));*/
7674           }
7675         }
7676       }
7677     }
7678   }
7679 }
7680
7681   /* disassembly */
7682 void disassemble_inst(int i)
7683 {
7684     if (bt[i]) printf("*"); else printf(" ");
7685     switch(itype[i]) {
7686       case UJUMP:
7687         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7688       case CJUMP:
7689         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7690       case SJUMP:
7691         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7692       case FJUMP:
7693         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7694       case RJUMP:
7695         if (opcode[i]==0x9&&rt1[i]!=31)
7696           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
7697         else
7698           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7699         break;
7700       case SPAN:
7701         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7702       case IMM16:
7703         if(opcode[i]==0xf) //LUI
7704           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7705         else
7706           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7707         break;
7708       case LOAD:
7709       case LOADLR:
7710         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7711         break;
7712       case STORE:
7713       case STORELR:
7714         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7715         break;
7716       case ALU:
7717       case SHIFT:
7718         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7719         break;
7720       case MULTDIV:
7721         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7722         break;
7723       case SHIFTIMM:
7724         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7725         break;
7726       case MOV:
7727         if((opcode2[i]&0x1d)==0x10)
7728           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7729         else if((opcode2[i]&0x1d)==0x11)
7730           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7731         else
7732           printf (" %x: %s\n",start+i*4,insn[i]);
7733         break;
7734       case COP0:
7735         if(opcode2[i]==0)
7736           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7737         else if(opcode2[i]==4)
7738           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7739         else printf (" %x: %s\n",start+i*4,insn[i]);
7740         break;
7741       case COP1:
7742         if(opcode2[i]<3)
7743           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7744         else if(opcode2[i]>3)
7745           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7746         else printf (" %x: %s\n",start+i*4,insn[i]);
7747         break;
7748       case COP2:
7749         if(opcode2[i]<3)
7750           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7751         else if(opcode2[i]>3)
7752           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7753         else printf (" %x: %s\n",start+i*4,insn[i]);
7754         break;
7755       case C1LS:
7756         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7757         break;
7758       case C2LS:
7759         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7760         break;
7761       case INTCALL:
7762         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7763         break;
7764       default:
7765         //printf (" %s %8x\n",insn[i],source[i]);
7766         printf (" %x: %s\n",start+i*4,insn[i]);
7767     }
7768 }
7769
7770 // clear the state completely, instead of just marking
7771 // things invalid like invalidate_all_pages() does
7772 void new_dynarec_clear_full()
7773 {
7774   int n;
7775   for(n=0x80000;n<0x80800;n++)
7776     invalid_code[n]=1;
7777   for(n=0;n<65536;n++)
7778     hash_table[n][0]=hash_table[n][2]=-1;
7779   memset(mini_ht,-1,sizeof(mini_ht));
7780   memset(restore_candidate,0,sizeof(restore_candidate));
7781   memset(shadow,0,sizeof(shadow));
7782   copy=shadow;
7783   expirep=16384; // Expiry pointer, +2 blocks
7784   pending_exception=0;
7785   literalcount=0;
7786   stop_after_jal=0;
7787   // TLB
7788 #ifndef DISABLE_TLB
7789   using_tlb=0;
7790 #endif
7791   sp_in_mirror=0;
7792   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7793     memory_map[n]=-1;
7794   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7795     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7796   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7797     memory_map[n]=-1;
7798   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7799   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7800   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7801 }
7802
7803 void new_dynarec_init()
7804 {
7805   printf("Init new dynarec\n");
7806   out=(u_char *)BASE_ADDR;
7807   if (mmap (out, 1<<TARGET_SIZE_2,
7808             PROT_READ | PROT_WRITE | PROT_EXEC,
7809             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7810             -1, 0) <= 0) {printf("mmap() failed\n");}
7811 #ifdef MUPEN64
7812   rdword=&readmem_dword;
7813   fake_pc.f.r.rs=&readmem_dword;
7814   fake_pc.f.r.rt=&readmem_dword;
7815   fake_pc.f.r.rd=&readmem_dword;
7816 #endif
7817   int n;
7818   new_dynarec_clear_full();
7819 #ifdef HOST_IMM8
7820   // Copy this into local area so we don't have to put it in every literal pool
7821   invc_ptr=invalid_code;
7822 #endif
7823 #ifdef MUPEN64
7824   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
7825     writemem[n] = write_nomem_new;
7826     writememb[n] = write_nomemb_new;
7827     writememh[n] = write_nomemh_new;
7828 #ifndef FORCE32
7829     writememd[n] = write_nomemd_new;
7830 #endif
7831     readmem[n] = read_nomem_new;
7832     readmemb[n] = read_nomemb_new;
7833     readmemh[n] = read_nomemh_new;
7834 #ifndef FORCE32
7835     readmemd[n] = read_nomemd_new;
7836 #endif
7837   }
7838   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
7839     writemem[n] = write_rdram_new;
7840     writememb[n] = write_rdramb_new;
7841     writememh[n] = write_rdramh_new;
7842 #ifndef FORCE32
7843     writememd[n] = write_rdramd_new;
7844 #endif
7845   }
7846   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
7847     writemem[n] = write_nomem_new;
7848     writememb[n] = write_nomemb_new;
7849     writememh[n] = write_nomemh_new;
7850 #ifndef FORCE32
7851     writememd[n] = write_nomemd_new;
7852 #endif
7853     readmem[n] = read_nomem_new;
7854     readmemb[n] = read_nomemb_new;
7855     readmemh[n] = read_nomemh_new;
7856 #ifndef FORCE32
7857     readmemd[n] = read_nomemd_new;
7858 #endif
7859   }
7860 #endif
7861   tlb_hacks();
7862   arch_init();
7863 }
7864
7865 void new_dynarec_cleanup()
7866 {
7867   int n;
7868   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
7869   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7870   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7871   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7872   #ifdef ROM_COPY
7873   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
7874   #endif
7875 }
7876
7877 int new_recompile_block(int addr)
7878 {
7879 /*
7880   if(addr==0x800cd050) {
7881     int block;
7882     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
7883     int n;
7884     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
7885   }
7886 */
7887   //if(Count==365117028) tracedebug=1;
7888   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7889   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7890   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7891   //if(debug) 
7892   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7893   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7894   /*if(Count>=312978186) {
7895     rlist();
7896   }*/
7897   //rlist();
7898   start = (u_int)addr&~3;
7899   //assert(((u_int)addr&1)==0);
7900 #ifdef PCSX
7901   if(!sp_in_mirror&&(signed int)(psxRegs.GPR.n.sp&0xffe00000)>0x80200000&&
7902      0x10000<=psxRegs.GPR.n.sp&&(psxRegs.GPR.n.sp&~0xe0e00000)<RAM_SIZE) {
7903     printf("SP hack enabled (%08x), @%08x\n", psxRegs.GPR.n.sp);
7904     sp_in_mirror=1;
7905   }
7906   if (Config.HLE && start == 0x80001000) // hlecall
7907   {
7908     // XXX: is this enough? Maybe check hleSoftCall?
7909     u_int beginning=(u_int)out;
7910     u_int page=get_page(start);
7911     invalid_code[start>>12]=0;
7912     emit_movimm(start,0);
7913     emit_writeword(0,(int)&pcaddr);
7914     emit_jmp((int)new_dyna_leave);
7915 #ifdef __arm__
7916     __clear_cache((void *)beginning,out);
7917 #endif
7918     ll_add(jump_in+page,start,(void *)beginning);
7919     return 0;
7920   }
7921   else if ((u_int)addr < 0x00200000 ||
7922     (0xa0000000 <= addr && addr < 0xa0200000)) {
7923     // used for BIOS calls mostly?
7924     source = (u_int *)((u_int)rdram+(start&0x1fffff));
7925     pagelimit = (addr&0xa0000000)|0x00200000;
7926   }
7927   else if (!Config.HLE && (
7928 /*    (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7929     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7930     // BIOS
7931     source = (u_int *)((u_int)psxR+(start&0x7ffff));
7932     pagelimit = (addr&0xfff00000)|0x80000;
7933   }
7934   else
7935 #endif
7936 #ifdef MUPEN64
7937   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
7938     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
7939     pagelimit = 0xa4001000;
7940   }
7941   else
7942 #endif
7943   if ((int)addr >= 0x80000000 && (int)addr < 0x80000000+RAM_SIZE) {
7944     source = (u_int *)((u_int)rdram+start-0x80000000);
7945     pagelimit = 0x80000000+RAM_SIZE;
7946   }
7947 #ifndef DISABLE_TLB
7948   else if ((signed int)addr >= (signed int)0xC0000000) {
7949     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
7950     //if(tlb_LUT_r[start>>12])
7951       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
7952     if((signed int)memory_map[start>>12]>=0) {
7953       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
7954       pagelimit=(start+4096)&0xFFFFF000;
7955       int map=memory_map[start>>12];
7956       int i;
7957       for(i=0;i<5;i++) {
7958         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
7959         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
7960       }
7961       assem_debug("pagelimit=%x\n",pagelimit);
7962       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
7963     }
7964     else {
7965       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
7966       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
7967       return -1; // Caller will invoke exception handler
7968     }
7969     //printf("source= %x\n",(int)source);
7970   }
7971 #endif
7972   else {
7973     printf("Compile at bogus memory address: %x \n", (int)addr);
7974     exit(1);
7975   }
7976
7977   /* Pass 1: disassemble */
7978   /* Pass 2: register dependencies, branch targets */
7979   /* Pass 3: register allocation */
7980   /* Pass 4: branch dependencies */
7981   /* Pass 5: pre-alloc */
7982   /* Pass 6: optimize clean/dirty state */
7983   /* Pass 7: flag 32-bit registers */
7984   /* Pass 8: assembly */
7985   /* Pass 9: linker */
7986   /* Pass 10: garbage collection / free memory */
7987
7988   int i,j;
7989   int done=0;
7990   unsigned int type,op,op2;
7991
7992   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7993   
7994   /* Pass 1 disassembly */
7995
7996   for(i=0;!done;i++) {
7997     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7998     minimum_free_regs[i]=0;
7999     opcode[i]=op=source[i]>>26;
8000     switch(op)
8001     {
8002       case 0x00: strcpy(insn[i],"special"); type=NI;
8003         op2=source[i]&0x3f;
8004         switch(op2)
8005         {
8006           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
8007           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
8008           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
8009           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
8010           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
8011           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
8012           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
8013           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
8014           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
8015           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
8016           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
8017           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
8018           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
8019           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
8020           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
8021           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
8022           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
8023           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
8024           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
8025           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
8026           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
8027           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
8028           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
8029           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
8030           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
8031           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
8032           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
8033           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
8034           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
8035           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
8036           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
8037           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
8038           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
8039           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
8040           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
8041           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
8042           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
8043           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
8044           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
8045           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
8046           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
8047           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
8048           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
8049           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
8050           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
8051           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
8052           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
8053           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
8054           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
8055           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
8056           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
8057           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
8058         }
8059         break;
8060       case 0x01: strcpy(insn[i],"regimm"); type=NI;
8061         op2=(source[i]>>16)&0x1f;
8062         switch(op2)
8063         {
8064           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
8065           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
8066           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
8067           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
8068           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
8069           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
8070           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
8071           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
8072           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
8073           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
8074           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
8075           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
8076           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
8077           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
8078         }
8079         break;
8080       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
8081       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
8082       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
8083       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
8084       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
8085       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
8086       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
8087       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
8088       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
8089       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
8090       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
8091       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
8092       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
8093       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
8094       case 0x10: strcpy(insn[i],"cop0"); type=NI;
8095         op2=(source[i]>>21)&0x1f;
8096         switch(op2)
8097         {
8098           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
8099           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
8100           case 0x10: strcpy(insn[i],"tlb"); type=NI;
8101           switch(source[i]&0x3f)
8102           {
8103             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
8104             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
8105             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
8106             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
8107 #ifdef PCSX
8108             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
8109 #else
8110             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
8111 #endif
8112           }
8113         }
8114         break;
8115       case 0x11: strcpy(insn[i],"cop1"); type=NI;
8116         op2=(source[i]>>21)&0x1f;
8117         switch(op2)
8118         {
8119           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
8120           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
8121           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
8122           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
8123           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
8124           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
8125           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
8126           switch((source[i]>>16)&0x3)
8127           {
8128             case 0x00: strcpy(insn[i],"BC1F"); break;
8129             case 0x01: strcpy(insn[i],"BC1T"); break;
8130             case 0x02: strcpy(insn[i],"BC1FL"); break;
8131             case 0x03: strcpy(insn[i],"BC1TL"); break;
8132           }
8133           break;
8134           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
8135           switch(source[i]&0x3f)
8136           {
8137             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
8138             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
8139             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
8140             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
8141             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
8142             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
8143             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
8144             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
8145             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
8146             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
8147             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
8148             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
8149             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
8150             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
8151             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
8152             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
8153             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
8154             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
8155             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
8156             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
8157             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
8158             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
8159             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
8160             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
8161             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
8162             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
8163             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
8164             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
8165             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
8166             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
8167             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
8168             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
8169             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
8170             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
8171             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
8172           }
8173           break;
8174           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
8175           switch(source[i]&0x3f)
8176           {
8177             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
8178             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
8179             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
8180             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
8181             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
8182             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
8183             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
8184             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
8185             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
8186             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
8187             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
8188             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
8189             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
8190             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
8191             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
8192             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
8193             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
8194             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
8195             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
8196             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
8197             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
8198             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
8199             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
8200             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
8201             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
8202             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
8203             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
8204             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
8205             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
8206             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
8207             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
8208             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
8209             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
8210             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
8211             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
8212           }
8213           break;
8214           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
8215           switch(source[i]&0x3f)
8216           {
8217             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
8218             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
8219           }
8220           break;
8221           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
8222           switch(source[i]&0x3f)
8223           {
8224             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
8225             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
8226           }
8227           break;
8228         }
8229         break;
8230 #ifndef FORCE32
8231       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
8232       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
8233       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
8234       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
8235       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
8236       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
8237       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
8238       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
8239 #endif
8240       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
8241       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
8242       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
8243       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
8244       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
8245       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
8246       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
8247       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
8248       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
8249       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
8250       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
8251       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
8252 #ifndef FORCE32
8253       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
8254       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
8255 #endif
8256       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
8257       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
8258       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
8259       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
8260 #ifndef FORCE32
8261       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
8262       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
8263       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
8264 #endif
8265       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
8266       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
8267 #ifndef FORCE32
8268       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
8269       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
8270       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
8271 #endif
8272 #ifdef PCSX
8273       case 0x12: strcpy(insn[i],"COP2"); type=NI;
8274         // note: COP MIPS-1 encoding differs from MIPS32
8275         op2=(source[i]>>21)&0x1f;
8276         if (source[i]&0x3f) {
8277           if (gte_handlers[source[i]&0x3f]!=NULL) {
8278             snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
8279             type=C2OP;
8280           }
8281         }
8282         else switch(op2)
8283         {
8284           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
8285           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
8286           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
8287           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
8288         }
8289         break;
8290       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
8291       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
8292       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
8293 #endif
8294       default: strcpy(insn[i],"???"); type=NI;
8295         printf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
8296         break;
8297     }
8298     itype[i]=type;
8299     opcode2[i]=op2;
8300     /* Get registers/immediates */
8301     lt1[i]=0;
8302     us1[i]=0;
8303     us2[i]=0;
8304     dep1[i]=0;
8305     dep2[i]=0;
8306     switch(type) {
8307       case LOAD:
8308         rs1[i]=(source[i]>>21)&0x1f;
8309         rs2[i]=0;
8310         rt1[i]=(source[i]>>16)&0x1f;
8311         rt2[i]=0;
8312         imm[i]=(short)source[i];
8313         break;
8314       case STORE:
8315       case STORELR:
8316         rs1[i]=(source[i]>>21)&0x1f;
8317         rs2[i]=(source[i]>>16)&0x1f;
8318         rt1[i]=0;
8319         rt2[i]=0;
8320         imm[i]=(short)source[i];
8321         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
8322         break;
8323       case LOADLR:
8324         // LWL/LWR only load part of the register,
8325         // therefore the target register must be treated as a source too
8326         rs1[i]=(source[i]>>21)&0x1f;
8327         rs2[i]=(source[i]>>16)&0x1f;
8328         rt1[i]=(source[i]>>16)&0x1f;
8329         rt2[i]=0;
8330         imm[i]=(short)source[i];
8331         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
8332         if(op==0x26) dep1[i]=rt1[i]; // LWR
8333         break;
8334       case IMM16:
8335         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
8336         else rs1[i]=(source[i]>>21)&0x1f;
8337         rs2[i]=0;
8338         rt1[i]=(source[i]>>16)&0x1f;
8339         rt2[i]=0;
8340         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
8341           imm[i]=(unsigned short)source[i];
8342         }else{
8343           imm[i]=(short)source[i];
8344         }
8345         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
8346         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
8347         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
8348         break;
8349       case UJUMP:
8350         rs1[i]=0;
8351         rs2[i]=0;
8352         rt1[i]=0;
8353         rt2[i]=0;
8354         // The JAL instruction writes to r31.
8355         if (op&1) {
8356           rt1[i]=31;
8357         }
8358         rs2[i]=CCREG;
8359         break;
8360       case RJUMP:
8361         rs1[i]=(source[i]>>21)&0x1f;
8362         rs2[i]=0;
8363         rt1[i]=0;
8364         rt2[i]=0;
8365         // The JALR instruction writes to rd.
8366         if (op2&1) {
8367           rt1[i]=(source[i]>>11)&0x1f;
8368         }
8369         rs2[i]=CCREG;
8370         break;
8371       case CJUMP:
8372         rs1[i]=(source[i]>>21)&0x1f;
8373         rs2[i]=(source[i]>>16)&0x1f;
8374         rt1[i]=0;
8375         rt2[i]=0;
8376         if(op&2) { // BGTZ/BLEZ
8377           rs2[i]=0;
8378         }
8379         us1[i]=rs1[i];
8380         us2[i]=rs2[i];
8381         likely[i]=op>>4;
8382         break;
8383       case SJUMP:
8384         rs1[i]=(source[i]>>21)&0x1f;
8385         rs2[i]=CCREG;
8386         rt1[i]=0;
8387         rt2[i]=0;
8388         us1[i]=rs1[i];
8389         if(op2&0x10) { // BxxAL
8390           rt1[i]=31;
8391           // NOTE: If the branch is not taken, r31 is still overwritten
8392         }
8393         likely[i]=(op2&2)>>1;
8394         break;
8395       case FJUMP:
8396         rs1[i]=FSREG;
8397         rs2[i]=CSREG;
8398         rt1[i]=0;
8399         rt2[i]=0;
8400         likely[i]=((source[i])>>17)&1;
8401         break;
8402       case ALU:
8403         rs1[i]=(source[i]>>21)&0x1f; // source
8404         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
8405         rt1[i]=(source[i]>>11)&0x1f; // destination
8406         rt2[i]=0;
8407         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
8408           us1[i]=rs1[i];us2[i]=rs2[i];
8409         }
8410         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
8411           dep1[i]=rs1[i];dep2[i]=rs2[i];
8412         }
8413         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
8414           dep1[i]=rs1[i];dep2[i]=rs2[i];
8415         }
8416         break;
8417       case MULTDIV:
8418         rs1[i]=(source[i]>>21)&0x1f; // source
8419         rs2[i]=(source[i]>>16)&0x1f; // divisor
8420         rt1[i]=HIREG;
8421         rt2[i]=LOREG;
8422         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
8423           us1[i]=rs1[i];us2[i]=rs2[i];
8424         }
8425         break;
8426       case MOV:
8427         rs1[i]=0;
8428         rs2[i]=0;
8429         rt1[i]=0;
8430         rt2[i]=0;
8431         if(op2==0x10) rs1[i]=HIREG; // MFHI
8432         if(op2==0x11) rt1[i]=HIREG; // MTHI
8433         if(op2==0x12) rs1[i]=LOREG; // MFLO
8434         if(op2==0x13) rt1[i]=LOREG; // MTLO
8435         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
8436         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
8437         dep1[i]=rs1[i];
8438         break;
8439       case SHIFT:
8440         rs1[i]=(source[i]>>16)&0x1f; // target of shift
8441         rs2[i]=(source[i]>>21)&0x1f; // shift amount
8442         rt1[i]=(source[i]>>11)&0x1f; // destination
8443         rt2[i]=0;
8444         // DSLLV/DSRLV/DSRAV are 64-bit
8445         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
8446         break;
8447       case SHIFTIMM:
8448         rs1[i]=(source[i]>>16)&0x1f;
8449         rs2[i]=0;
8450         rt1[i]=(source[i]>>11)&0x1f;
8451         rt2[i]=0;
8452         imm[i]=(source[i]>>6)&0x1f;
8453         // DSxx32 instructions
8454         if(op2>=0x3c) imm[i]|=0x20;
8455         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
8456         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
8457         break;
8458       case COP0:
8459         rs1[i]=0;
8460         rs2[i]=0;
8461         rt1[i]=0;
8462         rt2[i]=0;
8463         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
8464         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
8465         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8466         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8467         break;
8468       case COP1:
8469       case COP2:
8470         rs1[i]=0;
8471         rs2[i]=0;
8472         rt1[i]=0;
8473         rt2[i]=0;
8474         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8475         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8476         if(op2==5) us1[i]=rs1[i]; // DMTC1
8477         rs2[i]=CSREG;
8478         break;
8479       case C1LS:
8480         rs1[i]=(source[i]>>21)&0x1F;
8481         rs2[i]=CSREG;
8482         rt1[i]=0;
8483         rt2[i]=0;
8484         imm[i]=(short)source[i];
8485         break;
8486       case C2LS:
8487         rs1[i]=(source[i]>>21)&0x1F;
8488         rs2[i]=0;
8489         rt1[i]=0;
8490         rt2[i]=0;
8491         imm[i]=(short)source[i];
8492         break;
8493       case FLOAT:
8494       case FCONV:
8495         rs1[i]=0;
8496         rs2[i]=CSREG;
8497         rt1[i]=0;
8498         rt2[i]=0;
8499         break;
8500       case FCOMP:
8501         rs1[i]=FSREG;
8502         rs2[i]=CSREG;
8503         rt1[i]=FSREG;
8504         rt2[i]=0;
8505         break;
8506       case SYSCALL:
8507       case HLECALL:
8508       case INTCALL:
8509         rs1[i]=CCREG;
8510         rs2[i]=0;
8511         rt1[i]=0;
8512         rt2[i]=0;
8513         break;
8514       default:
8515         rs1[i]=0;
8516         rs2[i]=0;
8517         rt1[i]=0;
8518         rt2[i]=0;
8519     }
8520     /* Calculate branch target addresses */
8521     if(type==UJUMP)
8522       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8523     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8524       ba[i]=start+i*4+8; // Ignore never taken branch
8525     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8526       ba[i]=start+i*4+8; // Ignore never taken branch
8527     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8528       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8529     else ba[i]=-1;
8530 #ifdef PCSX
8531     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
8532       int do_in_intrp=0;
8533       // branch in delay slot?
8534       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
8535         // don't handle first branch and call interpreter if it's hit
8536         printf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
8537         do_in_intrp=1;
8538       }
8539       // basic load delay detection
8540       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
8541         int t=(ba[i-1]-start)/4;
8542         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
8543           // jump target wants DS result - potential load delay effect
8544           printf("load delay @%08x (%08x)\n", addr + i*4, addr);
8545           do_in_intrp=1;
8546           bt[t+1]=1; // expected return from interpreter
8547         }
8548         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
8549               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
8550           // v0 overwrite like this is a sign of trouble, bail out
8551           printf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
8552           do_in_intrp=1;
8553         }
8554       }
8555       if(do_in_intrp) {
8556         rs1[i-1]=CCREG;
8557         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
8558         ba[i-1]=-1;
8559         itype[i-1]=INTCALL;
8560         done=2;
8561         i--; // don't compile the DS
8562       }
8563     }
8564 #endif
8565     /* Is this the end of the block? */
8566     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8567       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
8568         done=2;
8569       }
8570       else {
8571         if(stop_after_jal) done=1;
8572         // Stop on BREAK
8573         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8574       }
8575       // Don't recompile stuff that's already compiled
8576       if(check_addr(start+i*4+4)) done=1;
8577       // Don't get too close to the limit
8578       if(i>MAXBLOCK/2) done=1;
8579     }
8580     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
8581     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
8582     if(done==2) {
8583       // Does the block continue due to a branch?
8584       for(j=i-1;j>=0;j--)
8585       {
8586         if(ba[j]==start+i*4+4) done=j=0;
8587         if(ba[j]==start+i*4+8) done=j=0;
8588       }
8589     }
8590     //assert(i<MAXBLOCK-1);
8591     if(start+i*4==pagelimit-4) done=1;
8592     assert(start+i*4<pagelimit);
8593     if (i==MAXBLOCK-1) done=1;
8594     // Stop if we're compiling junk
8595     if(itype[i]==NI&&opcode[i]==0x11) {
8596       done=stop_after_jal=1;
8597       printf("Disabled speculative precompilation\n");
8598     }
8599   }
8600   slen=i;
8601   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8602     if(start+i*4==pagelimit) {
8603       itype[i-1]=SPAN;
8604     }
8605   }
8606   assert(slen>0);
8607
8608   /* Pass 2 - Register dependencies and branch targets */
8609
8610   unneeded_registers(0,slen-1,0);
8611   
8612   /* Pass 3 - Register allocation */
8613
8614   struct regstat current; // Current register allocations/status
8615   current.is32=1;
8616   current.dirty=0;
8617   current.u=unneeded_reg[0];
8618   current.uu=unneeded_reg_upper[0];
8619   clear_all_regs(current.regmap);
8620   alloc_reg(&current,0,CCREG);
8621   dirty_reg(&current,CCREG);
8622   current.isconst=0;
8623   current.wasconst=0;
8624   int ds=0;
8625   int cc=0;
8626   int hr;
8627
8628 #ifndef FORCE32
8629   provisional_32bit();
8630 #endif
8631   if((u_int)addr&1) {
8632     // First instruction is delay slot
8633     cc=-1;
8634     bt[1]=1;
8635     ds=1;
8636     unneeded_reg[0]=1;
8637     unneeded_reg_upper[0]=1;
8638     current.regmap[HOST_BTREG]=BTREG;
8639   }
8640   
8641   for(i=0;i<slen;i++)
8642   {
8643     if(bt[i])
8644     {
8645       int hr;
8646       for(hr=0;hr<HOST_REGS;hr++)
8647       {
8648         // Is this really necessary?
8649         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8650       }
8651       current.isconst=0;
8652     }
8653     if(i>1)
8654     {
8655       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8656       {
8657         if(rs1[i-2]==0||rs2[i-2]==0)
8658         {
8659           if(rs1[i-2]) {
8660             current.is32|=1LL<<rs1[i-2];
8661             int hr=get_reg(current.regmap,rs1[i-2]|64);
8662             if(hr>=0) current.regmap[hr]=-1;
8663           }
8664           if(rs2[i-2]) {
8665             current.is32|=1LL<<rs2[i-2];
8666             int hr=get_reg(current.regmap,rs2[i-2]|64);
8667             if(hr>=0) current.regmap[hr]=-1;
8668           }
8669         }
8670       }
8671     }
8672 #ifndef FORCE32
8673     // If something jumps here with 64-bit values
8674     // then promote those registers to 64 bits
8675     if(bt[i])
8676     {
8677       uint64_t temp_is32=current.is32;
8678       for(j=i-1;j>=0;j--)
8679       {
8680         if(ba[j]==start+i*4) 
8681           temp_is32&=branch_regs[j].is32;
8682       }
8683       for(j=i;j<slen;j++)
8684       {
8685         if(ba[j]==start+i*4) 
8686           //temp_is32=1;
8687           temp_is32&=p32[j];
8688       }
8689       if(temp_is32!=current.is32) {
8690         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8691         #ifdef DESTRUCTIVE_WRITEBACK
8692         for(hr=0;hr<HOST_REGS;hr++)
8693         {
8694           int r=current.regmap[hr];
8695           if(r>0&&r<64)
8696           {
8697             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8698               temp_is32|=1LL<<r;
8699               //printf("restore %d\n",r);
8700             }
8701           }
8702         }
8703         #endif
8704         current.is32=temp_is32;
8705       }
8706     }
8707 #else
8708     current.is32=-1LL;
8709 #endif
8710
8711     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8712     regs[i].wasconst=current.isconst;
8713     regs[i].was32=current.is32;
8714     regs[i].wasdirty=current.dirty;
8715     #if defined(DESTRUCTIVE_WRITEBACK) && !defined(FORCE32)
8716     // To change a dirty register from 32 to 64 bits, we must write
8717     // it out during the previous cycle (for branches, 2 cycles)
8718     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8719     {
8720       uint64_t temp_is32=current.is32;
8721       for(j=i-1;j>=0;j--)
8722       {
8723         if(ba[j]==start+i*4+4) 
8724           temp_is32&=branch_regs[j].is32;
8725       }
8726       for(j=i;j<slen;j++)
8727       {
8728         if(ba[j]==start+i*4+4) 
8729           //temp_is32=1;
8730           temp_is32&=p32[j];
8731       }
8732       if(temp_is32!=current.is32) {
8733         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8734         for(hr=0;hr<HOST_REGS;hr++)
8735         {
8736           int r=current.regmap[hr];
8737           if(r>0)
8738           {
8739             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8740               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
8741               {
8742                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
8743                 {
8744                   //printf("dump %d/r%d\n",hr,r);
8745                   current.regmap[hr]=-1;
8746                   if(get_reg(current.regmap,r|64)>=0) 
8747                     current.regmap[get_reg(current.regmap,r|64)]=-1;
8748                 }
8749               }
8750             }
8751           }
8752         }
8753       }
8754     }
8755     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
8756     {
8757       uint64_t temp_is32=current.is32;
8758       for(j=i-1;j>=0;j--)
8759       {
8760         if(ba[j]==start+i*4+8) 
8761           temp_is32&=branch_regs[j].is32;
8762       }
8763       for(j=i;j<slen;j++)
8764       {
8765         if(ba[j]==start+i*4+8) 
8766           //temp_is32=1;
8767           temp_is32&=p32[j];
8768       }
8769       if(temp_is32!=current.is32) {
8770         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8771         for(hr=0;hr<HOST_REGS;hr++)
8772         {
8773           int r=current.regmap[hr];
8774           if(r>0)
8775           {
8776             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8777               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
8778               {
8779                 //printf("dump %d/r%d\n",hr,r);
8780                 current.regmap[hr]=-1;
8781                 if(get_reg(current.regmap,r|64)>=0) 
8782                   current.regmap[get_reg(current.regmap,r|64)]=-1;
8783               }
8784             }
8785           }
8786         }
8787       }
8788     }
8789     #endif
8790     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8791       if(i+1<slen) {
8792         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8793         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8794         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8795         current.u|=1;
8796         current.uu|=1;
8797       } else {
8798         current.u=1;
8799         current.uu=1;
8800       }
8801     } else {
8802       if(i+1<slen) {
8803         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8804         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8805         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8806         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8807         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8808         current.u|=1;
8809         current.uu|=1;
8810       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
8811     }
8812     is_ds[i]=ds;
8813     if(ds) {
8814       ds=0; // Skip delay slot, already allocated as part of branch
8815       // ...but we need to alloc it in case something jumps here
8816       if(i+1<slen) {
8817         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8818         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8819       }else{
8820         current.u=branch_unneeded_reg[i-1];
8821         current.uu=branch_unneeded_reg_upper[i-1];
8822       }
8823       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8824       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8825       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8826       current.u|=1;
8827       current.uu|=1;
8828       struct regstat temp;
8829       memcpy(&temp,&current,sizeof(current));
8830       temp.wasdirty=temp.dirty;
8831       temp.was32=temp.is32;
8832       // TODO: Take into account unconditional branches, as below
8833       delayslot_alloc(&temp,i);
8834       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8835       regs[i].wasdirty=temp.wasdirty;
8836       regs[i].was32=temp.was32;
8837       regs[i].dirty=temp.dirty;
8838       regs[i].is32=temp.is32;
8839       regs[i].isconst=0;
8840       regs[i].wasconst=0;
8841       current.isconst=0;
8842       // Create entry (branch target) regmap
8843       for(hr=0;hr<HOST_REGS;hr++)
8844       {
8845         int r=temp.regmap[hr];
8846         if(r>=0) {
8847           if(r!=regmap_pre[i][hr]) {
8848             regs[i].regmap_entry[hr]=-1;
8849           }
8850           else
8851           {
8852             if(r<64){
8853               if((current.u>>r)&1) {
8854                 regs[i].regmap_entry[hr]=-1;
8855                 regs[i].regmap[hr]=-1;
8856                 //Don't clear regs in the delay slot as the branch might need them
8857                 //current.regmap[hr]=-1;
8858               }else
8859                 regs[i].regmap_entry[hr]=r;
8860             }
8861             else {
8862               if((current.uu>>(r&63))&1) {
8863                 regs[i].regmap_entry[hr]=-1;
8864                 regs[i].regmap[hr]=-1;
8865                 //Don't clear regs in the delay slot as the branch might need them
8866                 //current.regmap[hr]=-1;
8867               }else
8868                 regs[i].regmap_entry[hr]=r;
8869             }
8870           }
8871         } else {
8872           // First instruction expects CCREG to be allocated
8873           if(i==0&&hr==HOST_CCREG) 
8874             regs[i].regmap_entry[hr]=CCREG;
8875           else
8876             regs[i].regmap_entry[hr]=-1;
8877         }
8878       }
8879     }
8880     else { // Not delay slot
8881       switch(itype[i]) {
8882         case UJUMP:
8883           //current.isconst=0; // DEBUG
8884           //current.wasconst=0; // DEBUG
8885           //regs[i].wasconst=0; // DEBUG
8886           clear_const(&current,rt1[i]);
8887           alloc_cc(&current,i);
8888           dirty_reg(&current,CCREG);
8889           if (rt1[i]==31) {
8890             alloc_reg(&current,i,31);
8891             dirty_reg(&current,31);
8892             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8893             //assert(rt1[i+1]!=rt1[i]);
8894             #ifdef REG_PREFETCH
8895             alloc_reg(&current,i,PTEMP);
8896             #endif
8897             //current.is32|=1LL<<rt1[i];
8898           }
8899           ooo[i]=1;
8900           delayslot_alloc(&current,i+1);
8901           //current.isconst=0; // DEBUG
8902           ds=1;
8903           //printf("i=%d, isconst=%x\n",i,current.isconst);
8904           break;
8905         case RJUMP:
8906           //current.isconst=0;
8907           //current.wasconst=0;
8908           //regs[i].wasconst=0;
8909           clear_const(&current,rs1[i]);
8910           clear_const(&current,rt1[i]);
8911           alloc_cc(&current,i);
8912           dirty_reg(&current,CCREG);
8913           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8914             alloc_reg(&current,i,rs1[i]);
8915             if (rt1[i]!=0) {
8916               alloc_reg(&current,i,rt1[i]);
8917               dirty_reg(&current,rt1[i]);
8918               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8919               assert(rt1[i+1]!=rt1[i]);
8920               #ifdef REG_PREFETCH
8921               alloc_reg(&current,i,PTEMP);
8922               #endif
8923             }
8924             #ifdef USE_MINI_HT
8925             if(rs1[i]==31) { // JALR
8926               alloc_reg(&current,i,RHASH);
8927               #ifndef HOST_IMM_ADDR32
8928               alloc_reg(&current,i,RHTBL);
8929               #endif
8930             }
8931             #endif
8932             delayslot_alloc(&current,i+1);
8933           } else {
8934             // The delay slot overwrites our source register,
8935             // allocate a temporary register to hold the old value.
8936             current.isconst=0;
8937             current.wasconst=0;
8938             regs[i].wasconst=0;
8939             delayslot_alloc(&current,i+1);
8940             current.isconst=0;
8941             alloc_reg(&current,i,RTEMP);
8942           }
8943           //current.isconst=0; // DEBUG
8944           ooo[i]=1;
8945           ds=1;
8946           break;
8947         case CJUMP:
8948           //current.isconst=0;
8949           //current.wasconst=0;
8950           //regs[i].wasconst=0;
8951           clear_const(&current,rs1[i]);
8952           clear_const(&current,rs2[i]);
8953           if((opcode[i]&0x3E)==4) // BEQ/BNE
8954           {
8955             alloc_cc(&current,i);
8956             dirty_reg(&current,CCREG);
8957             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8958             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8959             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8960             {
8961               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8962               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8963             }
8964             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8965                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8966               // The delay slot overwrites one of our conditions.
8967               // Allocate the branch condition registers instead.
8968               current.isconst=0;
8969               current.wasconst=0;
8970               regs[i].wasconst=0;
8971               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8972               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8973               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8974               {
8975                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8976                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8977               }
8978             }
8979             else
8980             {
8981               ooo[i]=1;
8982               delayslot_alloc(&current,i+1);
8983             }
8984           }
8985           else
8986           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8987           {
8988             alloc_cc(&current,i);
8989             dirty_reg(&current,CCREG);
8990             alloc_reg(&current,i,rs1[i]);
8991             if(!(current.is32>>rs1[i]&1))
8992             {
8993               alloc_reg64(&current,i,rs1[i]);
8994             }
8995             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8996               // The delay slot overwrites one of our conditions.
8997               // Allocate the branch condition registers instead.
8998               current.isconst=0;
8999               current.wasconst=0;
9000               regs[i].wasconst=0;
9001               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9002               if(!((current.is32>>rs1[i])&1))
9003               {
9004                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9005               }
9006             }
9007             else
9008             {
9009               ooo[i]=1;
9010               delayslot_alloc(&current,i+1);
9011             }
9012           }
9013           else
9014           // Don't alloc the delay slot yet because we might not execute it
9015           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
9016           {
9017             current.isconst=0;
9018             current.wasconst=0;
9019             regs[i].wasconst=0;
9020             alloc_cc(&current,i);
9021             dirty_reg(&current,CCREG);
9022             alloc_reg(&current,i,rs1[i]);
9023             alloc_reg(&current,i,rs2[i]);
9024             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9025             {
9026               alloc_reg64(&current,i,rs1[i]);
9027               alloc_reg64(&current,i,rs2[i]);
9028             }
9029           }
9030           else
9031           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
9032           {
9033             current.isconst=0;
9034             current.wasconst=0;
9035             regs[i].wasconst=0;
9036             alloc_cc(&current,i);
9037             dirty_reg(&current,CCREG);
9038             alloc_reg(&current,i,rs1[i]);
9039             if(!(current.is32>>rs1[i]&1))
9040             {
9041               alloc_reg64(&current,i,rs1[i]);
9042             }
9043           }
9044           ds=1;
9045           //current.isconst=0;
9046           break;
9047         case SJUMP:
9048           //current.isconst=0;
9049           //current.wasconst=0;
9050           //regs[i].wasconst=0;
9051           clear_const(&current,rs1[i]);
9052           clear_const(&current,rt1[i]);
9053           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
9054           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
9055           {
9056             alloc_cc(&current,i);
9057             dirty_reg(&current,CCREG);
9058             alloc_reg(&current,i,rs1[i]);
9059             if(!(current.is32>>rs1[i]&1))
9060             {
9061               alloc_reg64(&current,i,rs1[i]);
9062             }
9063             if (rt1[i]==31) { // BLTZAL/BGEZAL
9064               alloc_reg(&current,i,31);
9065               dirty_reg(&current,31);
9066               //#ifdef REG_PREFETCH
9067               //alloc_reg(&current,i,PTEMP);
9068               //#endif
9069               //current.is32|=1LL<<rt1[i];
9070             }
9071             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
9072                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
9073               // Allocate the branch condition registers instead.
9074               current.isconst=0;
9075               current.wasconst=0;
9076               regs[i].wasconst=0;
9077               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9078               if(!((current.is32>>rs1[i])&1))
9079               {
9080                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9081               }
9082             }
9083             else
9084             {
9085               ooo[i]=1;
9086               delayslot_alloc(&current,i+1);
9087             }
9088           }
9089           else
9090           // Don't alloc the delay slot yet because we might not execute it
9091           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
9092           {
9093             current.isconst=0;
9094             current.wasconst=0;
9095             regs[i].wasconst=0;
9096             alloc_cc(&current,i);
9097             dirty_reg(&current,CCREG);
9098             alloc_reg(&current,i,rs1[i]);
9099             if(!(current.is32>>rs1[i]&1))
9100             {
9101               alloc_reg64(&current,i,rs1[i]);
9102             }
9103           }
9104           ds=1;
9105           //current.isconst=0;
9106           break;
9107         case FJUMP:
9108           current.isconst=0;
9109           current.wasconst=0;
9110           regs[i].wasconst=0;
9111           if(likely[i]==0) // BC1F/BC1T
9112           {
9113             // TODO: Theoretically we can run out of registers here on x86.
9114             // The delay slot can allocate up to six, and we need to check
9115             // CSREG before executing the delay slot.  Possibly we can drop
9116             // the cycle count and then reload it after checking that the
9117             // FPU is in a usable state, or don't do out-of-order execution.
9118             alloc_cc(&current,i);
9119             dirty_reg(&current,CCREG);
9120             alloc_reg(&current,i,FSREG);
9121             alloc_reg(&current,i,CSREG);
9122             if(itype[i+1]==FCOMP) {
9123               // The delay slot overwrites the branch condition.
9124               // Allocate the branch condition registers instead.
9125               alloc_cc(&current,i);
9126               dirty_reg(&current,CCREG);
9127               alloc_reg(&current,i,CSREG);
9128               alloc_reg(&current,i,FSREG);
9129             }
9130             else {
9131               ooo[i]=1;
9132               delayslot_alloc(&current,i+1);
9133               alloc_reg(&current,i+1,CSREG);
9134             }
9135           }
9136           else
9137           // Don't alloc the delay slot yet because we might not execute it
9138           if(likely[i]) // BC1FL/BC1TL
9139           {
9140             alloc_cc(&current,i);
9141             dirty_reg(&current,CCREG);
9142             alloc_reg(&current,i,CSREG);
9143             alloc_reg(&current,i,FSREG);
9144           }
9145           ds=1;
9146           current.isconst=0;
9147           break;
9148         case IMM16:
9149           imm16_alloc(&current,i);
9150           break;
9151         case LOAD:
9152         case LOADLR:
9153           load_alloc(&current,i);
9154           break;
9155         case STORE:
9156         case STORELR:
9157           store_alloc(&current,i);
9158           break;
9159         case ALU:
9160           alu_alloc(&current,i);
9161           break;
9162         case SHIFT:
9163           shift_alloc(&current,i);
9164           break;
9165         case MULTDIV:
9166           multdiv_alloc(&current,i);
9167           break;
9168         case SHIFTIMM:
9169           shiftimm_alloc(&current,i);
9170           break;
9171         case MOV:
9172           mov_alloc(&current,i);
9173           break;
9174         case COP0:
9175           cop0_alloc(&current,i);
9176           break;
9177         case COP1:
9178         case COP2:
9179           cop1_alloc(&current,i);
9180           break;
9181         case C1LS:
9182           c1ls_alloc(&current,i);
9183           break;
9184         case C2LS:
9185           c2ls_alloc(&current,i);
9186           break;
9187         case C2OP:
9188           c2op_alloc(&current,i);
9189           break;
9190         case FCONV:
9191           fconv_alloc(&current,i);
9192           break;
9193         case FLOAT:
9194           float_alloc(&current,i);
9195           break;
9196         case FCOMP:
9197           fcomp_alloc(&current,i);
9198           break;
9199         case SYSCALL:
9200         case HLECALL:
9201         case INTCALL:
9202           syscall_alloc(&current,i);
9203           break;
9204         case SPAN:
9205           pagespan_alloc(&current,i);
9206           break;
9207       }
9208       
9209       // Drop the upper half of registers that have become 32-bit
9210       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
9211       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9212         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9213         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9214         current.uu|=1;
9215       } else {
9216         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
9217         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9218         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9219         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9220         current.uu|=1;
9221       }
9222
9223       // Create entry (branch target) regmap
9224       for(hr=0;hr<HOST_REGS;hr++)
9225       {
9226         int r,or,er;
9227         r=current.regmap[hr];
9228         if(r>=0) {
9229           if(r!=regmap_pre[i][hr]) {
9230             // TODO: delay slot (?)
9231             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
9232             if(or<0||(r&63)>=TEMPREG){
9233               regs[i].regmap_entry[hr]=-1;
9234             }
9235             else
9236             {
9237               // Just move it to a different register
9238               regs[i].regmap_entry[hr]=r;
9239               // If it was dirty before, it's still dirty
9240               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
9241             }
9242           }
9243           else
9244           {
9245             // Unneeded
9246             if(r==0){
9247               regs[i].regmap_entry[hr]=0;
9248             }
9249             else
9250             if(r<64){
9251               if((current.u>>r)&1) {
9252                 regs[i].regmap_entry[hr]=-1;
9253                 //regs[i].regmap[hr]=-1;
9254                 current.regmap[hr]=-1;
9255               }else
9256                 regs[i].regmap_entry[hr]=r;
9257             }
9258             else {
9259               if((current.uu>>(r&63))&1) {
9260                 regs[i].regmap_entry[hr]=-1;
9261                 //regs[i].regmap[hr]=-1;
9262                 current.regmap[hr]=-1;
9263               }else
9264                 regs[i].regmap_entry[hr]=r;
9265             }
9266           }
9267         } else {
9268           // Branches expect CCREG to be allocated at the target
9269           if(regmap_pre[i][hr]==CCREG) 
9270             regs[i].regmap_entry[hr]=CCREG;
9271           else
9272             regs[i].regmap_entry[hr]=-1;
9273         }
9274       }
9275       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
9276     }
9277     /* Branch post-alloc */
9278     if(i>0)
9279     {
9280       current.was32=current.is32;
9281       current.wasdirty=current.dirty;
9282       switch(itype[i-1]) {
9283         case UJUMP:
9284           memcpy(&branch_regs[i-1],&current,sizeof(current));
9285           branch_regs[i-1].isconst=0;
9286           branch_regs[i-1].wasconst=0;
9287           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9288           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9289           alloc_cc(&branch_regs[i-1],i-1);
9290           dirty_reg(&branch_regs[i-1],CCREG);
9291           if(rt1[i-1]==31) { // JAL
9292             alloc_reg(&branch_regs[i-1],i-1,31);
9293             dirty_reg(&branch_regs[i-1],31);
9294             branch_regs[i-1].is32|=1LL<<31;
9295           }
9296           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9297           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9298           break;
9299         case RJUMP:
9300           memcpy(&branch_regs[i-1],&current,sizeof(current));
9301           branch_regs[i-1].isconst=0;
9302           branch_regs[i-1].wasconst=0;
9303           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9304           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9305           alloc_cc(&branch_regs[i-1],i-1);
9306           dirty_reg(&branch_regs[i-1],CCREG);
9307           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
9308           if(rt1[i-1]!=0) { // JALR
9309             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
9310             dirty_reg(&branch_regs[i-1],rt1[i-1]);
9311             branch_regs[i-1].is32|=1LL<<rt1[i-1];
9312           }
9313           #ifdef USE_MINI_HT
9314           if(rs1[i-1]==31) { // JALR
9315             alloc_reg(&branch_regs[i-1],i-1,RHASH);
9316             #ifndef HOST_IMM_ADDR32
9317             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
9318             #endif
9319           }
9320           #endif
9321           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9322           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9323           break;
9324         case CJUMP:
9325           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
9326           {
9327             alloc_cc(&current,i-1);
9328             dirty_reg(&current,CCREG);
9329             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
9330                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
9331               // The delay slot overwrote one of our conditions
9332               // Delay slot goes after the test (in order)
9333               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9334               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9335               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9336               current.u|=1;
9337               current.uu|=1;
9338               delayslot_alloc(&current,i);
9339               current.isconst=0;
9340             }
9341             else
9342             {
9343               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9344               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9345               // Alloc the branch condition registers
9346               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
9347               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
9348               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
9349               {
9350                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
9351                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
9352               }
9353             }
9354             memcpy(&branch_regs[i-1],&current,sizeof(current));
9355             branch_regs[i-1].isconst=0;
9356             branch_regs[i-1].wasconst=0;
9357             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9358             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9359           }
9360           else
9361           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
9362           {
9363             alloc_cc(&current,i-1);
9364             dirty_reg(&current,CCREG);
9365             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9366               // The delay slot overwrote the branch condition
9367               // Delay slot goes after the test (in order)
9368               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9369               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9370               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9371               current.u|=1;
9372               current.uu|=1;
9373               delayslot_alloc(&current,i);
9374               current.isconst=0;
9375             }
9376             else
9377             {
9378               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9379               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9380               // Alloc the branch condition register
9381               alloc_reg(&current,i-1,rs1[i-1]);
9382               if(!(current.is32>>rs1[i-1]&1))
9383               {
9384                 alloc_reg64(&current,i-1,rs1[i-1]);
9385               }
9386             }
9387             memcpy(&branch_regs[i-1],&current,sizeof(current));
9388             branch_regs[i-1].isconst=0;
9389             branch_regs[i-1].wasconst=0;
9390             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9391             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9392           }
9393           else
9394           // Alloc the delay slot in case the branch is taken
9395           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
9396           {
9397             memcpy(&branch_regs[i-1],&current,sizeof(current));
9398             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9399             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9400             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9401             alloc_cc(&branch_regs[i-1],i);
9402             dirty_reg(&branch_regs[i-1],CCREG);
9403             delayslot_alloc(&branch_regs[i-1],i);
9404             branch_regs[i-1].isconst=0;
9405             alloc_reg(&current,i,CCREG); // Not taken path
9406             dirty_reg(&current,CCREG);
9407             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9408           }
9409           else
9410           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
9411           {
9412             memcpy(&branch_regs[i-1],&current,sizeof(current));
9413             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9414             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9415             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9416             alloc_cc(&branch_regs[i-1],i);
9417             dirty_reg(&branch_regs[i-1],CCREG);
9418             delayslot_alloc(&branch_regs[i-1],i);
9419             branch_regs[i-1].isconst=0;
9420             alloc_reg(&current,i,CCREG); // Not taken path
9421             dirty_reg(&current,CCREG);
9422             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9423           }
9424           break;
9425         case SJUMP:
9426           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
9427           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
9428           {
9429             alloc_cc(&current,i-1);
9430             dirty_reg(&current,CCREG);
9431             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9432               // The delay slot overwrote the branch condition
9433               // Delay slot goes after the test (in order)
9434               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9435               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9436               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9437               current.u|=1;
9438               current.uu|=1;
9439               delayslot_alloc(&current,i);
9440               current.isconst=0;
9441             }
9442             else
9443             {
9444               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9445               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9446               // Alloc the branch condition register
9447               alloc_reg(&current,i-1,rs1[i-1]);
9448               if(!(current.is32>>rs1[i-1]&1))
9449               {
9450                 alloc_reg64(&current,i-1,rs1[i-1]);
9451               }
9452             }
9453             memcpy(&branch_regs[i-1],&current,sizeof(current));
9454             branch_regs[i-1].isconst=0;
9455             branch_regs[i-1].wasconst=0;
9456             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9457             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9458           }
9459           else
9460           // Alloc the delay slot in case the branch is taken
9461           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
9462           {
9463             memcpy(&branch_regs[i-1],&current,sizeof(current));
9464             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9465             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9466             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9467             alloc_cc(&branch_regs[i-1],i);
9468             dirty_reg(&branch_regs[i-1],CCREG);
9469             delayslot_alloc(&branch_regs[i-1],i);
9470             branch_regs[i-1].isconst=0;
9471             alloc_reg(&current,i,CCREG); // Not taken path
9472             dirty_reg(&current,CCREG);
9473             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9474           }
9475           // FIXME: BLTZAL/BGEZAL
9476           if(opcode2[i-1]&0x10) { // BxxZAL
9477             alloc_reg(&branch_regs[i-1],i-1,31);
9478             dirty_reg(&branch_regs[i-1],31);
9479             branch_regs[i-1].is32|=1LL<<31;
9480           }
9481           break;
9482         case FJUMP:
9483           if(likely[i-1]==0) // BC1F/BC1T
9484           {
9485             alloc_cc(&current,i-1);
9486             dirty_reg(&current,CCREG);
9487             if(itype[i]==FCOMP) {
9488               // The delay slot overwrote the branch condition
9489               // Delay slot goes after the test (in order)
9490               delayslot_alloc(&current,i);
9491               current.isconst=0;
9492             }
9493             else
9494             {
9495               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9496               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9497               // Alloc the branch condition register
9498               alloc_reg(&current,i-1,FSREG);
9499             }
9500             memcpy(&branch_regs[i-1],&current,sizeof(current));
9501             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9502           }
9503           else // BC1FL/BC1TL
9504           {
9505             // Alloc the delay slot in case the branch is taken
9506             memcpy(&branch_regs[i-1],&current,sizeof(current));
9507             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9508             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9509             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9510             alloc_cc(&branch_regs[i-1],i);
9511             dirty_reg(&branch_regs[i-1],CCREG);
9512             delayslot_alloc(&branch_regs[i-1],i);
9513             branch_regs[i-1].isconst=0;
9514             alloc_reg(&current,i,CCREG); // Not taken path
9515             dirty_reg(&current,CCREG);
9516             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9517           }
9518           break;
9519       }
9520
9521       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
9522       {
9523         if(rt1[i-1]==31) // JAL/JALR
9524         {
9525           // Subroutine call will return here, don't alloc any registers
9526           current.is32=1;
9527           current.dirty=0;
9528           clear_all_regs(current.regmap);
9529           alloc_reg(&current,i,CCREG);
9530           dirty_reg(&current,CCREG);
9531         }
9532         else if(i+1<slen)
9533         {
9534           // Internal branch will jump here, match registers to caller
9535           current.is32=0x3FFFFFFFFLL;
9536           current.dirty=0;
9537           clear_all_regs(current.regmap);
9538           alloc_reg(&current,i,CCREG);
9539           dirty_reg(&current,CCREG);
9540           for(j=i-1;j>=0;j--)
9541           {
9542             if(ba[j]==start+i*4+4) {
9543               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9544               current.is32=branch_regs[j].is32;
9545               current.dirty=branch_regs[j].dirty;
9546               break;
9547             }
9548           }
9549           while(j>=0) {
9550             if(ba[j]==start+i*4+4) {
9551               for(hr=0;hr<HOST_REGS;hr++) {
9552                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9553                   current.regmap[hr]=-1;
9554                 }
9555                 current.is32&=branch_regs[j].is32;
9556                 current.dirty&=branch_regs[j].dirty;
9557               }
9558             }
9559             j--;
9560           }
9561         }
9562       }
9563     }
9564
9565     // Count cycles in between branches
9566     ccadj[i]=cc;
9567     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
9568     {
9569       cc=0;
9570     }
9571 #ifdef PCSX
9572     else if(/*itype[i]==LOAD||*/itype[i]==STORE||itype[i]==C1LS) // load causes weird timing issues
9573     {
9574       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
9575     }
9576     else if(itype[i]==C2LS)
9577     {
9578       cc+=4;
9579     }
9580 #endif
9581     else
9582     {
9583       cc++;
9584     }
9585
9586     flush_dirty_uppers(&current);
9587     if(!is_ds[i]) {
9588       regs[i].is32=current.is32;
9589       regs[i].dirty=current.dirty;
9590       regs[i].isconst=current.isconst;
9591       memcpy(constmap[i],current.constmap,sizeof(current.constmap));
9592     }
9593     for(hr=0;hr<HOST_REGS;hr++) {
9594       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9595         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9596           regs[i].wasconst&=~(1<<hr);
9597         }
9598       }
9599     }
9600     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9601   }
9602   
9603   /* Pass 4 - Cull unused host registers */
9604   
9605   uint64_t nr=0;
9606   
9607   for (i=slen-1;i>=0;i--)
9608   {
9609     int hr;
9610     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9611     {
9612       if(ba[i]<start || ba[i]>=(start+slen*4))
9613       {
9614         // Branch out of this block, don't need anything
9615         nr=0;
9616       }
9617       else
9618       {
9619         // Internal branch
9620         // Need whatever matches the target
9621         nr=0;
9622         int t=(ba[i]-start)>>2;
9623         for(hr=0;hr<HOST_REGS;hr++)
9624         {
9625           if(regs[i].regmap_entry[hr]>=0) {
9626             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9627           }
9628         }
9629       }
9630       // Conditional branch may need registers for following instructions
9631       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9632       {
9633         if(i<slen-2) {
9634           nr|=needed_reg[i+2];
9635           for(hr=0;hr<HOST_REGS;hr++)
9636           {
9637             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9638             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9639           }
9640         }
9641       }
9642       // Don't need stuff which is overwritten
9643       if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9644       if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9645       // Merge in delay slot
9646       for(hr=0;hr<HOST_REGS;hr++)
9647       {
9648         if(!likely[i]) {
9649           // These are overwritten unless the branch is "likely"
9650           // and the delay slot is nullified if not taken
9651           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9652           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9653         }
9654         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9655         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9656         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9657         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9658         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9659         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9660         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9661         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9662         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9663           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9664           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9665         }
9666         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9667           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9668           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9669         }
9670         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
9671           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9672           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9673         }
9674       }
9675     }
9676     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
9677     {
9678       // SYSCALL instruction (software interrupt)
9679       nr=0;
9680     }
9681     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9682     {
9683       // ERET instruction (return from interrupt)
9684       nr=0;
9685     }
9686     else // Non-branch
9687     {
9688       if(i<slen-1) {
9689         for(hr=0;hr<HOST_REGS;hr++) {
9690           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9691           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9692           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9693           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9694         }
9695       }
9696     }
9697     for(hr=0;hr<HOST_REGS;hr++)
9698     {
9699       // Overwritten registers are not needed
9700       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9701       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9702       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9703       // Source registers are needed
9704       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9705       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9706       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9707       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9708       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9709       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9710       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9711       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9712       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9713         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9714         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9715       }
9716       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9717         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9718         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9719       }
9720       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
9721         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9722         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9723       }
9724       // Don't store a register immediately after writing it,
9725       // may prevent dual-issue.
9726       // But do so if this is a branch target, otherwise we
9727       // might have to load the register before the branch.
9728       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9729         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9730            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9731           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9732           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9733         }
9734         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9735            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9736           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9737           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9738         }
9739       }
9740     }
9741     // Cycle count is needed at branches.  Assume it is needed at the target too.
9742     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9743       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9744       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9745     }
9746     // Save it
9747     needed_reg[i]=nr;
9748     
9749     // Deallocate unneeded registers
9750     for(hr=0;hr<HOST_REGS;hr++)
9751     {
9752       if(!((nr>>hr)&1)) {
9753         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9754         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9755            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9756            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9757         {
9758           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9759           {
9760             if(likely[i]) {
9761               regs[i].regmap[hr]=-1;
9762               regs[i].isconst&=~(1<<hr);
9763               if(i<slen-2) regmap_pre[i+2][hr]=-1;
9764             }
9765           }
9766         }
9767         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9768         {
9769           int d1=0,d2=0,map=0,temp=0;
9770           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9771           {
9772             d1=dep1[i+1];
9773             d2=dep2[i+1];
9774           }
9775           if(using_tlb) {
9776             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
9777                itype[i+1]==STORE || itype[i+1]==STORELR ||
9778                itype[i+1]==C1LS || itype[i+1]==C2LS)
9779             map=TLREG;
9780           } else
9781           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
9782              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9783             map=INVCP;
9784           }
9785           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9786              itype[i+1]==C1LS || itype[i+1]==C2LS)
9787             temp=FTEMP;
9788           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9789              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9790              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9791              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9792              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9793              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9794              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9795              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9796              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9797              regs[i].regmap[hr]!=map )
9798           {
9799             regs[i].regmap[hr]=-1;
9800             regs[i].isconst&=~(1<<hr);
9801             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9802                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9803                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9804                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9805                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9806                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9807                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9808                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9809                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9810                branch_regs[i].regmap[hr]!=map)
9811             {
9812               branch_regs[i].regmap[hr]=-1;
9813               branch_regs[i].regmap_entry[hr]=-1;
9814               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9815               {
9816                 if(!likely[i]&&i<slen-2) {
9817                   regmap_pre[i+2][hr]=-1;
9818                 }
9819               }
9820             }
9821           }
9822         }
9823         else
9824         {
9825           // Non-branch
9826           if(i>0)
9827           {
9828             int d1=0,d2=0,map=-1,temp=-1;
9829             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9830             {
9831               d1=dep1[i];
9832               d2=dep2[i];
9833             }
9834             if(using_tlb) {
9835               if(itype[i]==LOAD || itype[i]==LOADLR ||
9836                  itype[i]==STORE || itype[i]==STORELR ||
9837                  itype[i]==C1LS || itype[i]==C2LS)
9838               map=TLREG;
9839             } else if(itype[i]==STORE || itype[i]==STORELR ||
9840                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9841               map=INVCP;
9842             }
9843             if(itype[i]==LOADLR || itype[i]==STORELR ||
9844                itype[i]==C1LS || itype[i]==C2LS)
9845               temp=FTEMP;
9846             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9847                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9848                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9849                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9850                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9851                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9852             {
9853               if(i<slen-1&&!is_ds[i]) {
9854                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9855                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9856                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9857                 {
9858                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9859                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9860                 }
9861                 regmap_pre[i+1][hr]=-1;
9862                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9863               }
9864               regs[i].regmap[hr]=-1;
9865               regs[i].isconst&=~(1<<hr);
9866             }
9867           }
9868         }
9869       }
9870     }
9871   }
9872   
9873   /* Pass 5 - Pre-allocate registers */
9874   
9875   // If a register is allocated during a loop, try to allocate it for the
9876   // entire loop, if possible.  This avoids loading/storing registers
9877   // inside of the loop.
9878
9879   signed char f_regmap[HOST_REGS];
9880   clear_all_regs(f_regmap);
9881   for(i=0;i<slen-1;i++)
9882   {
9883     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9884     {
9885       if(ba[i]>=start && ba[i]<(start+i*4)) 
9886       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9887       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9888       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9889       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9890       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9891       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9892       {
9893         int t=(ba[i]-start)>>2;
9894         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9895         if(t<2||(itype[t-2]!=UJUMP)) // call/ret assumes no registers allocated
9896         for(hr=0;hr<HOST_REGS;hr++)
9897         {
9898           if(regs[i].regmap[hr]>64) {
9899             if(!((regs[i].dirty>>hr)&1))
9900               f_regmap[hr]=regs[i].regmap[hr];
9901             else f_regmap[hr]=-1;
9902           }
9903           else if(regs[i].regmap[hr]>=0) {
9904             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9905               // dealloc old register
9906               int n;
9907               for(n=0;n<HOST_REGS;n++)
9908               {
9909                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9910               }
9911               // and alloc new one
9912               f_regmap[hr]=regs[i].regmap[hr];
9913             }
9914           }
9915           if(branch_regs[i].regmap[hr]>64) {
9916             if(!((branch_regs[i].dirty>>hr)&1))
9917               f_regmap[hr]=branch_regs[i].regmap[hr];
9918             else f_regmap[hr]=-1;
9919           }
9920           else if(branch_regs[i].regmap[hr]>=0) {
9921             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9922               // dealloc old register
9923               int n;
9924               for(n=0;n<HOST_REGS;n++)
9925               {
9926                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9927               }
9928               // and alloc new one
9929               f_regmap[hr]=branch_regs[i].regmap[hr];
9930             }
9931           }
9932           if(ooo[i]) {
9933             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) 
9934               f_regmap[hr]=branch_regs[i].regmap[hr];
9935           }else{
9936             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) 
9937               f_regmap[hr]=branch_regs[i].regmap[hr];
9938           }
9939           // Avoid dirty->clean transition
9940           #ifdef DESTRUCTIVE_WRITEBACK
9941           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9942           #endif
9943           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9944           // case above, however it's always a good idea.  We can't hoist the
9945           // load if the register was already allocated, so there's no point
9946           // wasting time analyzing most of these cases.  It only "succeeds"
9947           // when the mapping was different and the load can be replaced with
9948           // a mov, which is of negligible benefit.  So such cases are
9949           // skipped below.
9950           if(f_regmap[hr]>0) {
9951             if(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0) {
9952               int r=f_regmap[hr];
9953               for(j=t;j<=i;j++)
9954               {
9955                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9956                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9957                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9958                 if(r>63) {
9959                   // NB This can exclude the case where the upper-half
9960                   // register is lower numbered than the lower-half
9961                   // register.  Not sure if it's worth fixing...
9962                   if(get_reg(regs[j].regmap,r&63)<0) break;
9963                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9964                   if(regs[j].is32&(1LL<<(r&63))) break;
9965                 }
9966                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9967                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9968                   int k;
9969                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9970                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9971                     if(r>63) {
9972                       if(get_reg(regs[i].regmap,r&63)<0) break;
9973                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9974                     }
9975                     k=i;
9976                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9977                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9978                         //printf("no free regs for store %x\n",start+(k-1)*4);
9979                         break;
9980                       }
9981                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9982                         //printf("no-match due to different register\n");
9983                         break;
9984                       }
9985                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9986                         //printf("no-match due to branch\n");
9987                         break;
9988                       }
9989                       // call/ret fast path assumes no registers allocated
9990                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)) {
9991                         break;
9992                       }
9993                       if(r>63) {
9994                         // NB This can exclude the case where the upper-half
9995                         // register is lower numbered than the lower-half
9996                         // register.  Not sure if it's worth fixing...
9997                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9998                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9999                       }
10000                       k--;
10001                     }
10002                     if(i<slen-1) {
10003                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
10004                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
10005                         //printf("bad match after branch\n");
10006                         break;
10007                       }
10008                     }
10009                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
10010                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
10011                       while(k<i) {
10012                         regs[k].regmap_entry[hr]=f_regmap[hr];
10013                         regs[k].regmap[hr]=f_regmap[hr];
10014                         regmap_pre[k+1][hr]=f_regmap[hr];
10015                         regs[k].wasdirty&=~(1<<hr);
10016                         regs[k].dirty&=~(1<<hr);
10017                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
10018                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
10019                         regs[k].wasconst&=~(1<<hr);
10020                         regs[k].isconst&=~(1<<hr);
10021                         k++;
10022                       }
10023                     }
10024                     else {
10025                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
10026                       break;
10027                     }
10028                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
10029                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
10030                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
10031                       regs[i].regmap_entry[hr]=f_regmap[hr];
10032                       regs[i].regmap[hr]=f_regmap[hr];
10033                       regs[i].wasdirty&=~(1<<hr);
10034                       regs[i].dirty&=~(1<<hr);
10035                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
10036                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
10037                       regs[i].wasconst&=~(1<<hr);
10038                       regs[i].isconst&=~(1<<hr);
10039                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
10040                       branch_regs[i].wasdirty&=~(1<<hr);
10041                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
10042                       branch_regs[i].regmap[hr]=f_regmap[hr];
10043                       branch_regs[i].dirty&=~(1<<hr);
10044                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
10045                       branch_regs[i].wasconst&=~(1<<hr);
10046                       branch_regs[i].isconst&=~(1<<hr);
10047                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
10048                         regmap_pre[i+2][hr]=f_regmap[hr];
10049                         regs[i+2].wasdirty&=~(1<<hr);
10050                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
10051                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
10052                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
10053                       }
10054                     }
10055                   }
10056                   for(k=t;k<j;k++) {
10057                     // Alloc register clean at beginning of loop,
10058                     // but may dirty it in pass 6
10059                     regs[k].regmap_entry[hr]=f_regmap[hr];
10060                     regs[k].regmap[hr]=f_regmap[hr];
10061                     regs[k].dirty&=~(1<<hr);
10062                     regs[k].wasconst&=~(1<<hr);
10063                     regs[k].isconst&=~(1<<hr);
10064                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
10065                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
10066                       branch_regs[k].regmap[hr]=f_regmap[hr];
10067                       branch_regs[k].dirty&=~(1<<hr);
10068                       branch_regs[k].wasconst&=~(1<<hr);
10069                       branch_regs[k].isconst&=~(1<<hr);
10070                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
10071                         regmap_pre[k+2][hr]=f_regmap[hr];
10072                         regs[k+2].wasdirty&=~(1<<hr);
10073                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
10074                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
10075                       }
10076                     }
10077                     else
10078                     {
10079                       regmap_pre[k+1][hr]=f_regmap[hr];
10080                       regs[k+1].wasdirty&=~(1<<hr);
10081                     }
10082                   }
10083                   if(regs[j].regmap[hr]==f_regmap[hr])
10084                     regs[j].regmap_entry[hr]=f_regmap[hr];
10085                   break;
10086                 }
10087                 if(j==i) break;
10088                 if(regs[j].regmap[hr]>=0)
10089                   break;
10090                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
10091                   //printf("no-match due to different register\n");
10092                   break;
10093                 }
10094                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
10095                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
10096                   break;
10097                 }
10098                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10099                 {
10100                   // Stop on unconditional branch
10101                   break;
10102                 }
10103                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
10104                 {
10105                   if(ooo[j]) {
10106                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) 
10107                       break;
10108                   }else{
10109                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) 
10110                       break;
10111                   }
10112                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
10113                     //printf("no-match due to different register (branch)\n");
10114                     break;
10115                   }
10116                 }
10117                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10118                   //printf("No free regs for store %x\n",start+j*4);
10119                   break;
10120                 }
10121                 if(f_regmap[hr]>=64) {
10122                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
10123                     break;
10124                   }
10125                   else
10126                   {
10127                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
10128                       break;
10129                     }
10130                   }
10131                 }
10132               }
10133             }
10134           }
10135         }
10136       }
10137     }else{
10138       int count=0;
10139       for(hr=0;hr<HOST_REGS;hr++)
10140       {
10141         if(hr!=EXCLUDE_REG) {
10142           if(regs[i].regmap[hr]>64) {
10143             if(!((regs[i].dirty>>hr)&1))
10144               f_regmap[hr]=regs[i].regmap[hr];
10145           }
10146           else if(regs[i].regmap[hr]>=0) {
10147             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10148               // dealloc old register
10149               int n;
10150               for(n=0;n<HOST_REGS;n++)
10151               {
10152                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10153               }
10154               // and alloc new one
10155               f_regmap[hr]=regs[i].regmap[hr];
10156             }
10157           }
10158           else if(regs[i].regmap[hr]<0) count++;
10159         }
10160       }
10161       // Try to restore cycle count at branch targets
10162       if(bt[i]) {
10163         for(j=i;j<slen-1;j++) {
10164           if(regs[j].regmap[HOST_CCREG]!=-1) break;
10165           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10166             //printf("no free regs for store %x\n",start+j*4);
10167             break;
10168           }
10169         }
10170         if(regs[j].regmap[HOST_CCREG]==CCREG) {
10171           int k=i;
10172           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
10173           while(k<j) {
10174             regs[k].regmap_entry[HOST_CCREG]=CCREG;
10175             regs[k].regmap[HOST_CCREG]=CCREG;
10176             regmap_pre[k+1][HOST_CCREG]=CCREG;
10177             regs[k+1].wasdirty|=1<<HOST_CCREG;
10178             regs[k].dirty|=1<<HOST_CCREG;
10179             regs[k].wasconst&=~(1<<HOST_CCREG);
10180             regs[k].isconst&=~(1<<HOST_CCREG);
10181             k++;
10182           }
10183           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
10184         }
10185         // Work backwards from the branch target
10186         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
10187         {
10188           //printf("Extend backwards\n");
10189           int k;
10190           k=i;
10191           while(regs[k-1].regmap[HOST_CCREG]==-1) {
10192             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10193               //printf("no free regs for store %x\n",start+(k-1)*4);
10194               break;
10195             }
10196             k--;
10197           }
10198           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
10199             //printf("Extend CC, %x ->\n",start+k*4);
10200             while(k<=i) {
10201               regs[k].regmap_entry[HOST_CCREG]=CCREG;
10202               regs[k].regmap[HOST_CCREG]=CCREG;
10203               regmap_pre[k+1][HOST_CCREG]=CCREG;
10204               regs[k+1].wasdirty|=1<<HOST_CCREG;
10205               regs[k].dirty|=1<<HOST_CCREG;
10206               regs[k].wasconst&=~(1<<HOST_CCREG);
10207               regs[k].isconst&=~(1<<HOST_CCREG);
10208               k++;
10209             }
10210           }
10211           else {
10212             //printf("Fail Extend CC, %x ->\n",start+k*4);
10213           }
10214         }
10215       }
10216       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
10217          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
10218          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
10219          itype[i]!=FCONV&&itype[i]!=FCOMP)
10220       {
10221         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
10222       }
10223     }
10224   }
10225   
10226   // This allocates registers (if possible) one instruction prior
10227   // to use, which can avoid a load-use penalty on certain CPUs.
10228   for(i=0;i<slen-1;i++)
10229   {
10230     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
10231     {
10232       if(!bt[i+1])
10233       {
10234         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
10235            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
10236         {
10237           if(rs1[i+1]) {
10238             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
10239             {
10240               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10241               {
10242                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10243                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10244                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10245                 regs[i].isconst&=~(1<<hr);
10246                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10247                 constmap[i][hr]=constmap[i+1][hr];
10248                 regs[i+1].wasdirty&=~(1<<hr);
10249                 regs[i].dirty&=~(1<<hr);
10250               }
10251             }
10252           }
10253           if(rs2[i+1]) {
10254             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
10255             {
10256               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10257               {
10258                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10259                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10260                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10261                 regs[i].isconst&=~(1<<hr);
10262                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10263                 constmap[i][hr]=constmap[i+1][hr];
10264                 regs[i+1].wasdirty&=~(1<<hr);
10265                 regs[i].dirty&=~(1<<hr);
10266               }
10267             }
10268           }
10269           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10270             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10271             {
10272               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10273               {
10274                 regs[i].regmap[hr]=rs1[i+1];
10275                 regmap_pre[i+1][hr]=rs1[i+1];
10276                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10277                 regs[i].isconst&=~(1<<hr);
10278                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10279                 constmap[i][hr]=constmap[i+1][hr];
10280                 regs[i+1].wasdirty&=~(1<<hr);
10281                 regs[i].dirty&=~(1<<hr);
10282               }
10283             }
10284           }
10285           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10286             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10287             {
10288               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10289               {
10290                 regs[i].regmap[hr]=rs1[i+1];
10291                 regmap_pre[i+1][hr]=rs1[i+1];
10292                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10293                 regs[i].isconst&=~(1<<hr);
10294                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10295                 constmap[i][hr]=constmap[i+1][hr];
10296                 regs[i+1].wasdirty&=~(1<<hr);
10297                 regs[i].dirty&=~(1<<hr);
10298               }
10299             }
10300           }
10301           #ifndef HOST_IMM_ADDR32
10302           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
10303             hr=get_reg(regs[i+1].regmap,TLREG);
10304             if(hr>=0) {
10305               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
10306               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
10307                 int nr;
10308                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10309                 {
10310                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
10311                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
10312                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
10313                   regs[i].isconst&=~(1<<hr);
10314                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10315                   constmap[i][hr]=constmap[i+1][hr];
10316                   regs[i+1].wasdirty&=~(1<<hr);
10317                   regs[i].dirty&=~(1<<hr);
10318                 }
10319                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10320                 {
10321                   // move it to another register
10322                   regs[i+1].regmap[hr]=-1;
10323                   regmap_pre[i+2][hr]=-1;
10324                   regs[i+1].regmap[nr]=TLREG;
10325                   regmap_pre[i+2][nr]=TLREG;
10326                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
10327                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
10328                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
10329                   regs[i].isconst&=~(1<<nr);
10330                   regs[i+1].isconst&=~(1<<nr);
10331                   regs[i].dirty&=~(1<<nr);
10332                   regs[i+1].wasdirty&=~(1<<nr);
10333                   regs[i+1].dirty&=~(1<<nr);
10334                   regs[i+2].wasdirty&=~(1<<nr);
10335                 }
10336               }
10337             }
10338           }
10339           #endif
10340           if(itype[i+1]==STORE||itype[i+1]==STORELR
10341              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
10342             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10343               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
10344               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10345               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
10346               assert(hr>=0);
10347               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10348               {
10349                 regs[i].regmap[hr]=rs1[i+1];
10350                 regmap_pre[i+1][hr]=rs1[i+1];
10351                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10352                 regs[i].isconst&=~(1<<hr);
10353                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10354                 constmap[i][hr]=constmap[i+1][hr];
10355                 regs[i+1].wasdirty&=~(1<<hr);
10356                 regs[i].dirty&=~(1<<hr);
10357               }
10358             }
10359           }
10360           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
10361             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10362               int nr;
10363               hr=get_reg(regs[i+1].regmap,FTEMP);
10364               assert(hr>=0);
10365               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10366               {
10367                 regs[i].regmap[hr]=rs1[i+1];
10368                 regmap_pre[i+1][hr]=rs1[i+1];
10369                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10370                 regs[i].isconst&=~(1<<hr);
10371                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10372                 constmap[i][hr]=constmap[i+1][hr];
10373                 regs[i+1].wasdirty&=~(1<<hr);
10374                 regs[i].dirty&=~(1<<hr);
10375               }
10376               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10377               {
10378                 // move it to another register
10379                 regs[i+1].regmap[hr]=-1;
10380                 regmap_pre[i+2][hr]=-1;
10381                 regs[i+1].regmap[nr]=FTEMP;
10382                 regmap_pre[i+2][nr]=FTEMP;
10383                 regs[i].regmap[nr]=rs1[i+1];
10384                 regmap_pre[i+1][nr]=rs1[i+1];
10385                 regs[i+1].regmap_entry[nr]=rs1[i+1];
10386                 regs[i].isconst&=~(1<<nr);
10387                 regs[i+1].isconst&=~(1<<nr);
10388                 regs[i].dirty&=~(1<<nr);
10389                 regs[i+1].wasdirty&=~(1<<nr);
10390                 regs[i+1].dirty&=~(1<<nr);
10391                 regs[i+2].wasdirty&=~(1<<nr);
10392               }
10393             }
10394           }
10395           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||||itype[i+1]==C2LS*/) {
10396             if(itype[i+1]==LOAD) 
10397               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
10398             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
10399               hr=get_reg(regs[i+1].regmap,FTEMP);
10400             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
10401               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
10402               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10403             }
10404             if(hr>=0&&regs[i].regmap[hr]<0) {
10405               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
10406               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
10407                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
10408                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
10409                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
10410                 regs[i].isconst&=~(1<<hr);
10411                 regs[i+1].wasdirty&=~(1<<hr);
10412                 regs[i].dirty&=~(1<<hr);
10413               }
10414             }
10415           }
10416         }
10417       }
10418     }
10419   }
10420   
10421   /* Pass 6 - Optimize clean/dirty state */
10422   clean_registers(0,slen-1,1);
10423   
10424   /* Pass 7 - Identify 32-bit registers */
10425 #ifndef FORCE32
10426   provisional_r32();
10427
10428   u_int r32=0;
10429   
10430   for (i=slen-1;i>=0;i--)
10431   {
10432     int hr;
10433     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10434     {
10435       if(ba[i]<start || ba[i]>=(start+slen*4))
10436       {
10437         // Branch out of this block, don't need anything
10438         r32=0;
10439       }
10440       else
10441       {
10442         // Internal branch
10443         // Need whatever matches the target
10444         // (and doesn't get overwritten by the delay slot instruction)
10445         r32=0;
10446         int t=(ba[i]-start)>>2;
10447         if(ba[i]>start+i*4) {
10448           // Forward branch
10449           if(!(requires_32bit[t]&~regs[i].was32))
10450             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10451         }else{
10452           // Backward branch
10453           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
10454           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10455           if(!(pr32[t]&~regs[i].was32))
10456             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10457         }
10458       }
10459       // Conditional branch may need registers for following instructions
10460       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10461       {
10462         if(i<slen-2) {
10463           r32|=requires_32bit[i+2];
10464           r32&=regs[i].was32;
10465           // Mark this address as a branch target since it may be called
10466           // upon return from interrupt
10467           bt[i+2]=1;
10468         }
10469       }
10470       // Merge in delay slot
10471       if(!likely[i]) {
10472         // These are overwritten unless the branch is "likely"
10473         // and the delay slot is nullified if not taken
10474         r32&=~(1LL<<rt1[i+1]);
10475         r32&=~(1LL<<rt2[i+1]);
10476       }
10477       // Assume these are needed (delay slot)
10478       if(us1[i+1]>0)
10479       {
10480         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
10481       }
10482       if(us2[i+1]>0)
10483       {
10484         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
10485       }
10486       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
10487       {
10488         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
10489       }
10490       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
10491       {
10492         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
10493       }
10494     }
10495     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
10496     {
10497       // SYSCALL instruction (software interrupt)
10498       r32=0;
10499     }
10500     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
10501     {
10502       // ERET instruction (return from interrupt)
10503       r32=0;
10504     }
10505     // Check 32 bits
10506     r32&=~(1LL<<rt1[i]);
10507     r32&=~(1LL<<rt2[i]);
10508     if(us1[i]>0)
10509     {
10510       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
10511     }
10512     if(us2[i]>0)
10513     {
10514       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
10515     }
10516     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
10517     {
10518       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
10519     }
10520     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
10521     {
10522       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
10523     }
10524     requires_32bit[i]=r32;
10525     
10526     // Dirty registers which are 32-bit, require 32-bit input
10527     // as they will be written as 32-bit values
10528     for(hr=0;hr<HOST_REGS;hr++)
10529     {
10530       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
10531         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
10532           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
10533           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
10534         }
10535       }
10536     }
10537     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
10538   }
10539 #endif
10540
10541   if(itype[slen-1]==SPAN) {
10542     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
10543   }
10544   
10545   /* Debug/disassembly */
10546   if((void*)assem_debug==(void*)printf) 
10547   for(i=0;i<slen;i++)
10548   {
10549     printf("U:");
10550     int r;
10551     for(r=1;r<=CCREG;r++) {
10552       if((unneeded_reg[i]>>r)&1) {
10553         if(r==HIREG) printf(" HI");
10554         else if(r==LOREG) printf(" LO");
10555         else printf(" r%d",r);
10556       }
10557     }
10558 #ifndef FORCE32
10559     printf(" UU:");
10560     for(r=1;r<=CCREG;r++) {
10561       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
10562         if(r==HIREG) printf(" HI");
10563         else if(r==LOREG) printf(" LO");
10564         else printf(" r%d",r);
10565       }
10566     }
10567     printf(" 32:");
10568     for(r=0;r<=CCREG;r++) {
10569       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10570       if((regs[i].was32>>r)&1) {
10571         if(r==CCREG) printf(" CC");
10572         else if(r==HIREG) printf(" HI");
10573         else if(r==LOREG) printf(" LO");
10574         else printf(" r%d",r);
10575       }
10576     }
10577 #endif
10578     printf("\n");
10579     #if defined(__i386__) || defined(__x86_64__)
10580     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
10581     #endif
10582     #ifdef __arm__
10583     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
10584     #endif
10585     printf("needs: ");
10586     if(needed_reg[i]&1) printf("eax ");
10587     if((needed_reg[i]>>1)&1) printf("ecx ");
10588     if((needed_reg[i]>>2)&1) printf("edx ");
10589     if((needed_reg[i]>>3)&1) printf("ebx ");
10590     if((needed_reg[i]>>5)&1) printf("ebp ");
10591     if((needed_reg[i]>>6)&1) printf("esi ");
10592     if((needed_reg[i]>>7)&1) printf("edi ");
10593     printf("r:");
10594     for(r=0;r<=CCREG;r++) {
10595       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10596       if((requires_32bit[i]>>r)&1) {
10597         if(r==CCREG) printf(" CC");
10598         else if(r==HIREG) printf(" HI");
10599         else if(r==LOREG) printf(" LO");
10600         else printf(" r%d",r);
10601       }
10602     }
10603     printf("\n");
10604     /*printf("pr:");
10605     for(r=0;r<=CCREG;r++) {
10606       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10607       if((pr32[i]>>r)&1) {
10608         if(r==CCREG) printf(" CC");
10609         else if(r==HIREG) printf(" HI");
10610         else if(r==LOREG) printf(" LO");
10611         else printf(" r%d",r);
10612       }
10613     }
10614     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
10615     printf("\n");*/
10616     #if defined(__i386__) || defined(__x86_64__)
10617     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
10618     printf("dirty: ");
10619     if(regs[i].wasdirty&1) printf("eax ");
10620     if((regs[i].wasdirty>>1)&1) printf("ecx ");
10621     if((regs[i].wasdirty>>2)&1) printf("edx ");
10622     if((regs[i].wasdirty>>3)&1) printf("ebx ");
10623     if((regs[i].wasdirty>>5)&1) printf("ebp ");
10624     if((regs[i].wasdirty>>6)&1) printf("esi ");
10625     if((regs[i].wasdirty>>7)&1) printf("edi ");
10626     #endif
10627     #ifdef __arm__
10628     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
10629     printf("dirty: ");
10630     if(regs[i].wasdirty&1) printf("r0 ");
10631     if((regs[i].wasdirty>>1)&1) printf("r1 ");
10632     if((regs[i].wasdirty>>2)&1) printf("r2 ");
10633     if((regs[i].wasdirty>>3)&1) printf("r3 ");
10634     if((regs[i].wasdirty>>4)&1) printf("r4 ");
10635     if((regs[i].wasdirty>>5)&1) printf("r5 ");
10636     if((regs[i].wasdirty>>6)&1) printf("r6 ");
10637     if((regs[i].wasdirty>>7)&1) printf("r7 ");
10638     if((regs[i].wasdirty>>8)&1) printf("r8 ");
10639     if((regs[i].wasdirty>>9)&1) printf("r9 ");
10640     if((regs[i].wasdirty>>10)&1) printf("r10 ");
10641     if((regs[i].wasdirty>>12)&1) printf("r12 ");
10642     #endif
10643     printf("\n");
10644     disassemble_inst(i);
10645     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
10646     #if defined(__i386__) || defined(__x86_64__)
10647     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
10648     if(regs[i].dirty&1) printf("eax ");
10649     if((regs[i].dirty>>1)&1) printf("ecx ");
10650     if((regs[i].dirty>>2)&1) printf("edx ");
10651     if((regs[i].dirty>>3)&1) printf("ebx ");
10652     if((regs[i].dirty>>5)&1) printf("ebp ");
10653     if((regs[i].dirty>>6)&1) printf("esi ");
10654     if((regs[i].dirty>>7)&1) printf("edi ");
10655     #endif
10656     #ifdef __arm__
10657     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
10658     if(regs[i].dirty&1) printf("r0 ");
10659     if((regs[i].dirty>>1)&1) printf("r1 ");
10660     if((regs[i].dirty>>2)&1) printf("r2 ");
10661     if((regs[i].dirty>>3)&1) printf("r3 ");
10662     if((regs[i].dirty>>4)&1) printf("r4 ");
10663     if((regs[i].dirty>>5)&1) printf("r5 ");
10664     if((regs[i].dirty>>6)&1) printf("r6 ");
10665     if((regs[i].dirty>>7)&1) printf("r7 ");
10666     if((regs[i].dirty>>8)&1) printf("r8 ");
10667     if((regs[i].dirty>>9)&1) printf("r9 ");
10668     if((regs[i].dirty>>10)&1) printf("r10 ");
10669     if((regs[i].dirty>>12)&1) printf("r12 ");
10670     #endif
10671     printf("\n");
10672     if(regs[i].isconst) {
10673       printf("constants: ");
10674       #if defined(__i386__) || defined(__x86_64__)
10675       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
10676       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
10677       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
10678       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
10679       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
10680       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
10681       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
10682       #endif
10683       #ifdef __arm__
10684       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
10685       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
10686       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
10687       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
10688       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
10689       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
10690       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
10691       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
10692       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
10693       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
10694       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
10695       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
10696       #endif
10697       printf("\n");
10698     }
10699 #ifndef FORCE32
10700     printf(" 32:");
10701     for(r=0;r<=CCREG;r++) {
10702       if((regs[i].is32>>r)&1) {
10703         if(r==CCREG) printf(" CC");
10704         else if(r==HIREG) printf(" HI");
10705         else if(r==LOREG) printf(" LO");
10706         else printf(" r%d",r);
10707       }
10708     }
10709     printf("\n");
10710 #endif
10711     /*printf(" p32:");
10712     for(r=0;r<=CCREG;r++) {
10713       if((p32[i]>>r)&1) {
10714         if(r==CCREG) printf(" CC");
10715         else if(r==HIREG) printf(" HI");
10716         else if(r==LOREG) printf(" LO");
10717         else printf(" r%d",r);
10718       }
10719     }
10720     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
10721     else printf("\n");*/
10722     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10723       #if defined(__i386__) || defined(__x86_64__)
10724       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10725       if(branch_regs[i].dirty&1) printf("eax ");
10726       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10727       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10728       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10729       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10730       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10731       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10732       #endif
10733       #ifdef __arm__
10734       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10735       if(branch_regs[i].dirty&1) printf("r0 ");
10736       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10737       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10738       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10739       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10740       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10741       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10742       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10743       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10744       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10745       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10746       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10747       #endif
10748 #ifndef FORCE32
10749       printf(" 32:");
10750       for(r=0;r<=CCREG;r++) {
10751         if((branch_regs[i].is32>>r)&1) {
10752           if(r==CCREG) printf(" CC");
10753           else if(r==HIREG) printf(" HI");
10754           else if(r==LOREG) printf(" LO");
10755           else printf(" r%d",r);
10756         }
10757       }
10758       printf("\n");
10759 #endif
10760     }
10761   }
10762
10763   /* Pass 8 - Assembly */
10764   linkcount=0;stubcount=0;
10765   ds=0;is_delayslot=0;
10766   cop1_usable=0;
10767   uint64_t is32_pre=0;
10768   u_int dirty_pre=0;
10769   u_int beginning=(u_int)out;
10770   if((u_int)addr&1) {
10771     ds=1;
10772     pagespan_ds();
10773   }
10774   u_int instr_addr0_override=0;
10775
10776 #ifdef PCSX
10777   if (start == 0x80030000) {
10778     // nasty hack for fastbios thing
10779     instr_addr0_override=(u_int)out;
10780     emit_movimm(start,0);
10781     emit_readword((int)&pcaddr,1);
10782     emit_writeword(0,(int)&pcaddr);
10783     emit_cmp(0,1);
10784     emit_jne((int)new_dyna_leave);
10785   }
10786 #endif
10787   for(i=0;i<slen;i++)
10788   {
10789     //if(ds) printf("ds: ");
10790     if((void*)assem_debug==(void*)printf) disassemble_inst(i);
10791     if(ds) {
10792       ds=0; // Skip delay slot
10793       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10794       instr_addr[i]=0;
10795     } else {
10796       #ifndef DESTRUCTIVE_WRITEBACK
10797       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10798       {
10799         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
10800               unneeded_reg[i],unneeded_reg_upper[i]);
10801         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10802               unneeded_reg[i],unneeded_reg_upper[i]);
10803       }
10804       is32_pre=regs[i].is32;
10805       dirty_pre=regs[i].dirty;
10806       #endif
10807       // write back
10808       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10809       {
10810         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10811                       unneeded_reg[i],unneeded_reg_upper[i]);
10812         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10813       }
10814       // branch target entry point
10815       instr_addr[i]=(u_int)out;
10816       assem_debug("<->\n");
10817       // load regs
10818       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10819         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10820       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10821       address_generation(i,&regs[i],regs[i].regmap_entry);
10822       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10823       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10824       {
10825         // Load the delay slot registers if necessary
10826         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
10827           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10828         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
10829           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10830         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
10831           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10832       }
10833       else if(i+1<slen)
10834       {
10835         // Preload registers for following instruction
10836         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10837           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10838             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10839         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10840           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10841             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10842       }
10843       // TODO: if(is_ooo(i)) address_generation(i+1);
10844       if(itype[i]==CJUMP||itype[i]==FJUMP)
10845         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10846       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
10847         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10848       if(bt[i]) cop1_usable=0;
10849       // assemble
10850       switch(itype[i]) {
10851         case ALU:
10852           alu_assemble(i,&regs[i]);break;
10853         case IMM16:
10854           imm16_assemble(i,&regs[i]);break;
10855         case SHIFT:
10856           shift_assemble(i,&regs[i]);break;
10857         case SHIFTIMM:
10858           shiftimm_assemble(i,&regs[i]);break;
10859         case LOAD:
10860           load_assemble(i,&regs[i]);break;
10861         case LOADLR:
10862           loadlr_assemble(i,&regs[i]);break;
10863         case STORE:
10864           store_assemble(i,&regs[i]);break;
10865         case STORELR:
10866           storelr_assemble(i,&regs[i]);break;
10867         case COP0:
10868           cop0_assemble(i,&regs[i]);break;
10869         case COP1:
10870           cop1_assemble(i,&regs[i]);break;
10871         case C1LS:
10872           c1ls_assemble(i,&regs[i]);break;
10873         case COP2:
10874           cop2_assemble(i,&regs[i]);break;
10875         case C2LS:
10876           c2ls_assemble(i,&regs[i]);break;
10877         case C2OP:
10878           c2op_assemble(i,&regs[i]);break;
10879         case FCONV:
10880           fconv_assemble(i,&regs[i]);break;
10881         case FLOAT:
10882           float_assemble(i,&regs[i]);break;
10883         case FCOMP:
10884           fcomp_assemble(i,&regs[i]);break;
10885         case MULTDIV:
10886           multdiv_assemble(i,&regs[i]);break;
10887         case MOV:
10888           mov_assemble(i,&regs[i]);break;
10889         case SYSCALL:
10890           syscall_assemble(i,&regs[i]);break;
10891         case HLECALL:
10892           hlecall_assemble(i,&regs[i]);break;
10893         case INTCALL:
10894           intcall_assemble(i,&regs[i]);break;
10895         case UJUMP:
10896           ujump_assemble(i,&regs[i]);ds=1;break;
10897         case RJUMP:
10898           rjump_assemble(i,&regs[i]);ds=1;break;
10899         case CJUMP:
10900           cjump_assemble(i,&regs[i]);ds=1;break;
10901         case SJUMP:
10902           sjump_assemble(i,&regs[i]);ds=1;break;
10903         case FJUMP:
10904           fjump_assemble(i,&regs[i]);ds=1;break;
10905         case SPAN:
10906           pagespan_assemble(i,&regs[i]);break;
10907       }
10908       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10909         literal_pool(1024);
10910       else
10911         literal_pool_jumpover(256);
10912     }
10913   }
10914   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10915   // If the block did not end with an unconditional branch,
10916   // add a jump to the next instruction.
10917   if(i>1) {
10918     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10919       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10920       assert(i==slen);
10921       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10922         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10923         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10924           emit_loadreg(CCREG,HOST_CCREG);
10925         emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10926       }
10927       else if(!likely[i-2])
10928       {
10929         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10930         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10931       }
10932       else
10933       {
10934         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10935         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10936       }
10937       add_to_linker((int)out,start+i*4,0);
10938       emit_jmp(0);
10939     }
10940   }
10941   else
10942   {
10943     assert(i>0);
10944     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10945     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10946     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10947       emit_loadreg(CCREG,HOST_CCREG);
10948     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10949     add_to_linker((int)out,start+i*4,0);
10950     emit_jmp(0);
10951   }
10952
10953   // TODO: delay slot stubs?
10954   // Stubs
10955   for(i=0;i<stubcount;i++)
10956   {
10957     switch(stubs[i][0])
10958     {
10959       case LOADB_STUB:
10960       case LOADH_STUB:
10961       case LOADW_STUB:
10962       case LOADD_STUB:
10963       case LOADBU_STUB:
10964       case LOADHU_STUB:
10965         do_readstub(i);break;
10966       case STOREB_STUB:
10967       case STOREH_STUB:
10968       case STOREW_STUB:
10969       case STORED_STUB:
10970         do_writestub(i);break;
10971       case CC_STUB:
10972         do_ccstub(i);break;
10973       case INVCODE_STUB:
10974         do_invstub(i);break;
10975       case FP_STUB:
10976         do_cop1stub(i);break;
10977       case STORELR_STUB:
10978         do_unalignedwritestub(i);break;
10979     }
10980   }
10981
10982   if (instr_addr0_override)
10983     instr_addr[0] = instr_addr0_override;
10984
10985   /* Pass 9 - Linker */
10986   for(i=0;i<linkcount;i++)
10987   {
10988     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10989     literal_pool(64);
10990     if(!link_addr[i][2])
10991     {
10992       void *stub=out;
10993       void *addr=check_addr(link_addr[i][1]);
10994       emit_extjump(link_addr[i][0],link_addr[i][1]);
10995       if(addr) {
10996         set_jump_target(link_addr[i][0],(int)addr);
10997         add_link(link_addr[i][1],stub);
10998       }
10999       else set_jump_target(link_addr[i][0],(int)stub);
11000     }
11001     else
11002     {
11003       // Internal branch
11004       int target=(link_addr[i][1]-start)>>2;
11005       assert(target>=0&&target<slen);
11006       assert(instr_addr[target]);
11007       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11008       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
11009       //#else
11010       set_jump_target(link_addr[i][0],instr_addr[target]);
11011       //#endif
11012     }
11013   }
11014   // External Branch Targets (jump_in)
11015   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
11016   for(i=0;i<slen;i++)
11017   {
11018     if(bt[i]||i==0)
11019     {
11020       if(instr_addr[i]) // TODO - delay slots (=null)
11021       {
11022         u_int vaddr=start+i*4;
11023         u_int page=get_page(vaddr);
11024         u_int vpage=get_vpage(vaddr);
11025         literal_pool(256);
11026         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
11027 #ifndef FORCE32
11028         if(!requires_32bit[i])
11029 #else
11030         if(1)
11031 #endif
11032         {
11033           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11034           assem_debug("jump_in: %x\n",start+i*4);
11035           ll_add(jump_dirty+vpage,vaddr,(void *)out);
11036           int entry_point=do_dirty_stub(i);
11037           ll_add(jump_in+page,vaddr,(void *)entry_point);
11038           // If there was an existing entry in the hash table,
11039           // replace it with the new address.
11040           // Don't add new entries.  We'll insert the
11041           // ones that actually get used in check_addr().
11042           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
11043           if(ht_bin[0]==vaddr) {
11044             ht_bin[1]=entry_point;
11045           }
11046           if(ht_bin[2]==vaddr) {
11047             ht_bin[3]=entry_point;
11048           }
11049         }
11050         else
11051         {
11052           u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
11053           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11054           assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
11055           //int entry_point=(int)out;
11056           ////assem_debug("entry_point: %x\n",entry_point);
11057           //load_regs_entry(i);
11058           //if(entry_point==(int)out)
11059           //  entry_point=instr_addr[i];
11060           //else
11061           //  emit_jmp(instr_addr[i]);
11062           //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11063           ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
11064           int entry_point=do_dirty_stub(i);
11065           ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11066         }
11067       }
11068     }
11069   }
11070   // Write out the literal pool if necessary
11071   literal_pool(0);
11072   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11073   // Align code
11074   if(((u_int)out)&7) emit_addnop(13);
11075   #endif
11076   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
11077   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
11078   memcpy(copy,source,slen*4);
11079   copy+=slen*4;
11080   
11081   #ifdef __arm__
11082   __clear_cache((void *)beginning,out);
11083   #endif
11084   
11085   // If we're within 256K of the end of the buffer,
11086   // start over from the beginning. (Is 256K enough?)
11087   if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
11088   
11089   // Trap writes to any of the pages we compiled
11090   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
11091     invalid_code[i]=0;
11092 #ifndef DISABLE_TLB
11093     memory_map[i]|=0x40000000;
11094     if((signed int)start>=(signed int)0xC0000000) {
11095       assert(using_tlb);
11096       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
11097       invalid_code[j]=0;
11098       memory_map[j]|=0x40000000;
11099       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
11100     }
11101 #endif
11102   }
11103 #ifdef PCSX
11104   // PCSX maps all RAM mirror invalid_code tests to 0x80000000..0x80000000+RAM_SIZE
11105   if(get_page(start)<(RAM_SIZE>>12))
11106     for(i=start>>12;i<=(start+slen*4)>>12;i++)
11107       invalid_code[((u_int)0x80000000>>12)|i]=0;
11108 #endif
11109   
11110   /* Pass 10 - Free memory by expiring oldest blocks */
11111   
11112   int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
11113   while(expirep!=end)
11114   {
11115     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
11116     int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
11117     inv_debug("EXP: Phase %d\n",expirep);
11118     switch((expirep>>11)&3)
11119     {
11120       case 0:
11121         // Clear jump_in and jump_dirty
11122         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
11123         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
11124         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
11125         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
11126         break;
11127       case 1:
11128         // Clear pointers
11129         ll_kill_pointers(jump_out[expirep&2047],base,shift);
11130         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
11131         break;
11132       case 2:
11133         // Clear hash table
11134         for(i=0;i<32;i++) {
11135           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
11136           if((ht_bin[3]>>shift)==(base>>shift) ||
11137              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11138             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
11139             ht_bin[2]=ht_bin[3]=-1;
11140           }
11141           if((ht_bin[1]>>shift)==(base>>shift) ||
11142              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11143             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
11144             ht_bin[0]=ht_bin[2];
11145             ht_bin[1]=ht_bin[3];
11146             ht_bin[2]=ht_bin[3]=-1;
11147           }
11148         }
11149         break;
11150       case 3:
11151         // Clear jump_out
11152         #ifdef __arm__
11153         if((expirep&2047)==0) 
11154           do_clear_cache();
11155         #endif
11156         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
11157         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
11158         break;
11159     }
11160     expirep=(expirep+1)&65535;
11161   }
11162   return 0;
11163 }
11164
11165 // vim:shiftwidth=2:expandtab