drc: merge Ari64's patch: 03_needed_again
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24
25 #include "emu_if.h" //emulator interface
26
27 #include <sys/mman.h>
28
29 #ifdef __i386__
30 #include "assem_x86.h"
31 #endif
32 #ifdef __x86_64__
33 #include "assem_x64.h"
34 #endif
35 #ifdef __arm__
36 #include "assem_arm.h"
37 #endif
38
// Sizing and timing constants for the recompiler.
39 #define MAXBLOCK 4096 // dimension of the per-instruction analysis arrays declared below
40 #define MAX_OUTPUT_BLOCK_SIZE 262144 // used as a margin in the cache-expiry comparisons (get_addr, check_addr)
41 #define CLOCK_DIVIDER 2 // NOTE(review): not referenced in this part of the file; presumably scales cycle counting -- confirm
42
// Register-allocation state tracked by the recompiler.
// The array members are indexed by host register; the scalar members are bitmasks.
// NOTE(review): the per-field notes are derived from the helpers in this file
// (dirty_reg, set_const, flush_dirty_uppers); fields without a note are not
// touched by the code visible here.
43 struct regstat
44 {
45   signed char regmap_entry[HOST_REGS];
46   signed char regmap[HOST_REGS]; // guest register held by each host register; negative = free; values >=64 are treated as upper halves by flush_dirty_uppers
47   uint64_t was32;
48   uint64_t is32; // tested as is32>>(reg&63) in flush_dirty_uppers, i.e. one bit per guest register
49   uint64_t wasdirty;
50   uint64_t dirty; // one bit per host register, set by dirty_reg
51   uint64_t u;
52   uint64_t uu;
53   u_int wasconst;
54   u_int isconst; // one bit per host register: constmap[hr] holds a known value (set_const / clear_const)
55   uint64_t constmap[HOST_REGS]; // constant recorded by set_const for each host register
56 };
57
// Singly linked list node that associates a guest virtual address with an address
// returned by the lookup functions (get_addr, get_addr_32, check_addr).
58 struct ll_entry
59 {
60   u_int vaddr; // guest virtual address, compared against the address being looked up
61   u_int reg32; // mask ANDed with the caller's flags in get_addr_32; ll_add stores 0
62   void *addr; // address handed back when this entry matches
63   struct ll_entry *next; // next node; NULL terminates the list
64 };
65
// File-scope state of the recompiler.
// The arrays dimensioned by MAXBLOCK are indexed by instruction position within
// the block being analysed (see lsn, needed_again, loop_reg).
// NOTE(review): only members with a trailing note are explained by code visible
// in this part of the file; the others should be documented from their users.
66   u_int start; // branch targets in ba[] are compared against start and start+slen*4
67   u_int *source; // instruction words; source[i]>>16 is inspected when scanning for jumps
68   u_int pagelimit;
69   char insn[MAXBLOCK][10];
70   u_char itype[MAXBLOCK]; // instruction type, one of the "instruction types" constants
71   u_char opcode[MAXBLOCK];
72   u_char opcode2[MAXBLOCK];
73   u_char bt[MAXBLOCK];
74   u_char rs1[MAXBLOCK]; // source register numbers (rs1/rs2) read by lsn and needed_again
75   u_char rs2[MAXBLOCK];
76   u_char rt1[MAXBLOCK]; // target register numbers (rt1/rt2)
77   u_char rt2[MAXBLOCK];
78   u_char us1[MAXBLOCK];
79   u_char us2[MAXBLOCK];
80   u_char dep1[MAXBLOCK];
81   u_char dep2[MAXBLOCK];
82   u_char lt1[MAXBLOCK];
83   int imm[MAXBLOCK];
84   u_int ba[MAXBLOCK]; // branch target address per instruction
85   char likely[MAXBLOCK];
86   char is_ds[MAXBLOCK];
87   char ooo[MAXBLOCK];
88   uint64_t unneeded_reg[MAXBLOCK]; // bitmask per instruction, tested as (unneeded_reg[i]>>r)&1
89   uint64_t unneeded_reg_upper[MAXBLOCK];
90   uint64_t branch_unneeded_reg[MAXBLOCK];
91   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
92   uint64_t p32[MAXBLOCK];
93   uint64_t pr32[MAXBLOCK];
94   signed char regmap_pre[MAXBLOCK][HOST_REGS];
95   signed char regmap[MAXBLOCK][HOST_REGS];
96   signed char regmap_entry[MAXBLOCK][HOST_REGS];
97   uint64_t constmap[MAXBLOCK][HOST_REGS];
98   struct regstat regs[MAXBLOCK]; // loop_reg reads regs[t].regmap_entry at a branch target
99   struct regstat branch_regs[MAXBLOCK];
100   signed char minimum_free_regs[MAXBLOCK];
101   u_int needed_reg[MAXBLOCK];
102   uint64_t requires_32bit[MAXBLOCK];
103   u_int wont_dirty[MAXBLOCK];
104   u_int will_dirty[MAXBLOCK];
105   int ccadj[MAXBLOCK];
106   int slen; // upper bound used for instruction indices in the scans below
107   u_int instr_addr[MAXBLOCK];
108   u_int link_addr[MAXBLOCK][3];
109   int linkcount;
110   u_int stubs[MAXBLOCK*3][8];
111   int stubcount;
112   u_int literals[1024][2];
113   int literalcount;
114   int is_delayslot;
115   int cop1_usable;
116   u_char *out; // subtracted from code addresses in the cache-expiry comparisons
117   struct ll_entry *jump_in[4096]; // indexed by get_page(); searched first by the lookup functions
118   struct ll_entry *jump_out[4096];
119   struct ll_entry *jump_dirty[4096]; // indexed by get_vpage(); entries are re-verified before use
120   u_int hash_table[65536][4]  __attribute__((aligned(16))); // each bin holds two (vaddr, address) pairs: [0]/[1] and [2]/[3]; -1 marks an empty slot
121   char shadow[1048576]  __attribute__((aligned(16)));
122   void *copy;
123   int expirep;
124 #ifndef PCSX
125   u_int using_tlb;
126 #else
127   static const u_int using_tlb=0;
128 #endif
129   static u_int sp_in_mirror;
130   u_int stop_after_jal;
131   extern u_char restore_candidate[512]; // bitmap, one bit per page (see get_addr)
132   extern int cycle_count;
133
// Numeric identifiers used throughout the recompiler: allocatable register
// numbers, instruction types (stored in itype[]), stub kinds and branch codes.
// Note that TEMPREG and FTEMP share the value 40, and that the AGEN/MGEN/BTREG
// temporaries are numbered above MAXREG.
134   /* registers that may be allocated */
135   /* 1-31 gpr */
136 #define HIREG 32 // hi
137 #define LOREG 33 // lo
138 #define FSREG 34 // FPU status (FCSR)
139 #define CSREG 35 // Coprocessor status
140 #define CCREG 36 // Cycle count
141 #define INVCP 37 // Pointer to invalid_code
142 #define MMREG 38 // Pointer to memory_map
143 #define ROREG 39 // ram offset (if rdram!=0x80000000)
144 #define TEMPREG 40
145 #define FTEMP 40 // FPU temporary register
146 #define PTEMP 41 // Prefetch temporary register
147 #define TLREG 42 // TLB mapping offset
148 #define RHASH 43 // Return address hash
149 #define RHTBL 44 // Return address hash table address
150 #define RTEMP 45 // JR/JALR address register
151 #define MAXREG 45
152 #define AGEN1 46 // Address generation temporary register
153 #define AGEN2 47 // Address generation temporary register
154 #define MGEN1 48 // Maptable address generation temporary register
155 #define MGEN2 49 // Maptable address generation temporary register
156 #define BTREG 50 // Branch target temporary register
157
158   /* instruction types */
159 #define NOP 0     // No operation
160 #define LOAD 1    // Load
161 #define STORE 2   // Store
162 #define LOADLR 3  // Unaligned load
163 #define STORELR 4 // Unaligned store
164 #define MOV 5     // Move
165 #define ALU 6     // Arithmetic/logic
166 #define MULTDIV 7 // Multiply/divide
167 #define SHIFT 8   // Shift by register
168 #define SHIFTIMM 9// Shift by immediate
169 #define IMM16 10  // 16-bit immediate
170 #define RJUMP 11  // Unconditional jump to register
171 #define UJUMP 12  // Unconditional jump
172 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
173 #define SJUMP 14  // Conditional branch (regimm format)
174 #define COP0 15   // Coprocessor 0
175 #define COP1 16   // Coprocessor 1
176 #define C1LS 17   // Coprocessor 1 load/store
177 #define FJUMP 18  // Conditional branch (floating point)
178 #define FLOAT 19  // Floating point unit
179 #define FCONV 20  // Convert integer to float
180 #define FCOMP 21  // Floating point compare (sets FSREG)
181 #define SYSCALL 22// SYSCALL
182 #define OTHER 23  // Other
183 #define SPAN 24   // Branch/delay slot spans 2 pages
184 #define NI 25     // Not implemented
185 #define HLECALL 26// PCSX fake opcodes for HLE
186 #define COP2 27   // Coprocessor 2 move
187 #define C2LS 28   // Coprocessor 2 load/store
188 #define C2OP 29   // Coprocessor 2 operation
189 #define INTCALL 30// Call interpreter to handle rare corner cases
190
191   /* stubs */
// NOTE(review): presumably the kind tag recorded in stubs[][] -- confirm against the assembler backend
192 #define CC_STUB 1
193 #define FP_STUB 2
194 #define LOADB_STUB 3
195 #define LOADH_STUB 4
196 #define LOADW_STUB 5
197 #define LOADD_STUB 6
198 #define LOADBU_STUB 7
199 #define LOADHU_STUB 8
200 #define STOREB_STUB 9
201 #define STOREH_STUB 10
202 #define STOREW_STUB 11
203 #define STORED_STUB 12
204 #define STORELR_STUB 13
205 #define INVCODE_STUB 14
206
207   /* branch codes */
208 #define TAKEN 1
209 #define NOTTAKEN 2
210 #define NULLDS 3
211
// Forward declarations.  Only new_recompile_block, get_addr_ht and remove_hash are
// used or defined in this part of the file; the remaining routines are defined
// elsewhere (NOTE(review): presumably in the architecture-specific assembler
// sources -- confirm).
212 // asm linkage
213 int new_recompile_block(int addr); // the lookup functions treat a return value of 0 as success
214 void *get_addr_ht(u_int vaddr);
215 void invalidate_block(u_int block);
216 void invalidate_addr(u_int addr);
217 void remove_hash(int vaddr);
218 void jump_vaddr();
219 void dyna_linker();
220 void dyna_linker_ds();
221 void verify_code();
222 void verify_code_vm();
223 void verify_code_ds();
224 void cc_interrupt();
225 void fp_exception();
226 void fp_exception_ds();
227 void jump_syscall();
228 void jump_syscall_hle();
229 void jump_eret();
230 void jump_hlecall();
231 void jump_intcall();
232 void new_dyna_leave();
233
234 // TLB
235 void TLBWI_new();
236 void TLBWR_new();
237 void read_nomem_new();
238 void read_nomemb_new();
239 void read_nomemh_new();
240 void read_nomemd_new();
241 void write_nomem_new();
242 void write_nomemb_new();
243 void write_nomemh_new();
244 void write_nomemd_new();
245 void write_rdram_new();
246 void write_rdramb_new();
247 void write_rdramh_new();
248 void write_rdramd_new();
249 extern u_int memory_map[1048576]; // indexed by vaddr>>12; bit 0x40000000 is set when a page is restored (see get_addr)
250
251 // Needed by assembler
252 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
253 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
254 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
255 void load_all_regs(signed char i_regmap[]);
256 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
257 void load_regs_entry(int t);
258 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
259
// Debug plumbing.  assem_debug and inv_debug are called with printf-style
// arguments; by default both expand to nullf, which ignores them.  nullf is
// deliberately declared with an empty (unprototyped) parameter list so that it
// can be called with any arguments.  Swap in the commented-out definitions to
// get real output.
260 int tracedebug=0;
261
262 //#define DEBUG_CYCLE_COUNT 1
263
264 void nullf() {}
265 //#define assem_debug printf
266 //#define inv_debug printf
267 #define assem_debug nullf
268 #define inv_debug nullf
269
270 static void tlb_hacks()
271 {
272 #ifndef DISABLE_TLB
273   // Goldeneye hack
274   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
275   {
276     u_int addr;
277     int n;
278     switch (ROM_HEADER->Country_code&0xFF) 
279     {
280       case 0x45: // U
281         addr=0x34b30;
282         break;                   
283       case 0x4A: // J 
284         addr=0x34b70;    
285         break;    
286       case 0x50: // E 
287         addr=0x329f0;
288         break;                        
289       default: 
290         // Unknown country code
291         addr=0;
292         break;
293     }
294     u_int rom_addr=(u_int)rom;
295     #ifdef ROM_COPY
296     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
297     // in the lower 4G of memory to use this hack.  Copy it if necessary.
298     if((void *)rom>(void *)0xffffffff) {
299       munmap(ROM_COPY, 67108864);
300       if(mmap(ROM_COPY, 12582912,
301               PROT_READ | PROT_WRITE,
302               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
303               -1, 0) <= 0) {printf("mmap() failed\n");}
304       memcpy(ROM_COPY,rom,12582912);
305       rom_addr=(u_int)ROM_COPY;
306     }
307     #endif
308     if(addr) {
309       for(n=0x7F000;n<0x80000;n++) {
310         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
311       }
312     }
313   }
314 #endif
315 }
316
317 static u_int get_page(u_int vaddr)
318 {
319 #ifndef PCSX
320   u_int page=(vaddr^0x80000000)>>12;
321 #else
322   u_int page=vaddr&~0xe0000000;
323   if (page < 0x1000000)
324     page &= ~0x0e00000; // RAM mirrors
325   page>>=12;
326 #endif
327 #ifndef DISABLE_TLB
328   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
329 #endif
330   if(page>2048) page=2048+(page&2047);
331   return page;
332 }
333
334 static u_int get_vpage(u_int vaddr)
335 {
336   u_int vpage=(vaddr^0x80000000)>>12;
337 #ifndef DISABLE_TLB
338   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
339 #endif
340   if(vpage>2048) vpage=2048+(vpage&2047);
341   return vpage;
342 }
343
344 // Get address from virtual address
345 // This is called from the recompiled JR/JALR instructions
346 void *get_addr(u_int vaddr)
347 {
348   u_int page=get_page(vaddr);
349   u_int vpage=get_vpage(vaddr);
350   struct ll_entry *head;
351   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
352   head=jump_in[page];
353   while(head!=NULL) {
354     if(head->vaddr==vaddr&&head->reg32==0) {
355   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
356       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
357       ht_bin[3]=ht_bin[1];
358       ht_bin[2]=ht_bin[0];
359       ht_bin[1]=(int)head->addr;
360       ht_bin[0]=vaddr;
361       return head->addr;
362     }
363     head=head->next;
364   }
365   head=jump_dirty[vpage];
366   while(head!=NULL) {
367     if(head->vaddr==vaddr&&head->reg32==0) {
368       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
369       // Don't restore blocks which are about to expire from the cache
370       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
371       if(verify_dirty(head->addr)) {
372         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
373         invalid_code[vaddr>>12]=0;
374         memory_map[vaddr>>12]|=0x40000000;
375         if(vpage<2048) {
376 #ifndef DISABLE_TLB
377           if(tlb_LUT_r[vaddr>>12]) {
378             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
379             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
380           }
381 #endif
382           restore_candidate[vpage>>3]|=1<<(vpage&7);
383         }
384         else restore_candidate[page>>3]|=1<<(page&7);
385         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
386         if(ht_bin[0]==vaddr) {
387           ht_bin[1]=(int)head->addr; // Replace existing entry
388         }
389         else
390         {
391           ht_bin[3]=ht_bin[1];
392           ht_bin[2]=ht_bin[0];
393           ht_bin[1]=(int)head->addr;
394           ht_bin[0]=vaddr;
395         }
396         return head->addr;
397       }
398     }
399     head=head->next;
400   }
401   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
402   int r=new_recompile_block(vaddr);
403   if(r==0) return get_addr(vaddr);
404   // Execute in unmapped page, generate pagefault execption
405   Status|=2;
406   Cause=(vaddr<<31)|0x8;
407   EPC=(vaddr&1)?vaddr-5:vaddr;
408   BadVAddr=(vaddr&~1);
409   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
410   EntryHi=BadVAddr&0xFFFFE000;
411   return get_addr_ht(0x80000000);
412 }
413 // Look up address in hash table first
414 void *get_addr_ht(u_int vaddr)
415 {
416   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
417   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
418   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
419   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
420   return get_addr(vaddr);
421 }
422
423 void *get_addr_32(u_int vaddr,u_int flags)
424 {
425 #ifdef FORCE32
426   return get_addr(vaddr);
427 #else
428   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
429   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
430   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
431   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
432   u_int page=get_page(vaddr);
433   u_int vpage=get_vpage(vaddr);
434   struct ll_entry *head;
435   head=jump_in[page];
436   while(head!=NULL) {
437     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
438       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
439       if(head->reg32==0) {
440         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
441         if(ht_bin[0]==-1) {
442           ht_bin[1]=(int)head->addr;
443           ht_bin[0]=vaddr;
444         }else if(ht_bin[2]==-1) {
445           ht_bin[3]=(int)head->addr;
446           ht_bin[2]=vaddr;
447         }
448         //ht_bin[3]=ht_bin[1];
449         //ht_bin[2]=ht_bin[0];
450         //ht_bin[1]=(int)head->addr;
451         //ht_bin[0]=vaddr;
452       }
453       return head->addr;
454     }
455     head=head->next;
456   }
457   head=jump_dirty[vpage];
458   while(head!=NULL) {
459     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
460       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
461       // Don't restore blocks which are about to expire from the cache
462       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
463       if(verify_dirty(head->addr)) {
464         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
465         invalid_code[vaddr>>12]=0;
466         memory_map[vaddr>>12]|=0x40000000;
467         if(vpage<2048) {
468 #ifndef DISABLE_TLB
469           if(tlb_LUT_r[vaddr>>12]) {
470             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
471             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
472           }
473 #endif
474           restore_candidate[vpage>>3]|=1<<(vpage&7);
475         }
476         else restore_candidate[page>>3]|=1<<(page&7);
477         if(head->reg32==0) {
478           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
479           if(ht_bin[0]==-1) {
480             ht_bin[1]=(int)head->addr;
481             ht_bin[0]=vaddr;
482           }else if(ht_bin[2]==-1) {
483             ht_bin[3]=(int)head->addr;
484             ht_bin[2]=vaddr;
485           }
486           //ht_bin[3]=ht_bin[1];
487           //ht_bin[2]=ht_bin[0];
488           //ht_bin[1]=(int)head->addr;
489           //ht_bin[0]=vaddr;
490         }
491         return head->addr;
492       }
493     }
494     head=head->next;
495   }
496   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
497   int r=new_recompile_block(vaddr);
498   if(r==0) return get_addr(vaddr);
499   // Execute in unmapped page, generate pagefault execption
500   Status|=2;
501   Cause=(vaddr<<31)|0x8;
502   EPC=(vaddr&1)?vaddr-5:vaddr;
503   BadVAddr=(vaddr&~1);
504   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
505   EntryHi=BadVAddr&0xFFFFE000;
506   return get_addr_ht(0x80000000);
507 #endif
508 }
509
510 void clear_all_regs(signed char regmap[])
511 {
512   int hr;
513   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
514 }
515
516 signed char get_reg(signed char regmap[],int r)
517 {
518   int hr;
519   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
520   return -1;
521 }
522
523 // Find a register that is available for two consecutive cycles
524 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
525 {
526   int hr;
527   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
528   return -1;
529 }
530
531 int count_free_regs(signed char regmap[])
532 {
533   int count=0;
534   int hr;
535   for(hr=0;hr<HOST_REGS;hr++)
536   {
537     if(hr!=EXCLUDE_REG) {
538       if(regmap[hr]<0) count++;
539     }
540   }
541   return count;
542 }
543
544 void dirty_reg(struct regstat *cur,signed char reg)
545 {
546   int hr;
547   if(!reg) return;
548   for (hr=0;hr<HOST_REGS;hr++) {
549     if((cur->regmap[hr]&63)==reg) {
550       cur->dirty|=1<<hr;
551     }
552   }
553 }
554
555 // If we dirty the lower half of a 64 bit register which is now being
556 // sign-extended, we need to dump the upper half.
557 // Note: Do this only after completion of the instruction, because
558 // some instructions may need to read the full 64-bit value even if
559 // overwriting it (eg SLTI, DSRA32).
560 static void flush_dirty_uppers(struct regstat *cur)
561 {
562   int hr,reg;
563   for (hr=0;hr<HOST_REGS;hr++) {
564     if((cur->dirty>>hr)&1) {
565       reg=cur->regmap[hr];
566       if(reg>=64) 
567         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
568     }
569   }
570 }
571
572 void set_const(struct regstat *cur,signed char reg,uint64_t value)
573 {
574   int hr;
575   if(!reg) return;
576   for (hr=0;hr<HOST_REGS;hr++) {
577     if(cur->regmap[hr]==reg) {
578       cur->isconst|=1<<hr;
579       cur->constmap[hr]=value;
580     }
581     else if((cur->regmap[hr]^64)==reg) {
582       cur->isconst|=1<<hr;
583       cur->constmap[hr]=value>>32;
584     }
585   }
586 }
587
588 void clear_const(struct regstat *cur,signed char reg)
589 {
590   int hr;
591   if(!reg) return;
592   for (hr=0;hr<HOST_REGS;hr++) {
593     if((cur->regmap[hr]&63)==reg) {
594       cur->isconst&=~(1<<hr);
595     }
596   }
597 }
598
599 int is_const(struct regstat *cur,signed char reg)
600 {
601   int hr;
602   if(!reg) return 1;
603   for (hr=0;hr<HOST_REGS;hr++) {
604     if((cur->regmap[hr]&63)==reg) {
605       return (cur->isconst>>hr)&1;
606     }
607   }
608   return 0;
609 }
610 uint64_t get_const(struct regstat *cur,signed char reg)
611 {
612   int hr;
613   if(!reg) return 0;
614   for (hr=0;hr<HOST_REGS;hr++) {
615     if(cur->regmap[hr]==reg) {
616       return cur->constmap[hr];
617     }
618   }
619   printf("Unknown constant in r%d\n",reg);
620   exit(1);
621 }
622
623 // Least soon needed registers
624 // Look at the next ten instructions and see which registers
625 // will be used.  Try not to reallocate these.
626 void lsn(u_char hsn[], int i, int *preferred_reg)
627 {
628   int j;
629   int b=-1;
630   for(j=0;j<9;j++)
631   {
632     if(i+j>=slen) {
633       j=slen-i-1;
634       break;
635     }
636     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
637     {
638       // Don't go past an unconditonal jump
639       j++;
640       break;
641     }
642   }
643   for(;j>=0;j--)
644   {
645     if(rs1[i+j]) hsn[rs1[i+j]]=j;
646     if(rs2[i+j]) hsn[rs2[i+j]]=j;
647     if(rt1[i+j]) hsn[rt1[i+j]]=j;
648     if(rt2[i+j]) hsn[rt2[i+j]]=j;
649     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
650       // Stores can allocate zero
651       hsn[rs1[i+j]]=j;
652       hsn[rs2[i+j]]=j;
653     }
654     // On some architectures stores need invc_ptr
655     #if defined(HOST_IMM8)
656     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
657       hsn[INVCP]=j;
658     }
659     #endif
660     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
661     {
662       hsn[CCREG]=j;
663       b=j;
664     }
665   }
666   if(b>=0)
667   {
668     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
669     {
670       // Follow first branch
671       int t=(ba[i+b]-start)>>2;
672       j=7-b;if(t+j>=slen) j=slen-t-1;
673       for(;j>=0;j--)
674       {
675         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
676         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
677         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
678         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
679       }
680     }
681     // TODO: preferred register based on backward branch
682   }
683   // Delay slot should preferably not overwrite branch conditions or cycle count
684   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
685     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
686     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
687     hsn[CCREG]=1;
688     // ...or hash tables
689     hsn[RHASH]=1;
690     hsn[RHTBL]=1;
691   }
692   // Coprocessor load/store needs FTEMP, even if not declared
693   if(itype[i]==C1LS||itype[i]==C2LS) {
694     hsn[FTEMP]=0;
695   }
696   // Load L/R also uses FTEMP as a temporary register
697   if(itype[i]==LOADLR) {
698     hsn[FTEMP]=0;
699   }
700   // Also SWL/SWR/SDL/SDR
701   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
702     hsn[FTEMP]=0;
703   }
704   // Don't remove the TLB registers either
705   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS || itype[i]==C2LS) {
706     hsn[TLREG]=0;
707   }
708   // Don't remove the miniht registers
709   if(itype[i]==UJUMP||itype[i]==RJUMP)
710   {
711     hsn[RHASH]=0;
712     hsn[RHTBL]=0;
713   }
714 }
715
716 // We only want to allocate registers if we're going to use them again soon
717 int needed_again(int r, int i)
718 {
719   int j;
720   int b=-1;
721   int rn=10;
722   
723   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
724   {
725     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
726       return 0; // Don't need any registers if exiting the block
727   }
728   for(j=0;j<9;j++)
729   {
730     if(i+j>=slen) {
731       j=slen-i-1;
732       break;
733     }
734     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
735     {
736       // Don't go past an unconditonal jump
737       j++;
738       break;
739     }
740     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
741     {
742       break;
743     }
744   }
745   for(;j>=1;j--)
746   {
747     if(rs1[i+j]==r) rn=j;
748     if(rs2[i+j]==r) rn=j;
749     if((unneeded_reg[i+j]>>r)&1) rn=10;
750     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
751     {
752       b=j;
753     }
754   }
755   /*
756   if(b>=0)
757   {
758     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
759     {
760       // Follow first branch
761       int o=rn;
762       int t=(ba[i+b]-start)>>2;
763       j=7-b;if(t+j>=slen) j=slen-t-1;
764       for(;j>=0;j--)
765       {
766         if(!((unneeded_reg[t+j]>>r)&1)) {
767           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
768           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
769         }
770         else rn=o;
771       }
772     }
773   }*/
774   if(rn<10) return 1;
775   return 0;
776 }
777
778 // Try to match register allocations at the end of a loop with those
779 // at the beginning
780 int loop_reg(int i, int r, int hr)
781 {
782   int j,k;
783   for(j=0;j<9;j++)
784   {
785     if(i+j>=slen) {
786       j=slen-i-1;
787       break;
788     }
789     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
790     {
791       // Don't go past an unconditonal jump
792       j++;
793       break;
794     }
795   }
796   k=0;
797   if(i>0){
798     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
799       k--;
800   }
801   for(;k<j;k++)
802   {
803     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
804     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
805     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
806     {
807       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
808       {
809         int t=(ba[i+k]-start)>>2;
810         int reg=get_reg(regs[t].regmap_entry,r);
811         if(reg>=0) return reg;
812         //reg=get_reg(regs[t+1].regmap_entry,r);
813         //if(reg>=0) return reg;
814       }
815     }
816   }
817   return hr;
818 }
819
820
821 // Allocate every register, preserving source/target regs
822 void alloc_all(struct regstat *cur,int i)
823 {
824   int hr;
825   
826   for(hr=0;hr<HOST_REGS;hr++) {
827     if(hr!=EXCLUDE_REG) {
828       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
829          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
830       {
831         cur->regmap[hr]=-1;
832         cur->dirty&=~(1<<hr);
833       }
834       // Don't need zeros
835       if((cur->regmap[hr]&63)==0)
836       {
837         cur->regmap[hr]=-1;
838         cur->dirty&=~(1<<hr);
839       }
840     }
841   }
842 }
843
844
845 void div64(int64_t dividend,int64_t divisor)
846 {
847   lo=dividend/divisor;
848   hi=dividend%divisor;
849   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
850   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
851 }
852 void divu64(uint64_t dividend,uint64_t divisor)
853 {
854   lo=dividend/divisor;
855   hi=dividend%divisor;
856   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
857   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
858 }
859
860 void mult64(uint64_t m1,uint64_t m2)
861 {
862    unsigned long long int op1, op2, op3, op4;
863    unsigned long long int result1, result2, result3, result4;
864    unsigned long long int temp1, temp2, temp3, temp4;
865    int sign = 0;
866    
867    if (m1 < 0)
868      {
869     op2 = -m1;
870     sign = 1 - sign;
871      }
872    else op2 = m1;
873    if (m2 < 0)
874      {
875     op4 = -m2;
876     sign = 1 - sign;
877      }
878    else op4 = m2;
879    
880    op1 = op2 & 0xFFFFFFFF;
881    op2 = (op2 >> 32) & 0xFFFFFFFF;
882    op3 = op4 & 0xFFFFFFFF;
883    op4 = (op4 >> 32) & 0xFFFFFFFF;
884    
885    temp1 = op1 * op3;
886    temp2 = (temp1 >> 32) + op1 * op4;
887    temp3 = op2 * op3;
888    temp4 = (temp3 >> 32) + op2 * op4;
889    
890    result1 = temp1 & 0xFFFFFFFF;
891    result2 = temp2 + (temp3 & 0xFFFFFFFF);
892    result3 = (result2 >> 32) + temp4;
893    result4 = (result3 >> 32);
894    
895    lo = result1 | (result2 << 32);
896    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
897    if (sign)
898      {
899     hi = ~hi;
900     if (!lo) hi++;
901     else lo = ~lo + 1;
902      }
903 }
904
905 void multu64(uint64_t m1,uint64_t m2)
906 {
907    unsigned long long int op1, op2, op3, op4;
908    unsigned long long int result1, result2, result3, result4;
909    unsigned long long int temp1, temp2, temp3, temp4;
910    
911    op1 = m1 & 0xFFFFFFFF;
912    op2 = (m1 >> 32) & 0xFFFFFFFF;
913    op3 = m2 & 0xFFFFFFFF;
914    op4 = (m2 >> 32) & 0xFFFFFFFF;
915    
916    temp1 = op1 * op3;
917    temp2 = (temp1 >> 32) + op1 * op4;
918    temp3 = op2 * op3;
919    temp4 = (temp3 >> 32) + op2 * op4;
920    
921    result1 = temp1 & 0xFFFFFFFF;
922    result2 = temp2 + (temp3 & 0xFFFFFFFF);
923    result3 = (result2 >> 32) + temp4;
924    result4 = (result3 >> 32);
925    
926    lo = result1 | (result2 << 32);
927    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
928    
929   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
930   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
931 }
932
// Merge helper for LDL: keeps the low `bits` bits of original and places
// loaded, shifted left by `bits`, above them.  With bits == 0 the loaded
// value replaces original entirely.
uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
{
  if(bits==0) return loaded;
  const uint64_t kept=(original<<(64-bits))>>(64-bits);
  return kept|(loaded<<bits);
}
// Merge helper for LDR: with shift = bits^56, keeps the top `shift` bits of
// original and places loaded, shifted right by `shift`, below them.  With
// bits == 56 (shift == 0) the loaded value replaces original entirely.
uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
{
  const u_int shift=bits^56;
  if(shift==0) return loaded;
  const uint64_t kept=(original>>(64-shift))<<(64-shift);
  return kept|(loaded>>shift);
}
955
956 #ifdef __i386__
957 #include "assem_x86.c"
958 #endif
959 #ifdef __x86_64__
960 #include "assem_x64.c"
961 #endif
962 #ifdef __arm__
963 #include "assem_arm.c"
964 #endif
965
966 // Add virtual address mapping to linked list
967 void ll_add(struct ll_entry **head,int vaddr,void *addr)
968 {
969   struct ll_entry *new_entry;
970   new_entry=malloc(sizeof(struct ll_entry));
971   assert(new_entry!=NULL);
972   new_entry->vaddr=vaddr;
973   new_entry->reg32=0;
974   new_entry->addr=addr;
975   new_entry->next=*head;
976   *head=new_entry;
977 }
978
979 // Add virtual address mapping for 32-bit compiled block
980 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
981 {
982   ll_add(head,vaddr,addr);
983 #ifndef FORCE32
984   (*head)->reg32=reg32;
985 #endif
986 }
987
// check_addr(vaddr): return the compiled code address recorded for vaddr,
// or 0 if none is found.
// Lookup order:
//   1. the two (vaddr,addr) slots of the hash_table bin selected by
//      ((vaddr>>16)^vaddr)&0xFFFF; a slot is returned only if it passes
//      the expiry comparison against 'out' and isclean() accepts it;
//   2. the jump_in list for get_page(vaddr), considering only entries
//      with the same vaddr and reg32==0 that pass the expiry comparison.
// A hit found in the list is written back into the hash bin.
988 // Check if an address is already compiled
989 // but don't return addresses which are about to expire from the cache
990 void *check_addr(u_int vaddr)
991 {
992   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
      // First slot of the bin: ht_bin[0] is the vaddr, ht_bin[1] the code address
993   if(ht_bin[0]==vaddr) {
994     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
995       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
996   }
      // Second slot of the bin: ht_bin[2] / ht_bin[3]
997   if(ht_bin[2]==vaddr) {
998     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
999       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
1000   }
      // Not usable from the hash table: fall back to the per-page list
1001   u_int page=get_page(vaddr);
1002   struct ll_entry *head;
1003   head=jump_in[page];
1004   while(head!=NULL) {
1005     if(head->vaddr==vaddr&&head->reg32==0) {
          // NOTE(review): unlike the hash-slot checks above, this comparison
          // does not subtract MAX_OUTPUT_BLOCK_SIZE - confirm this is intended.
1006       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1007         // Update existing entry with current address
1008         if(ht_bin[0]==vaddr) {
1009           ht_bin[1]=(int)head->addr;
1010           return head->addr;
1011         }
1012         if(ht_bin[2]==vaddr) {
1013           ht_bin[3]=(int)head->addr;
1014           return head->addr;
1015         }
1016         // Insert into hash table with low priority.
1017         // Don't evict existing entries, as they are probably
1018         // addresses that are being accessed frequently.
            // A slot whose vaddr field is -1 is treated as empty here.
1019         if(ht_bin[0]==-1) {
1020           ht_bin[1]=(int)head->addr;
1021           ht_bin[0]=vaddr;
1022         }else if(ht_bin[2]==-1) {
1023           ht_bin[3]=(int)head->addr;
1024           ht_bin[2]=vaddr;
1025         }
1026         return head->addr;
1027       }
1028     }
1029     head=head->next;
1030   }
1031   return 0;
1032 }
1033
1034 void remove_hash(int vaddr)
1035 {
1036   //printf("remove hash: %x\n",vaddr);
1037   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1038   if(ht_bin[2]==vaddr) {
1039     ht_bin[2]=ht_bin[3]=-1;
1040   }
1041   if(ht_bin[0]==vaddr) {
1042     ht_bin[0]=ht_bin[2];
1043     ht_bin[1]=ht_bin[3];
1044     ht_bin[2]=ht_bin[3]=-1;
1045   }
1046 }
1047
1048 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1049 {
1050   struct ll_entry *next;
1051   while(*head) {
1052     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1053        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1054     {
1055       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1056       remove_hash((*head)->vaddr);
1057       next=(*head)->next;
1058       free(*head);
1059       *head=next;
1060     }
1061     else
1062     {
1063       head=&((*head)->next);
1064     }
1065   }
1066 }
1067
1068 // Remove all entries from linked list
1069 void ll_clear(struct ll_entry **head)
1070 {
1071   struct ll_entry *cur;
1072   struct ll_entry *next;
1073   if(cur=*head) {
1074     *head=0;
1075     while(cur) {
1076       next=cur->next;
1077       free(cur);
1078       cur=next;
1079     }
1080   }
1081 }
1082
1083 // Dereference the pointers and remove if it matches
1084 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1085 {
1086   while(head) {
1087     int ptr=get_pointer(head->addr);
1088     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1089     if(((ptr>>shift)==(addr>>shift)) ||
1090        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1091     {
1092       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1093       u_int host_addr=(u_int)kill_pointer(head->addr);
1094       #ifdef __arm__
1095         needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1096       #endif
1097     }
1098     head=head->next;
1099   }
1100 }
1101
1102 // This is called when we write to a compiled block (see do_invstub)
1103 void invalidate_page(u_int page)
1104 {
1105   struct ll_entry *head;
1106   struct ll_entry *next;
1107   head=jump_in[page];
1108   jump_in[page]=0;
1109   while(head!=NULL) {
1110     inv_debug("INVALIDATE: %x\n",head->vaddr);
1111     remove_hash(head->vaddr);
1112     next=head->next;
1113     free(head);
1114     head=next;
1115   }
1116   head=jump_out[page];
1117   jump_out[page]=0;
1118   while(head!=NULL) {
1119     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1120     u_int host_addr=(u_int)kill_pointer(head->addr);
1121     #ifdef __arm__
1122       needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1123     #endif
1124     next=head->next;
1125     free(head);
1126     head=next;
1127   }
1128 }
// invalidate_block(block): invalidate compiled code for the 4K block
// numbered 'block' (address block<<12).
// Steps, in order:
//   1. Walk jump_dirty[vpage] and use get_bounds() on each relevant entry
//      to widen [first,last], the range of pages touched by blocks that
//      overlap 'page'.
//   2. invalidate_page() on 'page' and on neighbouring pages in that range.
//   3. On ARM, do_clear_cache().
//   4. Set invalid_code[] for the block so further writes are not trapped,
//      and (unless DISABLE_TLB) update memory_map for the block.
//   5. Reset mini_ht when USE_MINI_HT is defined.
1129 void invalidate_block(u_int block)
1130 {
1131   u_int page=get_page(block<<12);
1132   u_int vpage=get_vpage(block<<12);
1133   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1134   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1135   u_int first,last;
1136   first=last=page;
1137   struct ll_entry *head;
1138   head=jump_dirty[vpage];
1139   //printf("page=%d vpage=%d\n",page,vpage);
1140   while(head!=NULL) {
1141     u_int start,end;
1142     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1143       get_bounds((int)head->addr,&start,&end);
1144       //printf("start: %x end: %x\n",start,end);
          // Source range lies inside [0x80000000, 0x80000000+RAM_SIZE)
1145       if(page<2048&&start>=0x80000000&&end<0x80000000+RAM_SIZE) {
1146         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1147           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1148           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1149         }
1150       }
1151 #ifndef DISABLE_TLB
          // Same widening for ranges at 0xC0000000 and above, translated
          // through memory_map[]
1152       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1153         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1154           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1155           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1156         }
1157       }
1158 #endif
1159     }
1160     head=head->next;
1161   }
1162   //printf("first=%d last=%d\n",first,last);
1163   invalidate_page(page);
1164   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1165   assert(last<page+5);
1166   // Invalidate the adjacent pages if a block crosses a 4K boundary
1167   while(first<page) {
1168     invalidate_page(first);
1169     first++;
1170   }
      // NOTE(review): this loop stops before 'last' itself (first<last);
      // confirm whether page 'last' is meant to be invalidated too.
1171   for(first=page+1;first<last;first++) {
1172     invalidate_page(first);
1173   }
1174   #ifdef __arm__
1175     do_clear_cache();
1176   #endif
1177   
1178   // Don't trap writes
1179   invalid_code[block]=1;
1180 #ifdef PCSX
1181   invalid_code[((u_int)0x80000000>>12)|page]=1;
1182 #endif
1183 #ifndef DISABLE_TLB
1184   // If there is a valid TLB entry for this page, remove write protect
1185   if(tlb_LUT_w[block]) {
1186     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1187     // CHECK: Is this right?
1188     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1189     u_int real_block=tlb_LUT_w[block]>>12;
1190     invalid_code[real_block]=1;
1191     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1192   }
1193   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1194 #endif
1195
1196   #ifdef USE_MINI_HT
1197   memset(mini_ht,-1,sizeof(mini_ht));
1198   #endif
1199 }
1200 void invalidate_addr(u_int addr)
1201 {
1202   invalidate_block(addr>>12);
1203 }
1204 // This is called when loading a save state.
1205 // Anything could have changed, so invalidate everything.
1206 void invalidate_all_pages()
1207 {
1208   u_int page,n;
1209   for(page=0;page<4096;page++)
1210     invalidate_page(page);
1211   for(page=0;page<1048576;page++)
1212     if(!invalid_code[page]) {
1213       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1214       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1215     }
1216   #ifdef __arm__
1217   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1218   #endif
1219   #ifdef USE_MINI_HT
1220   memset(mini_ht,-1,sizeof(mini_ht));
1221   #endif
1222   #ifndef DISABLE_TLB
1223   // TLB
1224   for(page=0;page<0x100000;page++) {
1225     if(tlb_LUT_r[page]) {
1226       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1227       if(!tlb_LUT_w[page]||!invalid_code[page])
1228         memory_map[page]|=0x40000000; // Write protect
1229     }
1230     else memory_map[page]=-1;
1231     if(page==0x80000) page=0xC0000;
1232   }
1233   tlb_hacks();
1234   #endif
1235 }
1236
1237 // Add an entry to jump_out after making a link
1238 void add_link(u_int vaddr,void *src)
1239 {
1240   u_int page=get_page(vaddr);
1241   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1242   ll_add(jump_out+page,vaddr,src);
1243   //int ptr=get_pointer(src);
1244   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1245 }
1246
// clean_blocks(page): walk jump_dirty[page] and, for each entry that still
// verifies as unmodified, add an entry pointing at its clean code address
// to jump_in and refresh any matching hash table slot.
// An entry is restored only if all of the following hold:
//   - invalid_code[] is clear for the entry's vaddr block;
//   - the entry's addr and its clean address both pass the expiry
//     comparison against 'out';
//   - verify_dirty() accepts it;
//   - no invalid_code[] flag is set across the bounds from get_bounds()
//     and the vaddr range checks do not set 'inv'.
// Entries are not removed from jump_dirty here.
1247 // If a code block was found to be unmodified (bit was set in
1248 // restore_candidate) and it remains unmodified (bit is clear
1249 // in invalid_code) then move the entries for that 4K page from
1250 // the dirty list to the clean list.
1251 void clean_blocks(u_int page)
1252 {
1253   struct ll_entry *head;
1254   inv_debug("INV: clean_blocks page=%d\n",page);
1255   head=jump_dirty[page];
1256   while(head!=NULL) {
1257     if(!invalid_code[head->vaddr>>12]) {
1258       // Don't restore blocks which are about to expire from the cache
1259       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1260         u_int start,end;
1261         if(verify_dirty((int)head->addr)) {
1262           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1263           u_int i;
1264           u_int inv=0;
1265           get_bounds((int)head->addr,&start,&end);
              // OR together invalid_code[] for every block in [start,end)
1266           if(start-(u_int)rdram<RAM_SIZE) {
1267             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1268               inv|=invalid_code[i];
1269             }
1270           }
              // vaddr at 0xC0000000 or above: the address translated via
              // memory_map[] must fall inside [start,end)
1271           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1272             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1273             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1274             if(addr<start||addr>=end) inv=1;
1275           }
1276           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1277             inv=1;
1278           }
1279           if(!inv) {
1280             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1281             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1282               u_int ppage=page;
1283 #ifndef DISABLE_TLB
1284               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1285 #endif
1286               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1287               //printf("page=%x, addr=%x\n",page,head->vaddr);
1288               //assert(head->vaddr>>12==(page|0x80000));
1289               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1290               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
                  // Hash slots are only refreshed for entries with reg32==0
1291               if(!head->reg32) {
1292                 if(ht_bin[0]==head->vaddr) {
1293                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1294                 }
1295                 if(ht_bin[2]==head->vaddr) {
1296                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1297                 }
1298               }
1299             }
1300           }
1301         }
1302       }
1303     }
1304     head=head->next;
1305   }
1306 }
1307
1308
1309 void mov_alloc(struct regstat *current,int i)
1310 {
1311   // Note: Don't need to actually alloc the source registers
1312   if((~current->is32>>rs1[i])&1) {
1313     //alloc_reg64(current,i,rs1[i]);
1314     alloc_reg64(current,i,rt1[i]);
1315     current->is32&=~(1LL<<rt1[i]);
1316   } else {
1317     //alloc_reg(current,i,rs1[i]);
1318     alloc_reg(current,i,rt1[i]);
1319     current->is32|=(1LL<<rt1[i]);
1320   }
1321   clear_const(current,rs1[i]);
1322   clear_const(current,rt1[i]);
1323   dirty_reg(current,rt1[i]);
1324 }
1325
1326 void shiftimm_alloc(struct regstat *current,int i)
1327 {
1328   clear_const(current,rs1[i]);
1329   clear_const(current,rt1[i]);
1330   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1331   {
1332     if(rt1[i]) {
1333       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1334       else lt1[i]=rs1[i];
1335       alloc_reg(current,i,rt1[i]);
1336       current->is32|=1LL<<rt1[i];
1337       dirty_reg(current,rt1[i]);
1338     }
1339   }
1340   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1341   {
1342     if(rt1[i]) {
1343       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1344       alloc_reg64(current,i,rt1[i]);
1345       current->is32&=~(1LL<<rt1[i]);
1346       dirty_reg(current,rt1[i]);
1347     }
1348   }
1349   if(opcode2[i]==0x3c) // DSLL32
1350   {
1351     if(rt1[i]) {
1352       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1353       alloc_reg64(current,i,rt1[i]);
1354       current->is32&=~(1LL<<rt1[i]);
1355       dirty_reg(current,rt1[i]);
1356     }
1357   }
1358   if(opcode2[i]==0x3e) // DSRL32
1359   {
1360     if(rt1[i]) {
1361       alloc_reg64(current,i,rs1[i]);
1362       if(imm[i]==32) {
1363         alloc_reg64(current,i,rt1[i]);
1364         current->is32&=~(1LL<<rt1[i]);
1365       } else {
1366         alloc_reg(current,i,rt1[i]);
1367         current->is32|=1LL<<rt1[i];
1368       }
1369       dirty_reg(current,rt1[i]);
1370     }
1371   }
1372   if(opcode2[i]==0x3f) // DSRA32
1373   {
1374     if(rt1[i]) {
1375       alloc_reg64(current,i,rs1[i]);
1376       alloc_reg(current,i,rt1[i]);
1377       current->is32|=1LL<<rt1[i];
1378       dirty_reg(current,rt1[i]);
1379     }
1380   }
1381 }
1382
1383 void shift_alloc(struct regstat *current,int i)
1384 {
1385   if(rt1[i]) {
1386     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1387     {
1388       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1389       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1390       alloc_reg(current,i,rt1[i]);
1391       if(rt1[i]==rs2[i]) {
1392         alloc_reg_temp(current,i,-1);
1393         minimum_free_regs[i]=1;
1394       }
1395       current->is32|=1LL<<rt1[i];
1396     } else { // DSLLV/DSRLV/DSRAV
1397       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1398       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1399       alloc_reg64(current,i,rt1[i]);
1400       current->is32&=~(1LL<<rt1[i]);
1401       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1402       {
1403         alloc_reg_temp(current,i,-1);
1404         minimum_free_regs[i]=1;
1405       }
1406     }
1407     clear_const(current,rs1[i]);
1408     clear_const(current,rs2[i]);
1409     clear_const(current,rt1[i]);
1410     dirty_reg(current,rt1[i]);
1411   }
1412 }
1413
// alu_alloc: register allocation for a three-register ALU instruction i.
// The opcode2 ranges handled are ADD/ADDU/SUB/SUBU, SLT/SLTU,
// AND/OR/XOR/NOR and DADD/DADDU/DSUB/DSUBU.  Each case allocates source
// and target registers (32- or 64-bit) and updates the target's bit in
// current->is32.  On exit, constant state of rs1, rs2 and rt1 is cleared
// and rt1 is marked dirty, whatever the opcode.
1414 void alu_alloc(struct regstat *current,int i)
1415 {
1416   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1417     if(rt1[i]) {
1418       if(rs1[i]&&rs2[i]) {
1419         alloc_reg(current,i,rs1[i]);
1420         alloc_reg(current,i,rs2[i]);
1421       }
1422       else {
            // One operand is r0: only allocate a source if needed_again()
            // reports a later use
1423         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1424         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1425       }
1426       alloc_reg(current,i,rt1[i]);
1427     }
        // Result is flagged 32-bit (done even when rt1 is r0)
1428     current->is32|=1LL<<rt1[i];
1429   }
1430   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1431     if(rt1[i]) {
          // 64-bit sources are needed unless both operands are flagged 32-bit
1432       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1433       {
1434         alloc_reg64(current,i,rs1[i]);
1435         alloc_reg64(current,i,rs2[i]);
1436         alloc_reg(current,i,rt1[i]);
1437       } else {
1438         alloc_reg(current,i,rs1[i]);
1439         alloc_reg(current,i,rs2[i]);
1440         alloc_reg(current,i,rt1[i]);
1441       }
1442     }
1443     current->is32|=1LL<<rt1[i];
1444   }
1445   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1446     if(rt1[i]) {
1447       if(rs1[i]&&rs2[i]) {
1448         alloc_reg(current,i,rs1[i]);
1449         alloc_reg(current,i,rs2[i]);
1450       }
1451       else
1452       {
1453         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1454         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1455       }
1456       alloc_reg(current,i,rt1[i]);
          // At least one operand is not flagged 32-bit: the result is
          // flagged 64-bit
1457       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1458       {
            // Allocate the upper half unless rt1's bit is set in current->uu
1459         if(!((current->uu>>rt1[i])&1)) {
1460           alloc_reg64(current,i,rt1[i]);
1461         }
1462         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1463           if(rs1[i]&&rs2[i]) {
1464             alloc_reg64(current,i,rs1[i]);
1465             alloc_reg64(current,i,rs2[i]);
1466           }
1467           else
1468           {
1469             // Is it really worth it to keep 64-bit values in registers?
1470             #ifdef NATIVE_64BIT
1471             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1472             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1473             #endif
1474           }
1475         }
1476         current->is32&=~(1LL<<rt1[i]);
1477       } else {
1478         current->is32|=1LL<<rt1[i];
1479       }
1480     }
1481   }
1482   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1483     if(rt1[i]) {
1484       if(rs1[i]&&rs2[i]) {
            // Full 64-bit allocation unless rt1's bit is set in current->uu
            // and its upper half has no host register mapped
1485         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1486           alloc_reg64(current,i,rs1[i]);
1487           alloc_reg64(current,i,rs2[i]);
1488           alloc_reg64(current,i,rt1[i]);
1489         } else {
1490           alloc_reg(current,i,rs1[i]);
1491           alloc_reg(current,i,rs2[i]);
1492           alloc_reg(current,i,rt1[i]);
1493         }
1494       }
1495       else {
1496         alloc_reg(current,i,rt1[i]);
1497         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1498           // DADD used as move, or zeroing
1499           // If we have a 64-bit source, then make the target 64 bits too
1500           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1501             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1502             alloc_reg64(current,i,rt1[i]);
1503           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1504             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1505             alloc_reg64(current,i,rt1[i]);
1506           }
1507           if(opcode2[i]>=0x2e&&rs2[i]) {
1508             // DSUB used as negation - 64-bit result
1509             // If we have a 32-bit register, extend it to 64 bits
1510             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1511             alloc_reg64(current,i,rt1[i]);
1512           }
1513         }
1514       }
          // Update the target's is32 flag: 64-bit when both operands are
          // non-zero registers, otherwise copied from the single non-zero
          // operand, and 32-bit when both operands are r0
1515       if(rs1[i]&&rs2[i]) {
1516         current->is32&=~(1LL<<rt1[i]);
1517       } else if(rs1[i]) {
1518         current->is32&=~(1LL<<rt1[i]);
1519         if((current->is32>>rs1[i])&1)
1520           current->is32|=1LL<<rt1[i];
1521       } else if(rs2[i]) {
1522         current->is32&=~(1LL<<rt1[i]);
1523         if((current->is32>>rs2[i])&1)
1524           current->is32|=1LL<<rt1[i];
1525       } else {
1526         current->is32|=1LL<<rt1[i];
1527       }
1528     }
1529   }
1530   clear_const(current,rs1[i]);
1531   clear_const(current,rs2[i]);
1532   clear_const(current,rt1[i]);
1533   dirty_reg(current,rt1[i]);
1534 }
1535
1536 void imm16_alloc(struct regstat *current,int i)
1537 {
1538   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1539   else lt1[i]=rs1[i];
1540   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1541   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1542     current->is32&=~(1LL<<rt1[i]);
1543     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1544       // TODO: Could preserve the 32-bit flag if the immediate is zero
1545       alloc_reg64(current,i,rt1[i]);
1546       alloc_reg64(current,i,rs1[i]);
1547     }
1548     clear_const(current,rs1[i]);
1549     clear_const(current,rt1[i]);
1550   }
1551   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1552     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1553     current->is32|=1LL<<rt1[i];
1554     clear_const(current,rs1[i]);
1555     clear_const(current,rt1[i]);
1556   }
1557   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1558     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1559       if(rs1[i]!=rt1[i]) {
1560         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1561         alloc_reg64(current,i,rt1[i]);
1562         current->is32&=~(1LL<<rt1[i]);
1563       }
1564     }
1565     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1566     if(is_const(current,rs1[i])) {
1567       int v=get_const(current,rs1[i]);
1568       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1569       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1570       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1571     }
1572     else clear_const(current,rt1[i]);
1573   }
1574   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1575     if(is_const(current,rs1[i])) {
1576       int v=get_const(current,rs1[i]);
1577       set_const(current,rt1[i],v+imm[i]);
1578     }
1579     else clear_const(current,rt1[i]);
1580     current->is32|=1LL<<rt1[i];
1581   }
1582   else {
1583     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1584     current->is32|=1LL<<rt1[i];
1585   }
1586   dirty_reg(current,rt1[i]);
1587 }
1588
1589 void load_alloc(struct regstat *current,int i)
1590 {
1591   clear_const(current,rt1[i]);
1592   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1593   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1594   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1595   if(rt1[i]) {
1596     alloc_reg(current,i,rt1[i]);
1597     if(get_reg(current->regmap,rt1[i])<0) {
1598       // dummy load, but we still need a register to calculate the address
1599       alloc_reg_temp(current,i,-1);
1600       minimum_free_regs[i]=1;
1601     }
1602     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1603     {
1604       current->is32&=~(1LL<<rt1[i]);
1605       alloc_reg64(current,i,rt1[i]);
1606     }
1607     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1608     {
1609       current->is32&=~(1LL<<rt1[i]);
1610       alloc_reg64(current,i,rt1[i]);
1611       alloc_all(current,i);
1612       alloc_reg64(current,i,FTEMP);
1613       minimum_free_regs[i]=HOST_REGS;
1614     }
1615     else current->is32|=1LL<<rt1[i];
1616     dirty_reg(current,rt1[i]);
1617     // If using TLB, need a register for pointer to the mapping table
1618     if(using_tlb) alloc_reg(current,i,TLREG);
1619     // LWL/LWR need a temporary register for the old value
1620     if(opcode[i]==0x22||opcode[i]==0x26)
1621     {
1622       alloc_reg(current,i,FTEMP);
1623       alloc_reg_temp(current,i,-1);
1624       minimum_free_regs[i]=1;
1625     }
1626   }
1627   else
1628   {
1629     // Load to r0 (dummy load)
1630     // but we still need a register to calculate the address
1631     if(opcode[i]==0x22||opcode[i]==0x26)
1632     {
1633       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1634     }
1635     alloc_reg_temp(current,i,-1);
1636     minimum_free_regs[i]=1;
1637     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1638     {
1639       alloc_all(current,i);
1640       alloc_reg64(current,i,FTEMP);
1641       minimum_free_regs[i]=HOST_REGS;
1642     }
1643   }
1644 }
1645
1646 void store_alloc(struct regstat *current,int i)
1647 {
1648   clear_const(current,rs2[i]);
1649   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1650   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1651   alloc_reg(current,i,rs2[i]);
1652   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1653     alloc_reg64(current,i,rs2[i]);
1654     if(rs2[i]) alloc_reg(current,i,FTEMP);
1655   }
1656   // If using TLB, need a register for pointer to the mapping table
1657   if(using_tlb) alloc_reg(current,i,TLREG);
1658   #if defined(HOST_IMM8)
1659   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1660   else alloc_reg(current,i,INVCP);
1661   #endif
1662   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWL/SDL/SDR
1663     alloc_reg(current,i,FTEMP);
1664   }
1665   // We need a temporary register for address generation
1666   alloc_reg_temp(current,i,-1);
1667   minimum_free_regs[i]=1;
1668 }
1669
1670 void c1ls_alloc(struct regstat *current,int i)
1671 {
1672   //clear_const(current,rs1[i]); // FIXME
1673   clear_const(current,rt1[i]);
1674   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1675   alloc_reg(current,i,CSREG); // Status
1676   alloc_reg(current,i,FTEMP);
1677   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1678     alloc_reg64(current,i,FTEMP);
1679   }
1680   // If using TLB, need a register for pointer to the mapping table
1681   if(using_tlb) alloc_reg(current,i,TLREG);
1682   #if defined(HOST_IMM8)
1683   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1684   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1685     alloc_reg(current,i,INVCP);
1686   #endif
1687   // We need a temporary register for address generation
1688   alloc_reg_temp(current,i,-1);
1689 }
1690
1691 void c2ls_alloc(struct regstat *current,int i)
1692 {
1693   clear_const(current,rt1[i]);
1694   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1695   alloc_reg(current,i,FTEMP);
1696   // If using TLB, need a register for pointer to the mapping table
1697   if(using_tlb) alloc_reg(current,i,TLREG);
1698   #if defined(HOST_IMM8)
1699   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1700   else if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1701     alloc_reg(current,i,INVCP);
1702   #endif
1703   // We need a temporary register for address generation
1704   alloc_reg_temp(current,i,-1);
1705   minimum_free_regs[i]=1;
1706 }
1707
1708 #ifndef multdiv_alloc
1709 void multdiv_alloc(struct regstat *current,int i)
1710 {
1711   //  case 0x18: MULT
1712   //  case 0x19: MULTU
1713   //  case 0x1A: DIV
1714   //  case 0x1B: DIVU
1715   //  case 0x1C: DMULT
1716   //  case 0x1D: DMULTU
1717   //  case 0x1E: DDIV
1718   //  case 0x1F: DDIVU
1719   clear_const(current,rs1[i]);
1720   clear_const(current,rs2[i]);
1721   if(rs1[i]&&rs2[i])
1722   {
1723     if((opcode2[i]&4)==0) // 32-bit
1724     {
1725       current->u&=~(1LL<<HIREG);
1726       current->u&=~(1LL<<LOREG);
1727       alloc_reg(current,i,HIREG);
1728       alloc_reg(current,i,LOREG);
1729       alloc_reg(current,i,rs1[i]);
1730       alloc_reg(current,i,rs2[i]);
1731       current->is32|=1LL<<HIREG;
1732       current->is32|=1LL<<LOREG;
1733       dirty_reg(current,HIREG);
1734       dirty_reg(current,LOREG);
1735     }
1736     else // 64-bit
1737     {
1738       current->u&=~(1LL<<HIREG);
1739       current->u&=~(1LL<<LOREG);
1740       current->uu&=~(1LL<<HIREG);
1741       current->uu&=~(1LL<<LOREG);
1742       alloc_reg64(current,i,HIREG);
1743       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1744       alloc_reg64(current,i,rs1[i]);
1745       alloc_reg64(current,i,rs2[i]);
1746       alloc_all(current,i);
1747       current->is32&=~(1LL<<HIREG);
1748       current->is32&=~(1LL<<LOREG);
1749       dirty_reg(current,HIREG);
1750       dirty_reg(current,LOREG);
1751       minimum_free_regs[i]=HOST_REGS;
1752     }
1753   }
1754   else
1755   {
1756     // Multiply by zero is zero.
1757     // MIPS does not have a divide by zero exception.
1758     // The result is undefined, we return zero.
1759     alloc_reg(current,i,HIREG);
1760     alloc_reg(current,i,LOREG);
1761     current->is32|=1LL<<HIREG;
1762     current->is32|=1LL<<LOREG;
1763     dirty_reg(current,HIREG);
1764     dirty_reg(current,LOREG);
1765   }
1766 }
1767 #endif
1768
1769 void cop0_alloc(struct regstat *current,int i)
1770 {
1771   if(opcode2[i]==0) // MFC0
1772   {
1773     if(rt1[i]) {
1774       clear_const(current,rt1[i]);
1775       alloc_all(current,i);
1776       alloc_reg(current,i,rt1[i]);
1777       current->is32|=1LL<<rt1[i];
1778       dirty_reg(current,rt1[i]);
1779     }
1780   }
1781   else if(opcode2[i]==4) // MTC0
1782   {
1783     if(rs1[i]){
1784       clear_const(current,rs1[i]);
1785       alloc_reg(current,i,rs1[i]);
1786       alloc_all(current,i);
1787     }
1788     else {
1789       alloc_all(current,i); // FIXME: Keep r0
1790       current->u&=~1LL;
1791       alloc_reg(current,i,0);
1792     }
1793   }
1794   else
1795   {
1796     // TLBR/TLBWI/TLBWR/TLBP/ERET
1797     assert(opcode2[i]==0x10);
1798     alloc_all(current,i);
1799   }
1800   minimum_free_regs[i]=HOST_REGS;
1801 }
1802
1803 void cop1_alloc(struct regstat *current,int i)
1804 {
1805   alloc_reg(current,i,CSREG); // Load status
1806   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1807   {
1808     if(rt1[i]){
1809       clear_const(current,rt1[i]);
1810       if(opcode2[i]==1) {
1811         alloc_reg64(current,i,rt1[i]); // DMFC1
1812         current->is32&=~(1LL<<rt1[i]);
1813       }else{
1814         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1815         current->is32|=1LL<<rt1[i];
1816       }
1817       dirty_reg(current,rt1[i]);
1818     }
1819     alloc_reg_temp(current,i,-1);
1820   }
1821   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1822   {
1823     if(rs1[i]){
1824       clear_const(current,rs1[i]);
1825       if(opcode2[i]==5)
1826         alloc_reg64(current,i,rs1[i]); // DMTC1
1827       else
1828         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1829       alloc_reg_temp(current,i,-1);
1830     }
1831     else {
1832       current->u&=~1LL;
1833       alloc_reg(current,i,0);
1834       alloc_reg_temp(current,i,-1);
1835     }
1836   }
1837   minimum_free_regs[i]=1;
1838 }
1839 void fconv_alloc(struct regstat *current,int i)
1840 {
1841   alloc_reg(current,i,CSREG); // Load status
1842   alloc_reg_temp(current,i,-1);
1843   minimum_free_regs[i]=1;
1844 }
// Register allocation for a FLOAT instruction i.
// The requirements are exactly those of fconv_alloc (CSREG, one temporary,
// minimum_free_regs[i]=1), so delegate to it.
void float_alloc(struct regstat *current,int i)
{
  fconv_alloc(current,i);
}
// Register allocation for a C2OP instruction i: a single temporary.
void c2op_alloc(struct regstat *current,int i)
{
  const int no_specific_reg=-1;
  alloc_reg_temp(current,i,no_specific_reg);
}
1855 void fcomp_alloc(struct regstat *current,int i)
1856 {
1857   alloc_reg(current,i,CSREG); // Load status
1858   alloc_reg(current,i,FSREG); // Load flags
1859   dirty_reg(current,FSREG); // Flag will be modified
1860   alloc_reg_temp(current,i,-1);
1861   minimum_free_regs[i]=1;
1862 }
1863
1864 void syscall_alloc(struct regstat *current,int i)
1865 {
1866   alloc_cc(current,i);
1867   dirty_reg(current,CCREG);
1868   alloc_all(current,i);
1869   minimum_free_regs[i]=HOST_REGS;
1870   current->isconst=0;
1871 }
1872
1873 void delayslot_alloc(struct regstat *current,int i)
1874 {
1875   switch(itype[i]) {
1876     case UJUMP:
1877     case CJUMP:
1878     case SJUMP:
1879     case RJUMP:
1880     case FJUMP:
1881     case SYSCALL:
1882     case HLECALL:
1883     case SPAN:
1884       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1885       printf("Disabled speculative precompilation\n");
1886       stop_after_jal=1;
1887       break;
1888     case IMM16:
1889       imm16_alloc(current,i);
1890       break;
1891     case LOAD:
1892     case LOADLR:
1893       load_alloc(current,i);
1894       break;
1895     case STORE:
1896     case STORELR:
1897       store_alloc(current,i);
1898       break;
1899     case ALU:
1900       alu_alloc(current,i);
1901       break;
1902     case SHIFT:
1903       shift_alloc(current,i);
1904       break;
1905     case MULTDIV:
1906       multdiv_alloc(current,i);
1907       break;
1908     case SHIFTIMM:
1909       shiftimm_alloc(current,i);
1910       break;
1911     case MOV:
1912       mov_alloc(current,i);
1913       break;
1914     case COP0:
1915       cop0_alloc(current,i);
1916       break;
1917     case COP1:
1918     case COP2:
1919       cop1_alloc(current,i);
1920       break;
1921     case C1LS:
1922       c1ls_alloc(current,i);
1923       break;
1924     case C2LS:
1925       c2ls_alloc(current,i);
1926       break;
1927     case FCONV:
1928       fconv_alloc(current,i);
1929       break;
1930     case FLOAT:
1931       float_alloc(current,i);
1932       break;
1933     case FCOMP:
1934       fcomp_alloc(current,i);
1935       break;
1936     case C2OP:
1937       c2op_alloc(current,i);
1938       break;
1939   }
1940 }
1941
1942 // Special case where a branch and delay slot span two pages in virtual memory
1943 static void pagespan_alloc(struct regstat *current,int i)
1944 {
1945   current->isconst=0;
1946   current->wasconst=0;
1947   regs[i].wasconst=0;
1948   minimum_free_regs[i]=HOST_REGS;
1949   alloc_all(current,i);
1950   alloc_cc(current,i);
1951   dirty_reg(current,CCREG);
1952   if(opcode[i]==3) // JAL
1953   {
1954     alloc_reg(current,i,31);
1955     dirty_reg(current,31);
1956   }
1957   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1958   {
1959     alloc_reg(current,i,rs1[i]);
1960     if (rt1[i]!=0) {
1961       alloc_reg(current,i,rt1[i]);
1962       dirty_reg(current,rt1[i]);
1963     }
1964   }
1965   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1966   {
1967     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1968     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1969     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1970     {
1971       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1972       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1973     }
1974   }
1975   else
1976   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1977   {
1978     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1979     if(!((current->is32>>rs1[i])&1))
1980     {
1981       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1982     }
1983   }
1984   else
1985   if(opcode[i]==0x11) // BC1
1986   {
1987     alloc_reg(current,i,FSREG);
1988     alloc_reg(current,i,CSREG);
1989   }
1990   //else ...
1991 }
1992
1993 add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1994 {
1995   stubs[stubcount][0]=type;
1996   stubs[stubcount][1]=addr;
1997   stubs[stubcount][2]=retaddr;
1998   stubs[stubcount][3]=a;
1999   stubs[stubcount][4]=b;
2000   stubs[stubcount][5]=c;
2001   stubs[stubcount][6]=d;
2002   stubs[stubcount][7]=e;
2003   stubcount++;
2004 }
2005
2006 // Write out a single register
2007 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
2008 {
2009   int hr;
2010   for(hr=0;hr<HOST_REGS;hr++) {
2011     if(hr!=EXCLUDE_REG) {
2012       if((regmap[hr]&63)==r) {
2013         if((dirty>>hr)&1) {
2014           if(regmap[hr]<64) {
2015             emit_storereg(r,hr);
2016 #ifndef FORCE32
2017             if((is32>>regmap[hr])&1) {
2018               emit_sarimm(hr,31,hr);
2019               emit_storereg(r|64,hr);
2020             }
2021 #endif
2022           }else{
2023             emit_storereg(r|64,hr);
2024           }
2025         }
2026       }
2027     }
2028   }
2029 }
2030
2031 int mchecksum()
2032 {
2033   //if(!tracedebug) return 0;
2034   int i;
2035   int sum=0;
2036   for(i=0;i<2097152;i++) {
2037     unsigned int temp=sum;
2038     sum<<=1;
2039     sum|=(~temp)>>31;
2040     sum^=((u_int *)rdram)[i];
2041   }
2042   return sum;
2043 }
2044 int rchecksum()
2045 {
2046   int i;
2047   int sum=0;
2048   for(i=0;i<64;i++)
2049     sum^=((u_int *)reg)[i];
2050   return sum;
2051 }
2052 void rlist()
2053 {
2054   int i;
2055   printf("TRACE: ");
2056   for(i=0;i<32;i++)
2057     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
2058   printf("\n");
2059 #ifndef DISABLE_COP1
2060   printf("TRACE: ");
2061   for(i=0;i<32;i++)
2062     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
2063   printf("\n");
2064 #endif
2065 }
2066
2067 void enabletrace()
2068 {
2069   tracedebug=1;
2070 }
2071
// Debug helper that prints a trace record while Count is inside a
// hard-coded window: Count, next_interupt and the RAM checksum from
// mchecksum(), followed by the register dump from rlist().
//
// The window is -2084597794 <= (signed)Count < 0; outside it nothing is
// printed.  The remaining commented-out lines are alternative trace formats
// and trigger conditions kept from earlier debugging sessions.
//
// The parameter i is only used for its address: on i386 the word just below
// it on the stack is printed (presumably the caller's return address --
// TODO confirm).  On ARM the words following a local variable are dumped
// instead.  Both depend on the exact stack layout of this function, so the
// code must not be restructured.
2072 void memdebug(int i)
2073 {
2074   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
2075   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
2076   //rlist();
2077   //if(tracedebug) {
2078   //if(Count>=-2084597794) {
       // Only trace while Count, viewed as a signed value, is in [-2084597794, 0)
2079   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
2080   //if(0) {
2081     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
2082     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2083     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2084     rlist();
2085     #ifdef __i386__
         // Word stored immediately below the parameter on the stack
2086     printf("TRACE: %x\n",(&i)[-1]);
2087     #endif
2088     #ifdef __arm__
         // j is never assigned; it only serves as an anchor into the stack frame.
         // Word 10 above it is printed alone, then words 1..20.
2089     int j;
2090     printf("TRACE: %x \n",(&j)[10]);
2091     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2092     #endif
2093     //fflush(stdout);
2094   }
2095   //printf("TRACE: %x\n",(&i)[-1]);
2096 }
2097
2098 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2099 {
2100   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2101 }
2102
// Emit host code for the register-register ALU instruction at index i,
// using the guest-to-host register mapping in i_regs->regmap.
//
// The operation is selected by opcode2[i]:
//   0x20-0x23  ADD/ADDU/SUB/SUBU        (32-bit)
//   0x2c-0x2f  DADD/DADDU/DSUB/DSUBU    (64-bit, low/high host register pair)
//   0x2a,0x2b  SLT/SLTU
//   0x24-0x27  AND/OR/XOR/NOR
//
// In every group nothing is emitted when the destination rt1[i] is r0 or
// when get_reg() reports no host register for it (negative result).
// Guest register number |64 refers to the upper 32 bits of that register.
// A source that is r0 is never read; each group has dedicated paths that
// reduce to a move, negate, zero or constant in that case.
2103 void alu_assemble(int i,struct regstat *i_regs)
2104 {
2105   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2106     if(rt1[i]) {
2107       signed char s1,s2,t;
2108       t=get_reg(i_regs->regmap,rt1[i]);
2109       if(t>=0) {
2110         s1=get_reg(i_regs->regmap,rs1[i]);
2111         s2=get_reg(i_regs->regmap,rs2[i]);
2112         if(rs1[i]&&rs2[i]) {
2113           assert(s1>=0);
2114           assert(s2>=0);
               // Bit 1 of opcode2 distinguishes the subtract forms from the add forms
2115           if(opcode2[i]&2) emit_sub(s1,s2,t);
2116           else emit_add(s1,s2,t);
2117         }
2118         else if(rs1[i]) {
               // rs2 is r0: result is rs1 for both add and subtract
2119           if(s1>=0) emit_mov(s1,t);
2120           else emit_loadreg(rs1[i],t);
2121         }
2122         else if(rs2[i]) {
               // rs1 is r0: result is rs2 (add) or -rs2 (subtract)
2123           if(s2>=0) {
2124             if(opcode2[i]&2) emit_neg(s2,t);
2125             else emit_mov(s2,t);
2126           }
2127           else {
2128             emit_loadreg(rs2[i],t);
2129             if(opcode2[i]&2) emit_neg(t,t);
2130           }
2131         }
2132         else emit_zeroreg(t);
2133       }
2134     }
2135   }
2136   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2137     if(rt1[i]) {
2138       signed char s1l,s2l,s1h,s2h,tl,th;
2139       tl=get_reg(i_regs->regmap,rt1[i]);
2140       th=get_reg(i_regs->regmap,rt1[i]|64);
2141       if(tl>=0) {
2142         s1l=get_reg(i_regs->regmap,rs1[i]);
2143         s2l=get_reg(i_regs->regmap,rs2[i]);
2144         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2145         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2146         if(rs1[i]&&rs2[i]) {
2147           assert(s1l>=0);
2148           assert(s2l>=0);
               // Low word first; the high word (if mapped) is combined afterwards
2149           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2150           else emit_adds(s1l,s2l,tl);
2151           if(th>=0) {
                 // The if/else below is split across the preprocessor arms: exactly
                 // one of the two "if(opcode2[i]&2)" statements is compiled, and the
                 // trailing "else" after #endif pairs with whichever one that is.
                 // NOTE(review): the add path uses plain emit_add for the high
                 // word although the low word used emit_adds; this looks like the
                 // carry out of the low word is not propagated -- confirm.
2152             #ifdef INVERTED_CARRY
2153             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2154             #else
2155             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2156             #endif
2157             else emit_add(s1h,s2h,th);
2158           }
2159         }
2160         else if(rs1[i]) {
               // rs2 is r0: copy rs1 (both halves) into the destination
2161           if(s1l>=0) emit_mov(s1l,tl);
2162           else emit_loadreg(rs1[i],tl);
2163           if(th>=0) {
2164             if(s1h>=0) emit_mov(s1h,th);
2165             else emit_loadreg(rs1[i]|64,th);
2166           }
2167         }
2168         else if(rs2[i]) {
               // rs1 is r0: copy rs2 (add) or compute 0-rs2 (subtract)
2169           if(s2l>=0) {
2170             if(opcode2[i]&2) emit_negs(s2l,tl);
2171             else emit_mov(s2l,tl);
2172           }
2173           else {
2174             emit_loadreg(rs2[i],tl);
2175             if(opcode2[i]&2) emit_negs(tl,tl);
2176           }
2177           if(th>=0) {
2178             #ifdef INVERTED_CARRY
2179             if(s2h>=0) emit_mov(s2h,th);
2180             else emit_loadreg(rs2[i]|64,th);
2181             if(opcode2[i]&2) {
2182               emit_adcimm(-1,th); // x86 has inverted carry flag
2183               emit_not(th,th);
2184             }
2185             #else
2186             if(opcode2[i]&2) {
2187               if(s2h>=0) emit_rscimm(s2h,0,th);
2188               else {
2189                 emit_loadreg(rs2[i]|64,th);
2190                 emit_rscimm(th,0,th);
2191               }
2192             }else{
2193               if(s2h>=0) emit_mov(s2h,th);
2194               else emit_loadreg(rs2[i]|64,th);
2195             }
2196             #endif
2197           }
2198         }
2199         else {
2200           emit_zeroreg(tl);
2201           if(th>=0) emit_zeroreg(th);
2202         }
2203       }
2204     }
2205   }
2206   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2207     if(rt1[i]) {
2208       signed char s1l,s1h,s2l,s2h,t;
           // 64-bit comparison unless both sources were flagged 32-bit in was32
2209       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2210       {
2211         t=get_reg(i_regs->regmap,rt1[i]);
2212         //assert(t>=0);
2213         if(t>=0) {
2214           s1l=get_reg(i_regs->regmap,rs1[i]);
2215           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2216           s2l=get_reg(i_regs->regmap,rs2[i]);
2217           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2218           if(rs2[i]==0) // rx<r0
2219           {
2220             assert(s1h>=0);
2221             if(opcode2[i]==0x2a) // SLT
                   // Signed "less than zero" is the sign bit of the upper word
2222               emit_shrimm(s1h,31,t);
2223             else // SLTU (unsigned can not be less than zero)
2224               emit_zeroreg(t);
2225           }
2226           else if(rs1[i]==0) // r0<rx
2227           {
2228             assert(s2h>=0);
2229             if(opcode2[i]==0x2a) // SLT
2230               emit_set_gz64_32(s2h,s2l,t);
2231             else // SLTU (set if not zero)
2232               emit_set_nz64_32(s2h,s2l,t);
2233           }
2234           else {
2235             assert(s1l>=0);assert(s1h>=0);
2236             assert(s2l>=0);assert(s2h>=0);
2237             if(opcode2[i]==0x2a) // SLT
2238               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2239             else // SLTU
2240               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2241           }
2242         }
2243       } else {
             // Both sources 32-bit: same three cases using only the low words
2244         t=get_reg(i_regs->regmap,rt1[i]);
2245         //assert(t>=0);
2246         if(t>=0) {
2247           s1l=get_reg(i_regs->regmap,rs1[i]);
2248           s2l=get_reg(i_regs->regmap,rs2[i]);
2249           if(rs2[i]==0) // rx<r0
2250           {
2251             assert(s1l>=0);
2252             if(opcode2[i]==0x2a) // SLT
2253               emit_shrimm(s1l,31,t);
2254             else // SLTU (unsigned can not be less than zero)
2255               emit_zeroreg(t);
2256           }
2257           else if(rs1[i]==0) // r0<rx
2258           {
2259             assert(s2l>=0);
2260             if(opcode2[i]==0x2a) // SLT
2261               emit_set_gz32(s2l,t);
2262             else // SLTU (set if not zero)
2263               emit_set_nz32(s2l,t);
2264           }
2265           else{
2266             assert(s1l>=0);assert(s2l>=0);
2267             if(opcode2[i]==0x2a) // SLT
2268               emit_set_if_less32(s1l,s2l,t);
2269             else // SLTU
2270               emit_set_if_carry32(s1l,s2l,t);
2271           }
2272         }
2273       }
2274     }
2275   }
2276   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2277     if(rt1[i]) {
2278       signed char s1l,s1h,s2l,s2h,th,tl;
2279       tl=get_reg(i_regs->regmap,rt1[i]);
2280       th=get_reg(i_regs->regmap,rt1[i]|64);
           // 64-bit path only when a source is not 32-bit AND the destination's
           // upper half actually has a host register; otherwise fall to 32-bit
2281       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2282       {
2283         assert(tl>=0);
             // The check below repeats the assert so release builds (NDEBUG) stay safe
2284         if(tl>=0) {
2285           s1l=get_reg(i_regs->regmap,rs1[i]);
2286           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2287           s2l=get_reg(i_regs->regmap,rs2[i]);
2288           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2289           if(rs1[i]&&rs2[i]) {
2290             assert(s1l>=0);assert(s1h>=0);
2291             assert(s2l>=0);assert(s2h>=0);
2292             if(opcode2[i]==0x24) { // AND
2293               emit_and(s1l,s2l,tl);
2294               emit_and(s1h,s2h,th);
2295             } else
2296             if(opcode2[i]==0x25) { // OR
2297               emit_or(s1l,s2l,tl);
2298               emit_or(s1h,s2h,th);
2299             } else
2300             if(opcode2[i]==0x26) { // XOR
2301               emit_xor(s1l,s2l,tl);
2302               emit_xor(s1h,s2h,th);
2303             } else
2304             if(opcode2[i]==0x27) { // NOR
2305               emit_or(s1l,s2l,tl);
2306               emit_or(s1h,s2h,th);
2307               emit_not(tl,tl);
2308               emit_not(th,th);
2309             }
2310           }
2311           else
2312           {
                 // At least one source is r0: AND gives 0, OR/XOR give the other
                 // operand, NOR gives its complement (all ones if both are r0)
2313             if(opcode2[i]==0x24) { // AND
2314               emit_zeroreg(tl);
2315               emit_zeroreg(th);
2316             } else
2317             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2318               if(rs1[i]){
2319                 if(s1l>=0) emit_mov(s1l,tl);
2320                 else emit_loadreg(rs1[i],tl);
2321                 if(s1h>=0) emit_mov(s1h,th);
2322                 else emit_loadreg(rs1[i]|64,th);
2323               }
2324               else
2325               if(rs2[i]){
2326                 if(s2l>=0) emit_mov(s2l,tl);
2327                 else emit_loadreg(rs2[i],tl);
2328                 if(s2h>=0) emit_mov(s2h,th);
2329                 else emit_loadreg(rs2[i]|64,th);
2330               }
2331               else{
2332                 emit_zeroreg(tl);
2333                 emit_zeroreg(th);
2334               }
2335             } else
2336             if(opcode2[i]==0x27) { // NOR
2337               if(rs1[i]){
2338                 if(s1l>=0) emit_not(s1l,tl);
2339                 else{
2340                   emit_loadreg(rs1[i],tl);
2341                   emit_not(tl,tl);
2342                 }
2343                 if(s1h>=0) emit_not(s1h,th);
2344                 else{
2345                   emit_loadreg(rs1[i]|64,th);
2346                   emit_not(th,th);
2347                 }
2348               }
2349               else
2350               if(rs2[i]){
2351                 if(s2l>=0) emit_not(s2l,tl);
2352                 else{
2353                   emit_loadreg(rs2[i],tl);
2354                   emit_not(tl,tl);
2355                 }
2356                 if(s2h>=0) emit_not(s2h,th);
2357                 else{
2358                   emit_loadreg(rs2[i]|64,th);
2359                   emit_not(th,th);
2360                 }
2361               }
2362               else {
2363                 emit_movimm(-1,tl);
2364                 emit_movimm(-1,th);
2365               }
2366             }
2367           }
2368         }
2369       }
2370       else
2371       {
2372         // 32 bit
2373         if(tl>=0) {
2374           s1l=get_reg(i_regs->regmap,rs1[i]);
2375           s2l=get_reg(i_regs->regmap,rs2[i]);
2376           if(rs1[i]&&rs2[i]) {
2377             assert(s1l>=0);
2378             assert(s2l>=0);
2379             if(opcode2[i]==0x24) { // AND
2380               emit_and(s1l,s2l,tl);
2381             } else
2382             if(opcode2[i]==0x25) { // OR
2383               emit_or(s1l,s2l,tl);
2384             } else
2385             if(opcode2[i]==0x26) { // XOR
2386               emit_xor(s1l,s2l,tl);
2387             } else
2388             if(opcode2[i]==0x27) { // NOR
2389               emit_or(s1l,s2l,tl);
2390               emit_not(tl,tl);
2391             }
2392           }
2393           else
2394           {
                 // Same r0 simplifications as the 64-bit path, low word only
2395             if(opcode2[i]==0x24) { // AND
2396               emit_zeroreg(tl);
2397             } else
2398             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2399               if(rs1[i]){
2400                 if(s1l>=0) emit_mov(s1l,tl);
2401                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2402               }
2403               else
2404               if(rs2[i]){
2405                 if(s2l>=0) emit_mov(s2l,tl);
2406                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2407               }
2408               else emit_zeroreg(tl);
2409             } else
2410             if(opcode2[i]==0x27) { // NOR
2411               if(rs1[i]){
2412                 if(s1l>=0) emit_not(s1l,tl);
2413                 else {
2414                   emit_loadreg(rs1[i],tl);
2415                   emit_not(tl,tl);
2416                 }
2417               }
2418               else
2419               if(rs2[i]){
2420                 if(s2l>=0) emit_not(s2l,tl);
2421                 else {
2422                   emit_loadreg(rs2[i],tl);
2423                   emit_not(tl,tl);
2424                 }
2425               }
2426               else emit_movimm(-1,tl);
2427             }
2428           }
2429         }
2430       }
2431     }
2432   }
2433 }
2434
2435 void imm16_assemble(int i,struct regstat *i_regs)
2436 {
2437   if (opcode[i]==0x0f) { // LUI
2438     if(rt1[i]) {
2439       signed char t;
2440       t=get_reg(i_regs->regmap,rt1[i]);
2441       //assert(t>=0);
2442       if(t>=0) {
2443         if(!((i_regs->isconst>>t)&1))
2444           emit_movimm(imm[i]<<16,t);
2445       }
2446     }
2447   }
2448   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2449     if(rt1[i]) {
2450       signed char s,t;
2451       t=get_reg(i_regs->regmap,rt1[i]);
2452       s=get_reg(i_regs->regmap,rs1[i]);
2453       if(rs1[i]) {
2454         //assert(t>=0);
2455         //assert(s>=0);
2456         if(t>=0) {
2457           if(!((i_regs->isconst>>t)&1)) {
2458             if(s<0) {
2459               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2460               emit_addimm(t,imm[i],t);
2461             }else{
2462               if(!((i_regs->wasconst>>s)&1))
2463                 emit_addimm(s,imm[i],t);
2464               else
2465                 emit_movimm(constmap[i][s]+imm[i],t);
2466             }
2467           }
2468         }
2469       } else {
2470         if(t>=0) {
2471           if(!((i_regs->isconst>>t)&1))
2472             emit_movimm(imm[i],t);
2473         }
2474       }
2475     }
2476   }
2477   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2478     if(rt1[i]) {
2479       signed char sh,sl,th,tl;
2480       th=get_reg(i_regs->regmap,rt1[i]|64);
2481       tl=get_reg(i_regs->regmap,rt1[i]);
2482       sh=get_reg(i_regs->regmap,rs1[i]|64);
2483       sl=get_reg(i_regs->regmap,rs1[i]);
2484       if(tl>=0) {
2485         if(rs1[i]) {
2486           assert(sh>=0);
2487           assert(sl>=0);
2488           if(th>=0) {
2489             emit_addimm64_32(sh,sl,imm[i],th,tl);
2490           }
2491           else {
2492             emit_addimm(sl,imm[i],tl);
2493           }
2494         } else {
2495           emit_movimm(imm[i],tl);
2496           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2497         }
2498       }
2499     }
2500   }
2501   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2502     if(rt1[i]) {
2503       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2504       signed char sh,sl,t;
2505       t=get_reg(i_regs->regmap,rt1[i]);
2506       sh=get_reg(i_regs->regmap,rs1[i]|64);
2507       sl=get_reg(i_regs->regmap,rs1[i]);
2508       //assert(t>=0);
2509       if(t>=0) {
2510         if(rs1[i]>0) {
2511           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2512           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2513             if(opcode[i]==0x0a) { // SLTI
2514               if(sl<0) {
2515                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2516                 emit_slti32(t,imm[i],t);
2517               }else{
2518                 emit_slti32(sl,imm[i],t);
2519               }
2520             }
2521             else { // SLTIU
2522               if(sl<0) {
2523                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2524                 emit_sltiu32(t,imm[i],t);
2525               }else{
2526                 emit_sltiu32(sl,imm[i],t);
2527               }
2528             }
2529           }else{ // 64-bit
2530             assert(sl>=0);
2531             if(opcode[i]==0x0a) // SLTI
2532               emit_slti64_32(sh,sl,imm[i],t);
2533             else // SLTIU
2534               emit_sltiu64_32(sh,sl,imm[i],t);
2535           }
2536         }else{
2537           // SLTI(U) with r0 is just stupid,
2538           // nonetheless examples can be found
2539           if(opcode[i]==0x0a) // SLTI
2540             if(0<imm[i]) emit_movimm(1,t);
2541             else emit_zeroreg(t);
2542           else // SLTIU
2543           {
2544             if(imm[i]) emit_movimm(1,t);
2545             else emit_zeroreg(t);
2546           }
2547         }
2548       }
2549     }
2550   }
2551   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2552     if(rt1[i]) {
2553       signed char sh,sl,th,tl;
2554       th=get_reg(i_regs->regmap,rt1[i]|64);
2555       tl=get_reg(i_regs->regmap,rt1[i]);
2556       sh=get_reg(i_regs->regmap,rs1[i]|64);
2557       sl=get_reg(i_regs->regmap,rs1[i]);
2558       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2559         if(opcode[i]==0x0c) //ANDI
2560         {
2561           if(rs1[i]) {
2562             if(sl<0) {
2563               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2564               emit_andimm(tl,imm[i],tl);
2565             }else{
2566               if(!((i_regs->wasconst>>sl)&1))
2567                 emit_andimm(sl,imm[i],tl);
2568               else
2569                 emit_movimm(constmap[i][sl]&imm[i],tl);
2570             }
2571           }
2572           else
2573             emit_zeroreg(tl);
2574           if(th>=0) emit_zeroreg(th);
2575         }
2576         else
2577         {
2578           if(rs1[i]) {
2579             if(sl<0) {
2580               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2581             }
2582             if(th>=0) {
2583               if(sh<0) {
2584                 emit_loadreg(rs1[i]|64,th);
2585               }else{
2586                 emit_mov(sh,th);
2587               }
2588             }
2589             if(opcode[i]==0x0d) //ORI
2590             if(sl<0) {
2591               emit_orimm(tl,imm[i],tl);
2592             }else{
2593               if(!((i_regs->wasconst>>sl)&1))
2594                 emit_orimm(sl,imm[i],tl);
2595               else
2596                 emit_movimm(constmap[i][sl]|imm[i],tl);
2597             }
2598             if(opcode[i]==0x0e) //XORI
2599             if(sl<0) {
2600               emit_xorimm(tl,imm[i],tl);
2601             }else{
2602               if(!((i_regs->wasconst>>sl)&1))
2603                 emit_xorimm(sl,imm[i],tl);
2604               else
2605                 emit_movimm(constmap[i][sl]^imm[i],tl);
2606             }
2607           }
2608           else {
2609             emit_movimm(imm[i],tl);
2610             if(th>=0) emit_zeroreg(th);
2611           }
2612         }
2613       }
2614     }
2615   }
2616 }
2617
2618 void shiftimm_assemble(int i,struct regstat *i_regs)
2619 {
2620   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2621   {
2622     if(rt1[i]) {
2623       signed char s,t;
2624       t=get_reg(i_regs->regmap,rt1[i]);
2625       s=get_reg(i_regs->regmap,rs1[i]);
2626       //assert(t>=0);
2627       if(t>=0){
2628         if(rs1[i]==0)
2629         {
2630           emit_zeroreg(t);
2631         }
2632         else
2633         {
2634           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2635           if(imm[i]) {
2636             if(opcode2[i]==0) // SLL
2637             {
2638               emit_shlimm(s<0?t:s,imm[i],t);
2639             }
2640             if(opcode2[i]==2) // SRL
2641             {
2642               emit_shrimm(s<0?t:s,imm[i],t);
2643             }
2644             if(opcode2[i]==3) // SRA
2645             {
2646               emit_sarimm(s<0?t:s,imm[i],t);
2647             }
2648           }else{
2649             // Shift by zero
2650             if(s>=0 && s!=t) emit_mov(s,t);
2651           }
2652         }
2653       }
2654       //emit_storereg(rt1[i],t); //DEBUG
2655     }
2656   }
2657   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2658   {
2659     if(rt1[i]) {
2660       signed char sh,sl,th,tl;
2661       th=get_reg(i_regs->regmap,rt1[i]|64);
2662       tl=get_reg(i_regs->regmap,rt1[i]);
2663       sh=get_reg(i_regs->regmap,rs1[i]|64);
2664       sl=get_reg(i_regs->regmap,rs1[i]);
2665       if(tl>=0) {
2666         if(rs1[i]==0)
2667         {
2668           emit_zeroreg(tl);
2669           if(th>=0) emit_zeroreg(th);
2670         }
2671         else
2672         {
2673           assert(sl>=0);
2674           assert(sh>=0);
2675           if(imm[i]) {
2676             if(opcode2[i]==0x38) // DSLL
2677             {
2678               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2679               emit_shlimm(sl,imm[i],tl);
2680             }
2681             if(opcode2[i]==0x3a) // DSRL
2682             {
2683               emit_shrdimm(sl,sh,imm[i],tl);
2684               if(th>=0) emit_shrimm(sh,imm[i],th);
2685             }
2686             if(opcode2[i]==0x3b) // DSRA
2687             {
2688               emit_shrdimm(sl,sh,imm[i],tl);
2689               if(th>=0) emit_sarimm(sh,imm[i],th);
2690             }
2691           }else{
2692             // Shift by zero
2693             if(sl!=tl) emit_mov(sl,tl);
2694             if(th>=0&&sh!=th) emit_mov(sh,th);
2695           }
2696         }
2697       }
2698     }
2699   }
2700   if(opcode2[i]==0x3c) // DSLL32
2701   {
2702     if(rt1[i]) {
2703       signed char sl,tl,th;
2704       tl=get_reg(i_regs->regmap,rt1[i]);
2705       th=get_reg(i_regs->regmap,rt1[i]|64);
2706       sl=get_reg(i_regs->regmap,rs1[i]);
2707       if(th>=0||tl>=0){
2708         assert(tl>=0);
2709         assert(th>=0);
2710         assert(sl>=0);
2711         emit_mov(sl,th);
2712         emit_zeroreg(tl);
2713         if(imm[i]>32)
2714         {
2715           emit_shlimm(th,imm[i]&31,th);
2716         }
2717       }
2718     }
2719   }
2720   if(opcode2[i]==0x3e) // DSRL32
2721   {
2722     if(rt1[i]) {
2723       signed char sh,tl,th;
2724       tl=get_reg(i_regs->regmap,rt1[i]);
2725       th=get_reg(i_regs->regmap,rt1[i]|64);
2726       sh=get_reg(i_regs->regmap,rs1[i]|64);
2727       if(tl>=0){
2728         assert(sh>=0);
2729         emit_mov(sh,tl);
2730         if(th>=0) emit_zeroreg(th);
2731         if(imm[i]>32)
2732         {
2733           emit_shrimm(tl,imm[i]&31,tl);
2734         }
2735       }
2736     }
2737   }
2738   if(opcode2[i]==0x3f) // DSRA32
2739   {
2740     if(rt1[i]) {
2741       signed char sh,tl;
2742       tl=get_reg(i_regs->regmap,rt1[i]);
2743       sh=get_reg(i_regs->regmap,rs1[i]|64);
2744       if(tl>=0){
2745         assert(sh>=0);
2746         emit_mov(sh,tl);
2747         if(imm[i]>32)
2748         {
2749           emit_sarimm(tl,imm[i]&31,tl);
2750         }
2751       }
2752     }
2753   }
2754 }
2755
#ifndef shift_assemble
// Fallback used only when no shift_assemble macro has been defined by the
// time this point is reached (presumably an architecture backend provides
// one -- confirm).  It reports the missing implementation on stdout and
// terminates the process with exit status 1; neither argument is used.
void shift_assemble(int i, struct regstat *i_regs)
{
  printf("Need shift_assemble for this architecture.\n");
  exit(1);
}
#endif
2763
2764 void load_assemble(int i,struct regstat *i_regs)
2765 {
2766   int s,th,tl,addr,map=-1;
2767   int offset;
2768   int jaddr=0;
2769   int memtarget=0,c=0;
2770   u_int hr,reglist=0;
2771   th=get_reg(i_regs->regmap,rt1[i]|64);
2772   tl=get_reg(i_regs->regmap,rt1[i]);
2773   s=get_reg(i_regs->regmap,rs1[i]);
2774   offset=imm[i];
2775   for(hr=0;hr<HOST_REGS;hr++) {
2776     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2777   }
2778   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2779   if(s>=0) {
2780     c=(i_regs->wasconst>>s)&1;
2781     if (c) {
2782       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2783       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2784     }
2785   }
2786   //printf("load_assemble: c=%d\n",c);
2787   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2788   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2789 #ifdef PCSX
2790   if(tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80)
2791     ||rt1[i]==0) {
2792       // could be FIFO, must perform the read
2793       // ||dummy read
2794       assem_debug("(forced read)\n");
2795       tl=get_reg(i_regs->regmap,-1);
2796       assert(tl>=0);
2797   }
2798 #endif
2799   if(offset||s<0||c) addr=tl;
2800   else addr=s;
2801   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2802  if(tl>=0) {
2803   //printf("load_assemble: c=%d\n",c);
2804   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2805   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2806   reglist&=~(1<<tl);
2807   if(th>=0) reglist&=~(1<<th);
2808   if(!using_tlb) {
2809     if(!c) {
2810       #ifdef RAM_OFFSET
2811       map=get_reg(i_regs->regmap,ROREG);
2812       if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2813       #endif
2814 //#define R29_HACK 1
2815       #ifdef R29_HACK
2816       // Strmnnrmn's speed hack
2817       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2818       #endif
2819       {
2820         #ifdef PCSX
2821         if(sp_in_mirror&&rs1[i]==29) {
2822           emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
2823           emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
2824         }
2825         else
2826         #endif
2827         emit_cmpimm(addr,RAM_SIZE);
2828         jaddr=(int)out;
2829         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2830         // Hint to branch predictor that the branch is unlikely to be taken
2831         if(rs1[i]>=28)
2832           emit_jno_unlikely(0);
2833         else
2834         #endif
2835         emit_jno(0);
2836       }
2837     }
2838   }else{ // using tlb
2839     int x=0;
2840     if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2841     if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2842     map=get_reg(i_regs->regmap,TLREG);
2843     assert(map>=0);
2844     map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2845     do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2846   }
2847   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2848   if (opcode[i]==0x20) { // LB
2849     if(!c||memtarget) {
2850       if(!dummy) {
2851         #ifdef HOST_IMM_ADDR32
2852         if(c)
2853           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2854         else
2855         #endif
2856         {
2857           //emit_xorimm(addr,3,tl);
2858           //gen_tlb_addr_r(tl,map);
2859           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2860           int x=0,a=tl;
2861 #ifdef BIG_ENDIAN_MIPS
2862           if(!c) emit_xorimm(addr,3,tl);
2863           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2864 #else
2865           if(!c) a=addr;
2866 #endif
2867 #ifdef PCSX
2868           if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2869 #endif
2870           emit_movsbl_indexed_tlb(x,a,map,tl);
2871         }
2872       }
2873       if(jaddr)
2874         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2875     }
2876     else
2877       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2878   }
2879   if (opcode[i]==0x21) { // LH
2880     if(!c||memtarget) {
2881       if(!dummy) {
2882         #ifdef HOST_IMM_ADDR32
2883         if(c)
2884           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2885         else
2886         #endif
2887         {
2888           int x=0,a=tl;
2889 #ifdef BIG_ENDIAN_MIPS
2890           if(!c) emit_xorimm(addr,2,tl);
2891           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2892 #else
2893           if(!c) a=addr;
2894 #endif
2895 #ifdef PCSX
2896           if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2897 #endif
2898           //#ifdef
2899           //emit_movswl_indexed_tlb(x,tl,map,tl);
2900           //else
2901           if(map>=0) {
2902             gen_tlb_addr_r(a,map);
2903             emit_movswl_indexed(x,a,tl);
2904           }else{
2905             #ifdef RAM_OFFSET
2906             emit_movswl_indexed(x,a,tl);
2907             #else
2908             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2909             #endif
2910           }
2911         }
2912       }
2913       if(jaddr)
2914         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2915     }
2916     else
2917       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2918   }
2919   if (opcode[i]==0x23) { // LW
2920     if(!c||memtarget) {
2921       if(!dummy) {
2922         int a=addr;
2923 #ifdef PCSX
2924         if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2925 #endif
2926         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2927         #ifdef HOST_IMM_ADDR32
2928         if(c)
2929           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2930         else
2931         #endif
2932         emit_readword_indexed_tlb(0,a,map,tl);
2933       }
2934       if(jaddr)
2935         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2936     }
2937     else
2938       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2939   }
2940   if (opcode[i]==0x24) { // LBU
2941     if(!c||memtarget) {
2942       if(!dummy) {
2943         #ifdef HOST_IMM_ADDR32
2944         if(c)
2945           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2946         else
2947         #endif
2948         {
2949           //emit_xorimm(addr,3,tl);
2950           //gen_tlb_addr_r(tl,map);
2951           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2952           int x=0,a=tl;
2953 #ifdef BIG_ENDIAN_MIPS
2954           if(!c) emit_xorimm(addr,3,tl);
2955           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2956 #else
2957           if(!c) a=addr;
2958 #endif
2959 #ifdef PCSX
2960           if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2961 #endif
2962           emit_movzbl_indexed_tlb(x,a,map,tl);
2963         }
2964       }
2965       if(jaddr)
2966         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2967     }
2968     else
2969       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2970   }
2971   if (opcode[i]==0x25) { // LHU
2972     if(!c||memtarget) {
2973       if(!dummy) {
2974         #ifdef HOST_IMM_ADDR32
2975         if(c)
2976           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2977         else
2978         #endif
2979         {
2980           int x=0,a=tl;
2981 #ifdef BIG_ENDIAN_MIPS
2982           if(!c) emit_xorimm(addr,2,tl);
2983           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2984 #else
2985           if(!c) a=addr;
2986 #endif
2987 #ifdef PCSX
2988           if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
2989 #endif
2990           //#ifdef
2991           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2992           //#else
2993           if(map>=0) {
2994             gen_tlb_addr_r(a,map);
2995             emit_movzwl_indexed(x,a,tl);
2996           }else{
2997             #ifdef RAM_OFFSET
2998             emit_movzwl_indexed(x,a,tl);
2999             #else
3000             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
3001             #endif
3002           }
3003         }
3004       }
3005       if(jaddr)
3006         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3007     }
3008     else
3009       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3010   }
3011   if (opcode[i]==0x27) { // LWU
3012     assert(th>=0);
3013     if(!c||memtarget) {
3014       if(!dummy) {
3015         int a=addr;
3016 #ifdef PCSX
3017         if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3018 #endif
3019         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
3020         #ifdef HOST_IMM_ADDR32
3021         if(c)
3022           emit_readword_tlb(constmap[i][s]+offset,map,tl);
3023         else
3024         #endif
3025         emit_readword_indexed_tlb(0,a,map,tl);
3026       }
3027       if(jaddr)
3028         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3029     }
3030     else {
3031       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3032     }
3033     emit_zeroreg(th);
3034   }
3035   if (opcode[i]==0x37) { // LD
3036     if(!c||memtarget) {
3037       if(!dummy) {
3038         int a=addr;
3039 #ifdef PCSX
3040         if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3041 #endif
3042         //gen_tlb_addr_r(tl,map);
3043         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
3044         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
3045         #ifdef HOST_IMM_ADDR32
3046         if(c)
3047           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3048         else
3049         #endif
3050         emit_readdword_indexed_tlb(0,a,map,th,tl);
3051       }
3052       if(jaddr)
3053         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3054     }
3055     else
3056       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3057   }
3058  }
3059   //emit_storereg(rt1[i],tl); // DEBUG
3060   //if(opcode[i]==0x23)
3061   //if(opcode[i]==0x24)
3062   //if(opcode[i]==0x23||opcode[i]==0x24)
3063   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
3064   {
3065     //emit_pusha();
3066     save_regs(0x100f);
3067         emit_readword((int)&last_count,ECX);
3068         #ifdef __i386__
3069         if(get_reg(i_regs->regmap,CCREG)<0)
3070           emit_loadreg(CCREG,HOST_CCREG);
3071         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3072         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3073         emit_writeword(HOST_CCREG,(int)&Count);
3074         #endif
3075         #ifdef __arm__
3076         if(get_reg(i_regs->regmap,CCREG)<0)
3077           emit_loadreg(CCREG,0);
3078         else
3079           emit_mov(HOST_CCREG,0);
3080         emit_add(0,ECX,0);
3081         emit_addimm(0,2*ccadj[i],0);
3082         emit_writeword(0,(int)&Count);
3083         #endif
3084     emit_call((int)memdebug);
3085     //emit_popa();
3086     restore_regs(0x100f);
3087   }/**/
3088 }
3089
#ifndef loadlr_assemble
// Fallback definition, compiled only when no loadlr_assemble macro exists.
// It prints a diagnostic and terminates the process with exit status 1;
// neither argument is examined.
void loadlr_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;

  printf("Need loadlr_assemble for this architecture.\n");
  exit(1);
}
#endif
3097
// store_assemble: emit host code for the store instruction at index i.
// Handled opcodes: 0x28 SB, 0x29 SH, 0x2B SW, 0x3F SD.
//   i      - index into the per-instruction tables (opcode[], rs1[], rs2[],
//            imm[], constmap[], ccadj[])
//   i_regs - register state for this instruction; regmap and wasconst are read
// Sequence: choose the address register, emit either a RAM range check
// (!using_tlb) or a TLB lookup, emit the store itself, emit the invalid_code
// check (!using_tlb only), then register a stub for the guarded branch or
// emit inline_writestub when the address is constant and memtarget is 0.
3098 void store_assemble(int i,struct regstat *i_regs)
3099 {
3100   int s,th,tl,map=-1;
3101   int addr,temp;
3102   int offset;
3103   int jaddr=0,jaddr2,type;
3104   int memtarget=0,c=0;
3105   int agr=AGEN1+(i&1);
3106   u_int hr,reglist=0;
       // th/tl: host registers mapped to rs2 (the value being stored);
       // th is the rs2|64 mapping -- presumably the upper 32 bits, only asserted for SD.
3107   th=get_reg(i_regs->regmap,rs2[i]|64);
3108   tl=get_reg(i_regs->regmap,rs2[i]);
       // s: host register mapped to the base register rs1 (may be <0).
3109   s=get_reg(i_regs->regmap,rs1[i]);
       // temp: the AGEN1/AGEN1+1 mapping if present, otherwise the register mapped to -1.
3110   temp=get_reg(i_regs->regmap,agr);
3111   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3112   offset=imm[i];
3113   if(s>=0) {
         // c: the wasconst bit for s is set, so constmap[i][s]+offset is used as the address.
3114     c=(i_regs->wasconst>>s)&1;
3115     if(c) {
           // memtarget: constant address compares (signed) below 0x80000000+RAM_SIZE,
           // or, with the TLB enabled, is at or above 0xC0000000.
3116       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3117       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3118     }
3119   }
3120   assert(tl>=0);
3121   assert(temp>=0);
       // reglist: bitmask of host registers that currently have a mapping;
       // it is handed to the stubs below. HOST_CCREG is dropped when it holds CCREG.
3122   for(hr=0;hr<HOST_REGS;hr++) {
3123     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3124   }
3125   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
       // Use temp as the address register when there is an offset, no mapped base,
       // or a constant base (presumably filled in by address generation -- confirm);
       // otherwise use the base register directly.
3126   if(offset||s<0||c) addr=temp;
3127   else addr=s;
3128   if(!using_tlb) {
3129     if(!c) {
3130       #ifdef PCSX
           // Stack-pointer-relative access with sp_in_mirror set: clear address bits
           // 0x00e00000 into HOST_TEMPREG and range-check the masked copy instead.
3131       if(sp_in_mirror&&rs1[i]==29) {
3132         emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
3133         emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
3134       }
3135       else
3136       #endif
3137       #ifdef R29_HACK
3138       // Strmnnrmn's speed hack
3139       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3140       #endif
3141       emit_cmpimm(addr,RAM_SIZE);
3142       #ifdef DESTRUCTIVE_SHIFT
3143       if(s==addr) emit_mov(s,temp);
3144       #endif
3145       #ifdef R29_HACK
3146       memtarget=1;
3147       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3148       #endif
3149       {
             // Remember where the guarding branch is emitted; jaddr is later passed
             // to add_stub together with the stub type.
3150         jaddr=(int)out;
3151         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
3152         // Hint to branch predictor that the branch is unlikely to be taken
3153         if(rs1[i]>=28)
3154           emit_jno_unlikely(0);
3155         else
3156         #endif
3157         emit_jno(0);
3158       }
3159     }
3160   }else{ // using tlb
         // x is 3 for SB, 2 for SH and 0 otherwise; it is forwarded to do_tlb_w.
3161     int x=0;
3162     if (opcode[i]==0x28) x=3; // SB
3163     if (opcode[i]==0x29) x=2; // SH
3164     map=get_reg(i_regs->regmap,TLREG);
3165     assert(map>=0);
3166     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
3167     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3168   }
3169
       // Emit the store itself. It is skipped only for a constant address with
       // memtarget==0, which is handled by inline_writestub at the end.
3170   if (opcode[i]==0x28) { // SB
3171     if(!c||memtarget) {
           // x: displacement applied to the store; a: register used as the address.
3172       int x=0,a=temp;
3173 #ifdef BIG_ENDIAN_MIPS
           // Byte lane swap: XOR the address with 3 at run time, or fold the
           // difference into x when the address is a known constant.
3174       if(!c) emit_xorimm(addr,3,temp);
3175       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3176 #else
3177       if(!c) a=addr;
3178 #endif
3179 #ifdef PCSX
           // Use the masked address produced by the sp_in_mirror check above.
3180       if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3181 #endif
3182       //gen_tlb_addr_w(temp,map);
3183       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3184       emit_writebyte_indexed_tlb(tl,x,a,map,a);
3185     }
3186     type=STOREB_STUB;
3187   }
3188   if (opcode[i]==0x29) { // SH
3189     if(!c||memtarget) {
3190       int x=0,a=temp;
3191 #ifdef BIG_ENDIAN_MIPS
           // Same scheme as SB, with XOR 2 for halfwords.
3192       if(!c) emit_xorimm(addr,2,temp);
3193       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3194 #else
3195       if(!c) a=addr;
3196 #endif
3197 #ifdef PCSX
3198       if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3199 #endif
3200       //#ifdef
3201       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3202       //#else
           // With a map register (TLB path) the address is translated in place first;
           // without one the store is emitted with displacement rdram-0x80000000+x.
3203       if(map>=0) {
3204         gen_tlb_addr_w(a,map);
3205         emit_writehword_indexed(tl,x,a);
3206       }else
3207         emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
3208     }
3209     type=STOREH_STUB;
3210   }
3211   if (opcode[i]==0x2B) { // SW
3212     if(!c||memtarget) {
3213       int a=addr;
3214 #ifdef PCSX
3215       if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3216 #endif
3217       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3218       emit_writeword_indexed_tlb(tl,0,a,map,temp);
3219     }
3220     type=STOREW_STUB;
3221   }
3222   if (opcode[i]==0x3F) { // SD
3223     if(!c||memtarget) {
3224       int a=addr;
3225 #ifdef PCSX
3226       if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
3227 #endif
3228       if(rs2[i]) {
3229         assert(th>=0);
3230         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3231         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3232         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
3233       }else{
3234         // Store zero
3235         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3236         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3237         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
3238       }
3239     }
3240     type=STORED_STUB;
3241   }
       // Without the TLB, follow the store with a check against invalid_code
       // (presumably indexed by addr>>12 and compared with 1 -- confirm in the
       // emitter). On mismatch, call invalidate_addr_reg[addr] or branch to an
       // INVCODE_STUB.
3242   if(!using_tlb) {
3243     if(!c||memtarget) {
3244       #ifdef DESTRUCTIVE_SHIFT
3245       // The x86 shift operation is 'destructive'; it overwrites the
3246       // source register, so we need to make a copy first and use that.
3247       addr=temp;
3248       #endif
3249       #if defined(HOST_IMM8)
3250       int ir=get_reg(i_regs->regmap,INVCP);
3251       assert(ir>=0);
3252       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3253       #else
3254       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3255       #endif
3256       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3257       emit_callne(invalidate_addr_reg[addr]);
3258       #else
3259       jaddr2=(int)out;
3260       emit_jne(0);
3261       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3262       #endif
3263     }
3264   }
       // Slow path: a stub for the guarded branch recorded in jaddr, or an inline
       // write stub when the address is a constant with memtarget==0.
3265   if(jaddr) {
3266     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3267   } else if(c&&!memtarget) {
3268     inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
3269   }
       // Disabled debugging aid: everything below is commented out.
3270   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3271   //if(opcode[i]==0x2B || opcode[i]==0x28)
3272   //if(opcode[i]==0x2B || opcode[i]==0x29)
3273   //if(opcode[i]==0x2B)
3274   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3275   {
3276     //emit_pusha();
3277     save_regs(0x100f);
3278         emit_readword((int)&last_count,ECX);
3279         #ifdef __i386__
3280         if(get_reg(i_regs->regmap,CCREG)<0)
3281           emit_loadreg(CCREG,HOST_CCREG);
3282         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3283         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3284         emit_writeword(HOST_CCREG,(int)&Count);
3285         #endif
3286         #ifdef __arm__
3287         if(get_reg(i_regs->regmap,CCREG)<0)
3288           emit_loadreg(CCREG,0);
3289         else
3290           emit_mov(HOST_CCREG,0);
3291         emit_add(0,ECX,0);
3292         emit_addimm(0,2*ccadj[i],0);
3293         emit_writeword(0,(int)&Count);
3294         #endif
3295     emit_call((int)memdebug);
3296     //emit_popa();
3297     restore_regs(0x100f);
3298   }/**/
3299 }
3300
// storelr_assemble: emit host code for the unaligned store at index i.
// Handled opcodes: 0x2A SWL, 0x2E SWR, 0x2C SDL, 0x2D SDR.
//   i      - index into the per-instruction tables (opcode[], rs1[], rs2[],
//            imm[], constmap[], ccadj[])
//   i_regs - register state for this instruction; regmap and isconst are read
// The emitted code turns temp into a host pointer, dispatches at run time on
// the low two address bits (cases 0..3 below) and writes the required bytes
// with byte/halfword/word stores, rotating the source register where needed.
// SDL/SDR additionally store a second word (temp2) depending on address bit 2.
3301 void storelr_assemble(int i,struct regstat *i_regs)
3302 {
3303   int s,th,tl;
3304   int temp;
3305   int temp2;
3306   int offset;
3307   int jaddr=0,jaddr2;
3308   int case1,case2,case3;
3309   int done0,done1,done2;
3310   int memtarget=0,c=0;
3311   int agr=AGEN1+(i&1);
3312   u_int hr,reglist=0;
       // th/tl: host registers mapped to rs2 (rs2|64 and rs2); s: mapping of base rs1.
3313   th=get_reg(i_regs->regmap,rs2[i]|64);
3314   tl=get_reg(i_regs->regmap,rs2[i]);
3315   s=get_reg(i_regs->regmap,rs1[i]);
       // temp: the AGEN1/AGEN1+1 mapping if present, otherwise the register mapped to -1.
3316   temp=get_reg(i_regs->regmap,agr);
3317   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3318   offset=imm[i];
3319   if(s>=0) {
         // NOTE(review): this reads isconst, whereas store_assemble, c1ls_assemble and
         // c2ls_assemble read wasconst -- confirm the difference is intentional.
3320     c=(i_regs->isconst>>s)&1;
3321     if(c) {
3322       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3323       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3324     }
3325   }
3326   assert(tl>=0);
       // reglist: bitmask of host registers that currently have a mapping (passed to stubs).
3327   for(hr=0;hr<HOST_REGS;hr++) {
3328     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3329   }
3330   assert(temp>=0);
3331   if(!using_tlb) {
3332     if(!c) {
           // Range-check the address, copy it into temp if it lives in s, and record
           // the guarding branch in jaddr for the STORELR_STUB registered below.
3333       emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3334       if(!offset&&s!=temp) emit_mov(s,temp);
3335       jaddr=(int)out;
3336       emit_jno(0);
3337     }
3338     else
3339     {
           // Constant address: branch unconditionally to the stub when memtarget is 0
           // or the base register is r0.
3340       if(!memtarget||!rs1[i]) {
3341         jaddr=(int)out;
3342         emit_jmp(0);
3343       }
3344     }
         // Turn temp into a host pointer: via the ROREG mapping under RAM_OFFSET,
         // otherwise by adding rdram-0x80000000 when that difference is non-zero.
3345     #ifdef RAM_OFFSET
3346     int map=get_reg(i_regs->regmap,ROREG);
3347     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3348     gen_tlb_addr_w(temp,map);
3349     #else
3350     if((u_int)rdram!=0x80000000) 
3351       emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3352     #endif
3353   }else{ // using tlb
3354     int map=get_reg(i_regs->regmap,TLREG);
3355     assert(map>=0);
3356     map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3357     if(!c&&!offset&&s>=0) emit_mov(s,temp);
3358     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
         // If the TLB branch helper recorded no branch and memtarget is 0, emit an
         // unconditional jump so jaddr is set for the stub.
3359     if(!jaddr&&!memtarget) {
3360       jaddr=(int)out;
3361       emit_jmp(0);
3362     }
3363     gen_tlb_addr_w(temp,map);
3364   }
3365
3366   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
         // temp2 carries the second word for the 64-bit forms; when rs2 is r0 the
         // tl register is used for all three roles.
3367     temp2=get_reg(i_regs->regmap,FTEMP);
3368     if(!rs2[i]) temp2=th=tl;
3369   }
3370
3371 #ifndef BIG_ENDIAN_MIPS
3372     emit_xorimm(temp,3,temp);
3373 #endif
       // Run-time dispatch on the low two bits of temp: bit 1 set -> case 2/3,
       // otherwise bit 0 set -> case 1, otherwise fall through to case 0.
3374   emit_testimm(temp,2);
3375   case2=(int)out;
3376   emit_jne(0);
3377   emit_testimm(temp,1);
3378   case1=(int)out;
3379   emit_jne(0);
3380   // 0
3381   if (opcode[i]==0x2A) { // SWL
3382     emit_writeword_indexed(tl,0,temp);
3383   }
3384   if (opcode[i]==0x2E) { // SWR
3385     emit_writebyte_indexed(tl,3,temp);
3386   }
3387   if (opcode[i]==0x2C) { // SDL
3388     emit_writeword_indexed(th,0,temp);
3389     if(rs2[i]) emit_mov(tl,temp2);
3390   }
3391   if (opcode[i]==0x2D) { // SDR
3392     emit_writebyte_indexed(tl,3,temp);
3393     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3394   }
       // Each case ends with a jump to the common tail; the targets are patched by
       // the set_jump_target(done*, ...) calls after case 3.
3395   done0=(int)out;
3396   emit_jmp(0);
3397   // 1
3398   set_jump_target(case1,(int)out);
3399   if (opcode[i]==0x2A) { // SWL
3400     // Write 3 msb into three least significant bytes
         // The rotates total 32 bits (8+16+8), so tl ends up with its original
         // value; they are skipped when rs2 is r0.
3401     if(rs2[i]) emit_rorimm(tl,8,tl);
3402     emit_writehword_indexed(tl,-1,temp);
3403     if(rs2[i]) emit_rorimm(tl,16,tl);
3404     emit_writebyte_indexed(tl,1,temp);
3405     if(rs2[i]) emit_rorimm(tl,8,tl);
3406   }
3407   if (opcode[i]==0x2E) { // SWR
3408     // Write two lsb into two most significant bytes
3409     emit_writehword_indexed(tl,1,temp);
3410   }
3411   if (opcode[i]==0x2C) { // SDL
3412     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3413     // Write 3 msb into three least significant bytes
3414     if(rs2[i]) emit_rorimm(th,8,th);
3415     emit_writehword_indexed(th,-1,temp);
3416     if(rs2[i]) emit_rorimm(th,16,th);
3417     emit_writebyte_indexed(th,1,temp);
3418     if(rs2[i]) emit_rorimm(th,8,th);
3419   }
3420   if (opcode[i]==0x2D) { // SDR
3421     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3422     // Write two lsb into two most significant bytes
3423     emit_writehword_indexed(tl,1,temp);
3424   }
3425   done1=(int)out;
3426   emit_jmp(0);
3427   // 2
3428   set_jump_target(case2,(int)out);
       // Bit 1 was set: bit 0 now selects between case 2 (clear) and case 3 (set).
3429   emit_testimm(temp,1);
3430   case3=(int)out;
3431   emit_jne(0);
3432   if (opcode[i]==0x2A) { // SWL
3433     // Write two msb into two least significant bytes
3434     if(rs2[i]) emit_rorimm(tl,16,tl);
3435     emit_writehword_indexed(tl,-2,temp);
3436     if(rs2[i]) emit_rorimm(tl,16,tl);
3437   }
3438   if (opcode[i]==0x2E) { // SWR
3439     // Write 3 lsb into three most significant bytes
3440     emit_writebyte_indexed(tl,-1,temp);
3441     if(rs2[i]) emit_rorimm(tl,8,tl);
3442     emit_writehword_indexed(tl,0,temp);
3443     if(rs2[i]) emit_rorimm(tl,24,tl);
3444   }
3445   if (opcode[i]==0x2C) { // SDL
3446     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3447     // Write two msb into two least significant bytes
3448     if(rs2[i]) emit_rorimm(th,16,th);
3449     emit_writehword_indexed(th,-2,temp);
3450     if(rs2[i]) emit_rorimm(th,16,th);
3451   }
3452   if (opcode[i]==0x2D) { // SDR
3453     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3454     // Write 3 lsb into three most significant bytes
3455     emit_writebyte_indexed(tl,-1,temp);
3456     if(rs2[i]) emit_rorimm(tl,8,tl);
3457     emit_writehword_indexed(tl,0,temp);
3458     if(rs2[i]) emit_rorimm(tl,24,tl);
3459   }
3460   done2=(int)out;
3461   emit_jmp(0);
3462   // 3
3463   set_jump_target(case3,(int)out);
3464   if (opcode[i]==0x2A) { // SWL
3465     // Write msb into least significant byte
3466     if(rs2[i]) emit_rorimm(tl,24,tl);
3467     emit_writebyte_indexed(tl,-3,temp);
3468     if(rs2[i]) emit_rorimm(tl,8,tl);
3469   }
3470   if (opcode[i]==0x2E) { // SWR
3471     // Write entire word
3472     emit_writeword_indexed(tl,-3,temp);
3473   }
3474   if (opcode[i]==0x2C) { // SDL
3475     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3476     // Write msb into least significant byte
3477     if(rs2[i]) emit_rorimm(th,24,th);
3478     emit_writebyte_indexed(th,-3,temp);
3479     if(rs2[i]) emit_rorimm(th,8,th);
3480   }
3481   if (opcode[i]==0x2D) { // SDR
3482     if(rs2[i]) emit_mov(th,temp2);
3483     // Write entire word
3484     emit_writeword_indexed(tl,-3,temp);
3485   }
       // Common tail: cases 0, 1 and 2 jump here; case 3 falls through.
3486   set_jump_target(done0,(int)out);
3487   set_jump_target(done1,(int)out);
3488   set_jump_target(done2,(int)out);
3489   if (opcode[i]==0x2C) { // SDL
         // Second word: skipped when address bit 2 is set; otherwise temp is
         // aligned down to 4 bytes and temp2 is stored at offset +4.
3490     emit_testimm(temp,4);
3491     done0=(int)out;
3492     emit_jne(0);
3493     emit_andimm(temp,~3,temp);
3494     emit_writeword_indexed(temp2,4,temp);
3495     set_jump_target(done0,(int)out);
3496   }
3497   if (opcode[i]==0x2D) { // SDR
         // Second word: skipped when address bit 2 is clear; otherwise temp is
         // aligned down to 4 bytes and temp2 is stored at offset -4.
3498     emit_testimm(temp,4);
3499     done0=(int)out;
3500     emit_jeq(0);
3501     emit_andimm(temp,~3,temp);
3502     emit_writeword_indexed(temp2,-4,temp);
3503     set_jump_target(done0,(int)out);
3504   }
       // Register the slow-path stub for the branch recorded in jaddr.
3505   if(!c||!memtarget)
3506     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3507   if(!using_tlb) {
         // Undo the host-pointer conversion applied to temp earlier, then check
         // invalid_code for the written address (call or INVCODE_STUB on mismatch).
3508     #ifdef RAM_OFFSET
3509     int map=get_reg(i_regs->regmap,ROREG);
3510     if(map<0) map=HOST_TEMPREG;
3511     gen_orig_addr_w(temp,map);
3512     #else
3513     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3514     #endif
3515     #if defined(HOST_IMM8)
3516     int ir=get_reg(i_regs->regmap,INVCP);
3517     assert(ir>=0);
3518     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3519     #else
3520     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3521     #endif
3522     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3523     emit_callne(invalidate_addr_reg[temp]);
3524     #else
3525     jaddr2=(int)out;
3526     emit_jne(0);
3527     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3528     #endif
3529   }
       // Disabled debugging aid: everything below is commented out.
3530   /*
3531     emit_pusha();
3532     //save_regs(0x100f);
3533         emit_readword((int)&last_count,ECX);
3534         if(get_reg(i_regs->regmap,CCREG)<0)
3535           emit_loadreg(CCREG,HOST_CCREG);
3536         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3537         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3538         emit_writeword(HOST_CCREG,(int)&Count);
3539     emit_call((int)memdebug);
3540     emit_popa();
3541     //restore_regs(0x100f);
3542   /**/
3543 }
3544
// c1ls_assemble: emit host code for the coprocessor-1 load/store at index i.
// Handled opcodes: 0x31 LWC1, 0x35 LDC1, 0x39 SWC1, 0x3D SDC1.
//   i      - index into the per-instruction tables (opcode[], rs1[], imm[],
//            source[], constmap[], ccadj[])
//   i_regs - register state for this instruction; regmap and wasconst are read
// When DISABLE_COP1 is defined the whole body is replaced by a call to
// cop1_unusable(i, i_regs).
3545 void c1ls_assemble(int i,struct regstat *i_regs)
3546 {
3547 #ifndef DISABLE_COP1
3548   int s,th,tl;
3549   int temp,ar;
3550   int map=-1;
3551   int offset;
3552   int c=0;
3553   int jaddr,jaddr2=0,jaddr3,type;
3554   int agr=AGEN1+(i&1);
3555   u_int hr,reglist=0;
       // th/tl: host registers mapped to FTEMP|64 / FTEMP, used to carry the data.
3556   th=get_reg(i_regs->regmap,FTEMP|64);
3557   tl=get_reg(i_regs->regmap,FTEMP);
3558   s=get_reg(i_regs->regmap,rs1[i]);
       // temp: the AGEN1/AGEN1+1 mapping if present, otherwise the register mapped to -1.
3559   temp=get_reg(i_regs->regmap,agr);
3560   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3561   offset=imm[i];
3562   assert(tl>=0);
3563   assert(rs1[i]>0);
3564   assert(temp>=0);
       // reglist: bitmask of mapped host registers handed to the stubs; HOST_CCREG
       // is dropped when it holds CCREG.
3565   for(hr=0;hr<HOST_REGS;hr++) {
3566     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3567   }
3568   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3569   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3570   {
3571     // Loads use a temporary register which we need to save
3572     reglist|=1<<temp;
3573   }
       // ar: register used for the address -- temp for stores, tl for loads.
3574   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3575     ar=temp;
3576   else // LWC1/LDC1
3577     ar=tl;
3578   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3579   //else c=(i_regs->wasconst>>s)&1;
3580   if(s>=0) c=(i_regs->wasconst>>s)&1;
3581   // Check cop1 unusable
3582   if(!cop1_usable) {
         // Test bit 0x20000000 of the register mapped to CSREG; if it is clear,
         // branch to an FP_STUB. cop1_usable is then set so that this check is
         // not emitted again while it remains set.
3583     signed char rs=get_reg(i_regs->regmap,CSREG);
3584     assert(rs>=0);
3585     emit_testimm(rs,0x20000000);
3586     jaddr=(int)out;
3587     emit_jeq(0);
3588     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3589     cop1_usable=1;
3590   }
       // Stores: load the pointer held in reg_cop1_simple/reg_cop1_double for the
       // register number in bits 16..20 of the instruction word into tl.
3591   if (opcode[i]==0x39) { // SWC1 (get float address)
3592     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3593   }
3594   if (opcode[i]==0x3D) { // SDC1 (get double address)
3595     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3596   }
3597   // Generate address + offset
3598   if(!using_tlb) {
         // Only the compare is emitted here; the branch that consumes it is emitted
         // further down, after the intervening loads.
3599     if(!c)
3600       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3601   }
3602   else
3603   {
3604     map=get_reg(i_regs->regmap,TLREG);
3605     assert(map>=0);
3606     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3607       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3608     }
3609     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3610       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3611     }
3612   }
       // Stores: replace the pointer in tl with the value it points to
       // (for SDC1 the word at offset 4 goes to th, the word at offset 0 to tl).
3613   if (opcode[i]==0x39) { // SWC1 (read float)
3614     emit_readword_indexed(0,tl,tl);
3615   }
3616   if (opcode[i]==0x3D) { // SDC1 (read double)
3617     emit_readword_indexed(4,tl,th);
3618     emit_readword_indexed(0,tl,tl);
3619   }
       // Loads: fetch the destination pointer into temp; it is used at the end of
       // the function to write the loaded value.
3620   if (opcode[i]==0x31) { // LWC1 (get target address)
3621     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3622   }
3623   if (opcode[i]==0x35) { // LDC1 (get target address)
3624     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3625   }
3626   if(!using_tlb) {
3627     if(!c) {
           // Branch belonging to the compare emitted above; recorded in jaddr2 for the stub.
3628       jaddr2=(int)out;
3629       emit_jno(0);
3630     }
3631     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80000000+RAM_SIZE) {
3632       jaddr2=(int)out;
3633       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3634     }
3635     #ifdef DESTRUCTIVE_SHIFT
3636     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3637       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3638     }
3639     #endif
3640   }else{
3641     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3642       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3643     }
3644     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3645       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3646     }
3647   }
       // Emit the memory access itself and select the stub type for the slow path.
3648   if (opcode[i]==0x31) { // LWC1
3649     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3650     //gen_tlb_addr_r(ar,map);
3651     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3652     #ifdef HOST_IMM_ADDR32
3653     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3654     else
3655     #endif
3656     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3657     type=LOADW_STUB;
3658   }
3659   if (opcode[i]==0x35) { // LDC1
3660     assert(th>=0);
3661     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3662     //gen_tlb_addr_r(ar,map);
3663     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3664     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3665     #ifdef HOST_IMM_ADDR32
3666     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3667     else
3668     #endif
3669     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3670     type=LOADD_STUB;
3671   }
3672   if (opcode[i]==0x39) { // SWC1
3673     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3674     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3675     type=STOREW_STUB;
3676   }
3677   if (opcode[i]==0x3D) { // SDC1
3678     assert(th>=0);
3679     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3680     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3681     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3682     type=STORED_STUB;
3683   }
3684   if(!using_tlb) {
3685     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
           // Stores are followed by the invalid_code check. Without DESTRUCTIVE_SHIFT,
           // temp is re-pointed at whichever register holds the address.
3686       #ifndef DESTRUCTIVE_SHIFT
3687       temp=offset||c||s<0?ar:s;
3688       #endif
3689       #if defined(HOST_IMM8)
3690       int ir=get_reg(i_regs->regmap,INVCP);
3691       assert(ir>=0);
3692       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3693       #else
3694       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3695       #endif
3696       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3697       emit_callne(invalidate_addr_reg[temp]);
3698       #else
3699       jaddr3=(int)out;
3700       emit_jne(0);
3701       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3702       #endif
3703     }
3704   }
       // Register the slow-path stub for the branch recorded in jaddr2, if any.
3705   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
       // Loads: store the value read from memory through the pointer fetched into temp.
3706   if (opcode[i]==0x31) { // LWC1 (write float)
3707     emit_writeword_indexed(tl,0,temp);
3708   }
3709   if (opcode[i]==0x35) { // LDC1 (write double)
3710     emit_writeword_indexed(th,4,temp);
3711     emit_writeword_indexed(tl,0,temp);
3712   }
       // Disabled debugging aid: everything below is commented out.
3713   //if(opcode[i]==0x39)
3714   /*if(opcode[i]==0x39||opcode[i]==0x31)
3715   {
3716     emit_pusha();
3717         emit_readword((int)&last_count,ECX);
3718         if(get_reg(i_regs->regmap,CCREG)<0)
3719           emit_loadreg(CCREG,HOST_CCREG);
3720         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3721         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3722         emit_writeword(HOST_CCREG,(int)&Count);
3723     emit_call((int)memdebug);
3724     emit_popa();
3725   }/**/
3726 #else
3727   cop1_unusable(i, i_regs);
3728 #endif
3729 }
3730
// c2ls_assemble: emit host code for the coprocessor-2 load/store at index i.
// Handled opcodes: 0x32 LWC2, 0x3a SWC2.
//   i      - index into the per-instruction tables (opcode[], rs1[], imm[],
//            source[], constmap[], ccadj[])
//   i_regs - register state for this instruction; regmap and wasconst are read
// The TLB path is not supported here (asserted). The coprocessor register
// number is taken from bits 16..20 of the instruction word.
3731 void c2ls_assemble(int i,struct regstat *i_regs)
3732 {
3733   int s,tl;
3734   int ar;
3735   int offset;
3736   int memtarget=0,c=0;
3737   int jaddr2=0,jaddr3,type;
3738   int agr=AGEN1+(i&1);
3739   u_int hr,reglist=0;
3740   u_int copr=(source[i]>>16)&0x1f;
       // s: host register mapped to base rs1; tl: host register mapped to FTEMP (data).
3741   s=get_reg(i_regs->regmap,rs1[i]);
3742   tl=get_reg(i_regs->regmap,FTEMP);
3743   offset=imm[i];
3744   assert(rs1[i]>0);
3745   assert(tl>=0);
3746   assert(!using_tlb);
3747
       // reglist: bitmask of mapped host registers handed to the stubs; HOST_CCREG
       // is dropped when it holds CCREG.
3748   for(hr=0;hr<HOST_REGS;hr++) {
3749     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3750   }
3751   if(i_regs->regmap[HOST_CCREG]==CCREG)
3752     reglist&=~(1<<HOST_CCREG);
3753
3754   // get the address
3755   if (opcode[i]==0x3a) { // SWC2
         // Stores keep data in tl, so the address uses the AGEN mapping (or the
         // register mapped to -1); that register is also added to reglist.
3756     ar=get_reg(i_regs->regmap,agr);
3757     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3758     reglist|=1<<ar;
3759   } else { // LWC2
3760     ar=tl;
3761   }
3762   if(s>=0) c=(i_regs->wasconst>>s)&1;
       // memtarget: constant address compares (signed) below 0x80000000+RAM_SIZE.
3763   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
       // With no offset and a non-constant mapped base, the base register is the address.
3764   if (!offset&&!c&&s>=0) ar=s;
3765   assert(ar>=0);
3766
3767   if (opcode[i]==0x3a) { // SWC2
         // Fetch the coprocessor data register into tl before the store.
3768     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3769     type=STOREW_STUB;
3770   }
3771   else
3772     type=LOADW_STUB;
3773
3774   if(c&&!memtarget) {
         // Constant address with memtarget==0: always take the stub.
3775     jaddr2=(int)out;
3776     emit_jmp(0); // inline_readstub/inline_writestub?
3777   }
3778   else {
3779     if(!c) {
           // Run-time range check; the branch is recorded in jaddr2 for the stub.
3780       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3781       jaddr2=(int)out;
3782       emit_jno(0);
3783     }
3784     if (opcode[i]==0x32) { // LWC2
3785       #ifdef HOST_IMM_ADDR32
3786       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3787       else
3788       #endif
3789       emit_readword_indexed(0,ar,tl);
3790     }
3791     if (opcode[i]==0x3a) { // SWC2
3792       #ifdef DESTRUCTIVE_SHIFT
3793       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3794       #endif
3795       emit_writeword_indexed(tl,0,ar);
3796     }
3797   }
3798   if(jaddr2)
3799     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3800   if (opcode[i]==0x3a) { // SWC2
         // After a store, check invalid_code for the written address; on mismatch
         // call invalidate_addr_reg[ar] or branch to an INVCODE_STUB.
3801 #if defined(HOST_IMM8)
3802     int ir=get_reg(i_regs->regmap,INVCP);
3803     assert(ir>=0);
3804     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3805 #else
3806     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3807 #endif
3808     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3809     emit_callne(invalidate_addr_reg[ar]);
3810     #else
3811     jaddr3=(int)out;
3812     emit_jne(0);
3813     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3814     #endif
3815   }
3816   if (opcode[i]==0x32) { // LWC2
         // Move the loaded word from tl into the coprocessor data register.
3817     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3818   }
3819 }
3820
#ifndef multdiv_assemble
// Fallback compiled only when no multdiv_assemble macro is defined:
// prints a diagnostic on stdout and terminates the process with status 1.
void multdiv_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  fputs("Need multdiv_assemble for this architecture.\n",stdout);
  exit(1);
}
#endif
3828
3829 void mov_assemble(int i,struct regstat *i_regs)
3830 {
3831   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3832   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3833   if(rt1[i]) {
3834     signed char sh,sl,th,tl;
3835     th=get_reg(i_regs->regmap,rt1[i]|64);
3836     tl=get_reg(i_regs->regmap,rt1[i]);
3837     //assert(tl>=0);
3838     if(tl>=0) {
3839       sh=get_reg(i_regs->regmap,rs1[i]|64);
3840       sl=get_reg(i_regs->regmap,rs1[i]);
3841       if(sl>=0) emit_mov(sl,tl);
3842       else emit_loadreg(rs1[i],tl);
3843       if(th>=0) {
3844         if(sh>=0) emit_mov(sh,th);
3845         else emit_loadreg(rs1[i]|64,th);
3846       }
3847     }
3848   }
3849 }
3850
#ifndef fconv_assemble
// Fallback compiled only when no fconv_assemble macro is defined:
// prints a diagnostic on stdout and terminates the process with status 1.
void fconv_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  fputs("Need fconv_assemble for this architecture.\n",stdout);
  exit(1);
}
#endif
3858
#if 0
// Disabled placeholder: everything up to the matching #endif is compiled out,
// so this definition never becomes part of the build.
void float_assemble(int i,struct regstat *i_regs)
{
  printf("Need float_assemble for this architecture.\n");
  exit(1);
}
#endif
3866
3867 void syscall_assemble(int i,struct regstat *i_regs)
3868 {
3869   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3870   assert(ccreg==HOST_CCREG);
3871   assert(!is_delayslot);
3872   emit_movimm(start+i*4,EAX); // Get PC
3873   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3874   emit_jmp((int)jump_syscall_hle); // XXX
3875 }
3876
3877 void hlecall_assemble(int i,struct regstat *i_regs)
3878 {
3879   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3880   assert(ccreg==HOST_CCREG);
3881   assert(!is_delayslot);
3882   emit_movimm(start+i*4+4,0); // Get PC
3883   emit_movimm((int)psxHLEt[source[i]&7],1);
3884   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // XXX
3885   emit_jmp((int)jump_hlecall);
3886 }
3887
3888 void intcall_assemble(int i,struct regstat *i_regs)
3889 {
3890   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3891   assert(ccreg==HOST_CCREG);
3892   assert(!is_delayslot);
3893   emit_movimm(start+i*4,0); // Get PC
3894   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG);
3895   emit_jmp((int)jump_intcall);
3896 }
3897
3898 void ds_assemble(int i,struct regstat *i_regs)
3899 {
3900   is_delayslot=1;
3901   switch(itype[i]) {
3902     case ALU:
3903       alu_assemble(i,i_regs);break;
3904     case IMM16:
3905       imm16_assemble(i,i_regs);break;
3906     case SHIFT:
3907       shift_assemble(i,i_regs);break;
3908     case SHIFTIMM:
3909       shiftimm_assemble(i,i_regs);break;
3910     case LOAD:
3911       load_assemble(i,i_regs);break;
3912     case LOADLR:
3913       loadlr_assemble(i,i_regs);break;
3914     case STORE:
3915       store_assemble(i,i_regs);break;
3916     case STORELR:
3917       storelr_assemble(i,i_regs);break;
3918     case COP0:
3919       cop0_assemble(i,i_regs);break;
3920     case COP1:
3921       cop1_assemble(i,i_regs);break;
3922     case C1LS:
3923       c1ls_assemble(i,i_regs);break;
3924     case COP2:
3925       cop2_assemble(i,i_regs);break;
3926     case C2LS:
3927       c2ls_assemble(i,i_regs);break;
3928     case C2OP:
3929       c2op_assemble(i,i_regs);break;
3930     case FCONV:
3931       fconv_assemble(i,i_regs);break;
3932     case FLOAT:
3933       float_assemble(i,i_regs);break;
3934     case FCOMP:
3935       fcomp_assemble(i,i_regs);break;
3936     case MULTDIV:
3937       multdiv_assemble(i,i_regs);break;
3938     case MOV:
3939       mov_assemble(i,i_regs);break;
3940     case SYSCALL:
3941     case HLECALL:
3942     case INTCALL:
3943     case SPAN:
3944     case UJUMP:
3945     case RJUMP:
3946     case CJUMP:
3947     case SJUMP:
3948     case FJUMP:
3949       printf("Jump in the delay slot.  This is probably a bug.\n");
3950   }
3951   is_delayslot=0;
3952 }
3953
3954 // Is the branch target a valid internal jump?
3955 int internal_branch(uint64_t i_is32,int addr)
3956 {
3957   if(addr&1) return 0; // Indirect (register) jump
3958   if(addr>=start && addr<start+slen*4-4)
3959   {
3960     int t=(addr-start)>>2;
3961     // Delay slots are not valid branch targets
3962     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3963     // 64 -> 32 bit transition requires a recompile
3964     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3965     {
3966       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3967       else printf("optimizable: yes\n");
3968     }*/
3969     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3970 #ifndef FORCE32
3971     if(requires_32bit[t]&~i_is32) return 0;
3972     else
3973 #endif
3974       return 1;
3975   }
3976   return 0;
3977 }
3978
3979 #ifndef wb_invalidate
3980 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3981   uint64_t u,uint64_t uu)
3982 {
3983   int hr;
3984   for(hr=0;hr<HOST_REGS;hr++) {
3985     if(hr!=EXCLUDE_REG) {
3986       if(pre[hr]!=entry[hr]) {
3987         if(pre[hr]>=0) {
3988           if((dirty>>hr)&1) {
3989             if(get_reg(entry,pre[hr])<0) {
3990               if(pre[hr]<64) {
3991                 if(!((u>>pre[hr])&1)) {
3992                   emit_storereg(pre[hr],hr);
3993                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3994                     emit_sarimm(hr,31,hr);
3995                     emit_storereg(pre[hr]|64,hr);
3996                   }
3997                 }
3998               }else{
3999                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
4000                   emit_storereg(pre[hr],hr);
4001                 }
4002               }
4003             }
4004           }
4005         }
4006       }
4007     }
4008   }
4009   // Move from one register to another (no writeback)
4010   for(hr=0;hr<HOST_REGS;hr++) {
4011     if(hr!=EXCLUDE_REG) {
4012       if(pre[hr]!=entry[hr]) {
4013         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
4014           int nr;
4015           if((nr=get_reg(entry,pre[hr]))>=0) {
4016             emit_mov(hr,nr);
4017           }
4018         }
4019       }
4020     }
4021   }
4022 }
4023 #endif
4024
4025 // Load the specified registers
4026 // This only loads the registers given as arguments because
4027 // we don't want to load things that will be overwritten
4028 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
4029 {
4030   int hr;
4031   // Load 32-bit regs
4032   for(hr=0;hr<HOST_REGS;hr++) {
4033     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4034       if(entry[hr]!=regmap[hr]) {
4035         if(regmap[hr]==rs1||regmap[hr]==rs2)
4036         {
4037           if(regmap[hr]==0) {
4038             emit_zeroreg(hr);
4039           }
4040           else
4041           {
4042             emit_loadreg(regmap[hr],hr);
4043           }
4044         }
4045       }
4046     }
4047   }
4048   //Load 64-bit regs
4049   for(hr=0;hr<HOST_REGS;hr++) {
4050     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4051       if(entry[hr]!=regmap[hr]) {
4052         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
4053         {
4054           assert(regmap[hr]!=64);
4055           if((is32>>(regmap[hr]&63))&1) {
4056             int lr=get_reg(regmap,regmap[hr]-64);
4057             if(lr>=0)
4058               emit_sarimm(lr,31,hr);
4059             else
4060               emit_loadreg(regmap[hr],hr);
4061           }
4062           else
4063           {
4064             emit_loadreg(regmap[hr],hr);
4065           }
4066         }
4067       }
4068     }
4069   }
4070 }
4071
4072 // Load registers prior to the start of a loop
4073 // so that they are not loaded within the loop
4074 static void loop_preload(signed char pre[],signed char entry[])
4075 {
4076   int hr;
4077   for(hr=0;hr<HOST_REGS;hr++) {
4078     if(hr!=EXCLUDE_REG) {
4079       if(pre[hr]!=entry[hr]) {
4080         if(entry[hr]>=0) {
4081           if(get_reg(pre,entry[hr])<0) {
4082             assem_debug("loop preload:\n");
4083             //printf("loop preload: %d\n",hr);
4084             if(entry[hr]==0) {
4085               emit_zeroreg(hr);
4086             }
4087             else if(entry[hr]<TEMPREG)
4088             {
4089               emit_loadreg(entry[hr],hr);
4090             }
4091             else if(entry[hr]-64<TEMPREG)
4092             {
4093               emit_loadreg(entry[hr],hr);
4094             }
4095           }
4096         }
4097       }
4098     }
4099   }
4100 }
4101
4102 // Generate address for load/store instruction
4103 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
4104 void address_generation(int i,struct regstat *i_regs,signed char entry[])
4105 {
4106   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
4107     int ra=-1;
4108     int agr=AGEN1+(i&1);
4109     int mgr=MGEN1+(i&1);
4110     if(itype[i]==LOAD) {
4111       ra=get_reg(i_regs->regmap,rt1[i]);
4112       if(ra<0) ra=get_reg(i_regs->regmap,-1); 
4113       assert(ra>=0);
4114     }
4115     if(itype[i]==LOADLR) {
4116       ra=get_reg(i_regs->regmap,FTEMP);
4117     }
4118     if(itype[i]==STORE||itype[i]==STORELR) {
4119       ra=get_reg(i_regs->regmap,agr);
4120       if(ra<0) ra=get_reg(i_regs->regmap,-1);
4121     }
4122     if(itype[i]==C1LS||itype[i]==C2LS) {
4123       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
4124         ra=get_reg(i_regs->regmap,FTEMP);
4125       else { // SWC1/SDC1/SWC2/SDC2
4126         ra=get_reg(i_regs->regmap,agr);
4127         if(ra<0) ra=get_reg(i_regs->regmap,-1);
4128       }
4129     }
4130     int rs=get_reg(i_regs->regmap,rs1[i]);
4131     int rm=get_reg(i_regs->regmap,TLREG);
4132     if(ra>=0) {
4133       int offset=imm[i];
4134       int c=(i_regs->wasconst>>rs)&1;
4135       if(rs1[i]==0) {
4136         // Using r0 as a base address
4137         /*if(rm>=0) {
4138           if(!entry||entry[rm]!=mgr) {
4139             generate_map_const(offset,rm);
4140           } // else did it in the previous cycle
4141         }*/
4142         if(!entry||entry[ra]!=agr) {
4143           if (opcode[i]==0x22||opcode[i]==0x26) {
4144             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4145           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4146             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4147           }else{
4148             emit_movimm(offset,ra);
4149           }
4150         } // else did it in the previous cycle
4151       }
4152       else if(rs<0) {
4153         if(!entry||entry[ra]!=rs1[i])
4154           emit_loadreg(rs1[i],ra);
4155         //if(!entry||entry[ra]!=rs1[i])
4156         //  printf("poor load scheduling!\n");
4157       }
4158       else if(c) {
4159         if(rm>=0) {
4160           if(!entry||entry[rm]!=mgr) {
4161             if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a) {
4162               // Stores to memory go thru the mapper to detect self-modifying
4163               // code, loads don't.
4164               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
4165                  (unsigned int)(constmap[i][rs]+offset)<0x80000000+RAM_SIZE )
4166                 generate_map_const(constmap[i][rs]+offset,rm);
4167             }else{
4168               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
4169                 generate_map_const(constmap[i][rs]+offset,rm);
4170             }
4171           }
4172         }
4173         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
4174           if(!entry||entry[ra]!=agr) {
4175             if (opcode[i]==0x22||opcode[i]==0x26) {
4176               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4177             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4178               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4179             }else{
4180               #ifdef HOST_IMM_ADDR32
4181               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4182                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
4183               #endif
4184               emit_movimm(constmap[i][rs]+offset,ra);
4185             }
4186           } // else did it in the previous cycle
4187         } // else load_consts already did it
4188       }
4189       if(offset&&!c&&rs1[i]) {
4190         if(rs>=0) {
4191           emit_addimm(rs,offset,ra);
4192         }else{
4193           emit_addimm(ra,offset,ra);
4194         }
4195       }
4196     }
4197   }
4198   // Preload constants for next instruction
4199   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
4200     int agr,ra;
4201     #ifndef HOST_IMM_ADDR32
4202     // Mapper entry
4203     agr=MGEN1+((i+1)&1);
4204     ra=get_reg(i_regs->regmap,agr);
4205     if(ra>=0) {
4206       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4207       int offset=imm[i+1];
4208       int c=(regs[i+1].wasconst>>rs)&1;
4209       if(c) {
4210         if(itype[i+1]==STORE||itype[i+1]==STORELR
4211            ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1, SWC2/SDC2
4212           // Stores to memory go thru the mapper to detect self-modifying
4213           // code, loads don't.
4214           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
4215              (unsigned int)(constmap[i+1][rs]+offset)<0x80000000+RAM_SIZE )
4216             generate_map_const(constmap[i+1][rs]+offset,ra);
4217         }else{
4218           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
4219             generate_map_const(constmap[i+1][rs]+offset,ra);
4220         }
4221       }
4222       /*else if(rs1[i]==0) {
4223         generate_map_const(offset,ra);
4224       }*/
4225     }
4226     #endif
4227     // Actual address
4228     agr=AGEN1+((i+1)&1);
4229     ra=get_reg(i_regs->regmap,agr);
4230     if(ra>=0) {
4231       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4232       int offset=imm[i+1];
4233       int c=(regs[i+1].wasconst>>rs)&1;
4234       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
4235         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4236           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4237         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4238           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4239         }else{
4240           #ifdef HOST_IMM_ADDR32
4241           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4242              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
4243           #endif
4244           emit_movimm(constmap[i+1][rs]+offset,ra);
4245         }
4246       }
4247       else if(rs1[i+1]==0) {
4248         // Using r0 as a base address
4249         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4250           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4251         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4252           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4253         }else{
4254           emit_movimm(offset,ra);
4255         }
4256       }
4257     }
4258   }
4259 }
4260
4261 int get_final_value(int hr, int i, int *value)
4262 {
4263   int reg=regs[i].regmap[hr];
4264   while(i<slen-1) {
4265     if(regs[i+1].regmap[hr]!=reg) break;
4266     if(!((regs[i+1].isconst>>hr)&1)) break;
4267     if(bt[i+1]) break;
4268     i++;
4269   }
4270   if(i<slen-1) {
4271     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
4272       *value=constmap[i][hr];
4273       return 1;
4274     }
4275     if(!bt[i+1]) {
4276       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
4277         // Load in delay slot, out-of-order execution
4278         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
4279         {
4280           #ifdef HOST_IMM_ADDR32
4281           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
4282           #endif
4283           // Precompute load address
4284           *value=constmap[i][hr]+imm[i+2];
4285           return 1;
4286         }
4287       }
4288       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
4289       {
4290         #ifdef HOST_IMM_ADDR32
4291         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
4292         #endif
4293         // Precompute load address
4294         *value=constmap[i][hr]+imm[i+1];
4295         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
4296         return 1;
4297       }
4298     }
4299   }
4300   *value=constmap[i][hr];
4301   //printf("c=%x\n",(int)constmap[i][hr]);
4302   if(i==slen-1) return 1;
4303   if(reg<64) {
4304     return !((unneeded_reg[i+1]>>reg)&1);
4305   }else{
4306     return !((unneeded_reg_upper[i+1]>>reg)&1);
4307   }
4308 }
4309
4310 // Load registers with known constants
4311 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
4312 {
4313   int hr;
4314   // Load 32-bit regs
4315   for(hr=0;hr<HOST_REGS;hr++) {
4316     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4317       //if(entry[hr]!=regmap[hr]) {
4318       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4319         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4320           int value;
4321           if(get_final_value(hr,i,&value)) {
4322             if(value==0) {
4323               emit_zeroreg(hr);
4324             }
4325             else {
4326               emit_movimm(value,hr);
4327             }
4328           }
4329         }
4330       }
4331     }
4332   }
4333   // Load 64-bit regs
4334   for(hr=0;hr<HOST_REGS;hr++) {
4335     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4336       //if(entry[hr]!=regmap[hr]) {
4337       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4338         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4339           if((is32>>(regmap[hr]&63))&1) {
4340             int lr=get_reg(regmap,regmap[hr]-64);
4341             assert(lr>=0);
4342             emit_sarimm(lr,31,hr);
4343           }
4344           else
4345           {
4346             int value;
4347             if(get_final_value(hr,i,&value)) {
4348               if(value==0) {
4349                 emit_zeroreg(hr);
4350               }
4351               else {
4352                 emit_movimm(value,hr);
4353               }
4354             }
4355           }
4356         }
4357       }
4358     }
4359   }
4360 }
4361 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4362 {
4363   int hr;
4364   // Load 32-bit regs
4365   for(hr=0;hr<HOST_REGS;hr++) {
4366     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4367       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4368         int value=constmap[i][hr];
4369         if(value==0) {
4370           emit_zeroreg(hr);
4371         }
4372         else {
4373           emit_movimm(value,hr);
4374         }
4375       }
4376     }
4377   }
4378   // Load 64-bit regs
4379   for(hr=0;hr<HOST_REGS;hr++) {
4380     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4381       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4382         if((is32>>(regmap[hr]&63))&1) {
4383           int lr=get_reg(regmap,regmap[hr]-64);
4384           assert(lr>=0);
4385           emit_sarimm(lr,31,hr);
4386         }
4387         else
4388         {
4389           int value=constmap[i][hr];
4390           if(value==0) {
4391             emit_zeroreg(hr);
4392           }
4393           else {
4394             emit_movimm(value,hr);
4395           }
4396         }
4397       }
4398     }
4399   }
4400 }
4401
4402 // Write out all dirty registers (except cycle count)
4403 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4404 {
4405   int hr;
4406   for(hr=0;hr<HOST_REGS;hr++) {
4407     if(hr!=EXCLUDE_REG) {
4408       if(i_regmap[hr]>0) {
4409         if(i_regmap[hr]!=CCREG) {
4410           if((i_dirty>>hr)&1) {
4411             if(i_regmap[hr]<64) {
4412               emit_storereg(i_regmap[hr],hr);
4413 #ifndef FORCE32
4414               if( ((i_is32>>i_regmap[hr])&1) ) {
4415                 #ifdef DESTRUCTIVE_WRITEBACK
4416                 emit_sarimm(hr,31,hr);
4417                 emit_storereg(i_regmap[hr]|64,hr);
4418                 #else
4419                 emit_sarimm(hr,31,HOST_TEMPREG);
4420                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4421                 #endif
4422               }
4423 #endif
4424             }else{
4425               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4426                 emit_storereg(i_regmap[hr],hr);
4427               }
4428             }
4429           }
4430         }
4431       }
4432     }
4433   }
4434 }
4435 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4436 // This writes the registers not written by store_regs_bt
4437 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4438 {
4439   int hr;
4440   int t=(addr-start)>>2;
4441   for(hr=0;hr<HOST_REGS;hr++) {
4442     if(hr!=EXCLUDE_REG) {
4443       if(i_regmap[hr]>0) {
4444         if(i_regmap[hr]!=CCREG) {
4445           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4446             if((i_dirty>>hr)&1) {
4447               if(i_regmap[hr]<64) {
4448                 emit_storereg(i_regmap[hr],hr);
4449 #ifndef FORCE32
4450                 if( ((i_is32>>i_regmap[hr])&1) ) {
4451                   #ifdef DESTRUCTIVE_WRITEBACK
4452                   emit_sarimm(hr,31,hr);
4453                   emit_storereg(i_regmap[hr]|64,hr);
4454                   #else
4455                   emit_sarimm(hr,31,HOST_TEMPREG);
4456                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4457                   #endif
4458                 }
4459 #endif
4460               }else{
4461                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4462                   emit_storereg(i_regmap[hr],hr);
4463                 }
4464               }
4465             }
4466           }
4467         }
4468       }
4469     }
4470   }
4471 }
4472
4473 // Load all registers (except cycle count)
4474 void load_all_regs(signed char i_regmap[])
4475 {
4476   int hr;
4477   for(hr=0;hr<HOST_REGS;hr++) {
4478     if(hr!=EXCLUDE_REG) {
4479       if(i_regmap[hr]==0) {
4480         emit_zeroreg(hr);
4481       }
4482       else
4483       if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4484       {
4485         emit_loadreg(i_regmap[hr],hr);
4486       }
4487     }
4488   }
4489 }
4490
4491 // Load all current registers also needed by next instruction
4492 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4493 {
4494   int hr;
4495   for(hr=0;hr<HOST_REGS;hr++) {
4496     if(hr!=EXCLUDE_REG) {
4497       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4498         if(i_regmap[hr]==0) {
4499           emit_zeroreg(hr);
4500         }
4501         else
4502         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4503         {
4504           emit_loadreg(i_regmap[hr],hr);
4505         }
4506       }
4507     }
4508   }
4509 }
4510
4511 // Load all regs, storing cycle count if necessary
4512 void load_regs_entry(int t)
4513 {
4514   int hr;
4515   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG);
4516   else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG);
4517   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4518     emit_storereg(CCREG,HOST_CCREG);
4519   }
4520   // Load 32-bit regs
4521   for(hr=0;hr<HOST_REGS;hr++) {
4522     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4523       if(regs[t].regmap_entry[hr]==0) {
4524         emit_zeroreg(hr);
4525       }
4526       else if(regs[t].regmap_entry[hr]!=CCREG)
4527       {
4528         emit_loadreg(regs[t].regmap_entry[hr],hr);
4529       }
4530     }
4531   }
4532   // Load 64-bit regs
4533   for(hr=0;hr<HOST_REGS;hr++) {
4534     if(regs[t].regmap_entry[hr]>=64) {
4535       assert(regs[t].regmap_entry[hr]!=64);
4536       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4537         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4538         if(lr<0) {
4539           emit_loadreg(regs[t].regmap_entry[hr],hr);
4540         }
4541         else
4542         {
4543           emit_sarimm(lr,31,hr);
4544         }
4545       }
4546       else
4547       {
4548         emit_loadreg(regs[t].regmap_entry[hr],hr);
4549       }
4550     }
4551   }
4552 }
4553
4554 // Store dirty registers prior to branch
4555 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4556 {
4557   if(internal_branch(i_is32,addr))
4558   {
4559     int t=(addr-start)>>2;
4560     int hr;
4561     for(hr=0;hr<HOST_REGS;hr++) {
4562       if(hr!=EXCLUDE_REG) {
4563         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4564           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4565             if((i_dirty>>hr)&1) {
4566               if(i_regmap[hr]<64) {
4567                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4568                   emit_storereg(i_regmap[hr],hr);
4569                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4570                     #ifdef DESTRUCTIVE_WRITEBACK
4571                     emit_sarimm(hr,31,hr);
4572                     emit_storereg(i_regmap[hr]|64,hr);
4573                     #else
4574                     emit_sarimm(hr,31,HOST_TEMPREG);
4575                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4576                     #endif
4577                   }
4578                 }
4579               }else{
4580                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4581                   emit_storereg(i_regmap[hr],hr);
4582                 }
4583               }
4584             }
4585           }
4586         }
4587       }
4588     }
4589   }
4590   else
4591   {
4592     // Branch out of this block, write out all dirty regs
4593     wb_dirtys(i_regmap,i_is32,i_dirty);
4594   }
4595 }
4596
4597 // Load all needed registers for branch target
4598 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4599 {
4600   //if(addr>=start && addr<(start+slen*4))
4601   if(internal_branch(i_is32,addr))
4602   {
4603     int t=(addr-start)>>2;
4604     int hr;
4605     // Store the cycle count before loading something else
4606     if(i_regmap[HOST_CCREG]!=CCREG) {
4607       assert(i_regmap[HOST_CCREG]==-1);
4608     }
4609     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4610       emit_storereg(CCREG,HOST_CCREG);
4611     }
4612     // Load 32-bit regs
4613     for(hr=0;hr<HOST_REGS;hr++) {
4614       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4615         #ifdef DESTRUCTIVE_WRITEBACK
4616         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4617         #else
4618         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4619         #endif
4620           if(regs[t].regmap_entry[hr]==0) {
4621             emit_zeroreg(hr);
4622           }
4623           else if(regs[t].regmap_entry[hr]!=CCREG)
4624           {
4625             emit_loadreg(regs[t].regmap_entry[hr],hr);
4626           }
4627         }
4628       }
4629     }
4630     //Load 64-bit regs
4631     for(hr=0;hr<HOST_REGS;hr++) {
4632       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64) {
4633         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4634           assert(regs[t].regmap_entry[hr]!=64);
4635           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4636             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4637             if(lr<0) {
4638               emit_loadreg(regs[t].regmap_entry[hr],hr);
4639             }
4640             else
4641             {
4642               emit_sarimm(lr,31,hr);
4643             }
4644           }
4645           else
4646           {
4647             emit_loadreg(regs[t].regmap_entry[hr],hr);
4648           }
4649         }
4650         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4651           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4652           assert(lr>=0);
4653           emit_sarimm(lr,31,hr);
4654         }
4655       }
4656     }
4657   }
4658 }
4659
4660 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4661 {
4662   if(addr>=start && addr<start+slen*4-4)
4663   {
4664     int t=(addr-start)>>2;
4665     int hr;
4666     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4667     for(hr=0;hr<HOST_REGS;hr++)
4668     {
4669       if(hr!=EXCLUDE_REG)
4670       {
4671         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4672         {
4673           if(regs[t].regmap_entry[hr]!=-1)
4674           {
4675             return 0;
4676           }
4677           else 
4678           if((i_dirty>>hr)&1)
4679           {
4680             if(i_regmap[hr]<64)
4681             {
4682               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4683                 return 0;
4684             }
4685             else
4686             {
4687               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4688                 return 0;
4689             }
4690           }
4691         }
4692         else // Same register but is it 32-bit or dirty?
4693         if(i_regmap[hr]>=0)
4694         {
4695           if(!((regs[t].dirty>>hr)&1))
4696           {
4697             if((i_dirty>>hr)&1)
4698             {
4699               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4700               {
4701                 //printf("%x: dirty no match\n",addr);
4702                 return 0;
4703               }
4704             }
4705           }
4706           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4707           {
4708             //printf("%x: is32 no match\n",addr);
4709             return 0;
4710           }
4711         }
4712       }
4713     }
4714     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4715 #ifndef FORCE32
4716     if(requires_32bit[t]&~i_is32) return 0;
4717 #endif
4718     // Delay slots are not valid branch targets
4719     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4720     // Delay slots require additional processing, so do not match
4721     if(is_ds[t]) return 0;
4722   }
4723   else
4724   {
4725     int hr;
4726     for(hr=0;hr<HOST_REGS;hr++)
4727     {
4728       if(hr!=EXCLUDE_REG)
4729       {
4730         if(i_regmap[hr]>=0)
4731         {
4732           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4733           {
4734             if((i_dirty>>hr)&1)
4735             {
4736               return 0;
4737             }
4738           }
4739         }
4740       }
4741     }
4742   }
4743   return 1;
4744 }
4745
4746 // Used when a branch jumps into the delay slot of another branch
4747 void ds_assemble_entry(int i)
4748 {
4749   int t=(ba[i]-start)>>2;
4750   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4751   assem_debug("Assemble delay slot at %x\n",ba[i]);
4752   assem_debug("<->\n");
4753   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4754     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4755   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4756   address_generation(t,&regs[t],regs[t].regmap_entry);
4757   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4758     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4759   cop1_usable=0;
4760   is_delayslot=0;
4761   switch(itype[t]) {
4762     case ALU:
4763       alu_assemble(t,&regs[t]);break;
4764     case IMM16:
4765       imm16_assemble(t,&regs[t]);break;
4766     case SHIFT:
4767       shift_assemble(t,&regs[t]);break;
4768     case SHIFTIMM:
4769       shiftimm_assemble(t,&regs[t]);break;
4770     case LOAD:
4771       load_assemble(t,&regs[t]);break;
4772     case LOADLR:
4773       loadlr_assemble(t,&regs[t]);break;
4774     case STORE:
4775       store_assemble(t,&regs[t]);break;
4776     case STORELR:
4777       storelr_assemble(t,&regs[t]);break;
4778     case COP0:
4779       cop0_assemble(t,&regs[t]);break;
4780     case COP1:
4781       cop1_assemble(t,&regs[t]);break;
4782     case C1LS:
4783       c1ls_assemble(t,&regs[t]);break;
4784     case COP2:
4785       cop2_assemble(t,&regs[t]);break;
4786     case C2LS:
4787       c2ls_assemble(t,&regs[t]);break;
4788     case C2OP:
4789       c2op_assemble(t,&regs[t]);break;
4790     case FCONV:
4791       fconv_assemble(t,&regs[t]);break;
4792     case FLOAT:
4793       float_assemble(t,&regs[t]);break;
4794     case FCOMP:
4795       fcomp_assemble(t,&regs[t]);break;
4796     case MULTDIV:
4797       multdiv_assemble(t,&regs[t]);break;
4798     case MOV:
4799       mov_assemble(t,&regs[t]);break;
4800     case SYSCALL:
4801     case HLECALL:
4802     case INTCALL:
4803     case SPAN:
4804     case UJUMP:
4805     case RJUMP:
4806     case CJUMP:
4807     case SJUMP:
4808     case FJUMP:
4809       printf("Jump in the delay slot.  This is probably a bug.\n");
4810   }
4811   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4812   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4813   if(internal_branch(regs[t].is32,ba[i]+4))
4814     assem_debug("branch: internal\n");
4815   else
4816     assem_debug("branch: external\n");
4817   assert(internal_branch(regs[t].is32,ba[i]+4));
4818   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4819   emit_jmp(0);
4820 }
4821
4822 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4823 {
4824   int count;
4825   int jaddr;
4826   int idle=0;
4827   if(itype[i]==RJUMP)
4828   {
4829     *adj=0;
4830   }
4831   //if(ba[i]>=start && ba[i]<(start+slen*4))
4832   if(internal_branch(branch_regs[i].is32,ba[i]))
4833   {
4834     int t=(ba[i]-start)>>2;
4835     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4836     else *adj=ccadj[t];
4837   }
4838   else
4839   {
4840     *adj=0;
4841   }
4842   count=ccadj[i];
4843   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4844     // Idle loop
4845     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4846     idle=(int)out;
4847     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4848     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4849     jaddr=(int)out;
4850     emit_jmp(0);
4851   }
4852   else if(*adj==0||invert) {
4853     emit_addimm_and_set_flags(CLOCK_DIVIDER*(count+2),HOST_CCREG);
4854     jaddr=(int)out;
4855     emit_jns(0);
4856   }
4857   else
4858   {
4859     emit_cmpimm(HOST_CCREG,-2*(count+2));
4860     jaddr=(int)out;
4861     emit_jns(0);
4862   }
4863   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4864 }
4865
4866 void do_ccstub(int n)
4867 {
4868   literal_pool(256);
4869   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4870   set_jump_target(stubs[n][1],(int)out);
4871   int i=stubs[n][4];
4872   if(stubs[n][6]==NULLDS) {
4873     // Delay slot instruction is nullified ("likely" branch)
4874     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4875   }
4876   else if(stubs[n][6]!=TAKEN) {
4877     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4878   }
4879   else {
4880     if(internal_branch(branch_regs[i].is32,ba[i]))
4881       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4882   }
4883   if(stubs[n][5]!=-1)
4884   {
4885     // Save PC as return address
4886     emit_movimm(stubs[n][5],EAX);
4887     emit_writeword(EAX,(int)&pcaddr);
4888   }
4889   else
4890   {
4891     // Return address depends on which way the branch goes
4892     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4893     {
4894       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4895       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4896       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4897       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4898       if(rs1[i]==0)
4899       {
4900         s1l=s2l;s1h=s2h;
4901         s2l=s2h=-1;
4902       }
4903       else if(rs2[i]==0)
4904       {
4905         s2l=s2h=-1;
4906       }
4907       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4908         s1h=s2h=-1;
4909       }
4910       assert(s1l>=0);
4911       #ifdef DESTRUCTIVE_WRITEBACK
4912       if(rs1[i]) {
4913         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4914           emit_loadreg(rs1[i],s1l);
4915       } 
4916       else {
4917         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4918           emit_loadreg(rs2[i],s1l);
4919       }
4920       if(s2l>=0)
4921         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4922           emit_loadreg(rs2[i],s2l);
4923       #endif
4924       int hr=0;
4925       int addr=-1,alt=-1,ntaddr=-1;
4926       while(hr<HOST_REGS)
4927       {
4928         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4929            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4930            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4931         {
4932           addr=hr++;break;
4933         }
4934         hr++;
4935       }
4936       while(hr<HOST_REGS)
4937       {
4938         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4939            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4940            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4941         {
4942           alt=hr++;break;
4943         }
4944         hr++;
4945       }
4946       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4947       {
4948         while(hr<HOST_REGS)
4949         {
4950           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4951              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4952              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4953           {
4954             ntaddr=hr;break;
4955           }
4956           hr++;
4957         }
4958         assert(hr<HOST_REGS);
4959       }
4960       if((opcode[i]&0x2f)==4) // BEQ
4961       {
4962         #ifdef HAVE_CMOV_IMM
4963         if(s1h<0) {
4964           if(s2l>=0) emit_cmp(s1l,s2l);
4965           else emit_test(s1l,s1l);
4966           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4967         }
4968         else
4969         #endif
4970         {
4971           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4972           if(s1h>=0) {
4973             if(s2h>=0) emit_cmp(s1h,s2h);
4974             else emit_test(s1h,s1h);
4975             emit_cmovne_reg(alt,addr);
4976           }
4977           if(s2l>=0) emit_cmp(s1l,s2l);
4978           else emit_test(s1l,s1l);
4979           emit_cmovne_reg(alt,addr);
4980         }
4981       }
4982       if((opcode[i]&0x2f)==5) // BNE
4983       {
4984         #ifdef HAVE_CMOV_IMM
4985         if(s1h<0) {
4986           if(s2l>=0) emit_cmp(s1l,s2l);
4987           else emit_test(s1l,s1l);
4988           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4989         }
4990         else
4991         #endif
4992         {
4993           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4994           if(s1h>=0) {
4995             if(s2h>=0) emit_cmp(s1h,s2h);
4996             else emit_test(s1h,s1h);
4997             emit_cmovne_reg(alt,addr);
4998           }
4999           if(s2l>=0) emit_cmp(s1l,s2l);
5000           else emit_test(s1l,s1l);
5001           emit_cmovne_reg(alt,addr);
5002         }
5003       }
5004       if((opcode[i]&0x2f)==6) // BLEZ
5005       {
5006         //emit_movimm(ba[i],alt);
5007         //emit_movimm(start+i*4+8,addr);
5008         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5009         emit_cmpimm(s1l,1);
5010         if(s1h>=0) emit_mov(addr,ntaddr);
5011         emit_cmovl_reg(alt,addr);
5012         if(s1h>=0) {
5013           emit_test(s1h,s1h);
5014           emit_cmovne_reg(ntaddr,addr);
5015           emit_cmovs_reg(alt,addr);
5016         }
5017       }
5018       if((opcode[i]&0x2f)==7) // BGTZ
5019       {
5020         //emit_movimm(ba[i],addr);
5021         //emit_movimm(start+i*4+8,ntaddr);
5022         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5023         emit_cmpimm(s1l,1);
5024         if(s1h>=0) emit_mov(addr,alt);
5025         emit_cmovl_reg(ntaddr,addr);
5026         if(s1h>=0) {
5027           emit_test(s1h,s1h);
5028           emit_cmovne_reg(alt,addr);
5029           emit_cmovs_reg(ntaddr,addr);
5030         }
5031       }
5032       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
5033       {
5034         //emit_movimm(ba[i],alt);
5035         //emit_movimm(start+i*4+8,addr);
5036         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5037         if(s1h>=0) emit_test(s1h,s1h);
5038         else emit_test(s1l,s1l);
5039         emit_cmovs_reg(alt,addr);
5040       }
5041       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
5042       {
5043         //emit_movimm(ba[i],addr);
5044         //emit_movimm(start+i*4+8,alt);
5045         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5046         if(s1h>=0) emit_test(s1h,s1h);
5047         else emit_test(s1l,s1l);
5048         emit_cmovs_reg(alt,addr);
5049       }
5050       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
5051         if(source[i]&0x10000) // BC1T
5052         {
5053           //emit_movimm(ba[i],alt);
5054           //emit_movimm(start+i*4+8,addr);
5055           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5056           emit_testimm(s1l,0x800000);
5057           emit_cmovne_reg(alt,addr);
5058         }
5059         else // BC1F
5060         {
5061           //emit_movimm(ba[i],addr);
5062           //emit_movimm(start+i*4+8,alt);
5063           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5064           emit_testimm(s1l,0x800000);
5065           emit_cmovne_reg(alt,addr);
5066         }
5067       }
5068       emit_writeword(addr,(int)&pcaddr);
5069     }
5070     else
5071     if(itype[i]==RJUMP)
5072     {
5073       int r=get_reg(branch_regs[i].regmap,rs1[i]);
5074       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5075         r=get_reg(branch_regs[i].regmap,RTEMP);
5076       }
5077       emit_writeword(r,(int)&pcaddr);
5078     }
5079     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
5080   }
5081   // Update cycle count
5082   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
5083   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
5084   emit_call((int)cc_interrupt);
5085   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
5086   if(stubs[n][6]==TAKEN) {
5087     if(internal_branch(branch_regs[i].is32,ba[i]))
5088       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
5089     else if(itype[i]==RJUMP) {
5090       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
5091         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
5092       else
5093         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
5094     }
5095   }else if(stubs[n][6]==NOTTAKEN) {
5096     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
5097     else load_all_regs(branch_regs[i].regmap);
5098   }else if(stubs[n][6]==NULLDS) {
5099     // Delay slot instruction is nullified ("likely" branch)
5100     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
5101     else load_all_regs(regs[i].regmap);
5102   }else{
5103     load_all_regs(branch_regs[i].regmap);
5104   }
5105   emit_jmp(stubs[n][2]); // return address
5106   
5107   /* This works but uses a lot of memory...
5108   emit_readword((int)&last_count,ECX);
5109   emit_add(HOST_CCREG,ECX,EAX);
5110   emit_writeword(EAX,(int)&Count);
5111   emit_call((int)gen_interupt);
5112   emit_readword((int)&Count,HOST_CCREG);
5113   emit_readword((int)&next_interupt,EAX);
5114   emit_readword((int)&pending_exception,EBX);
5115   emit_writeword(EAX,(int)&last_count);
5116   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
5117   emit_test(EBX,EBX);
5118   int jne_instr=(int)out;
5119   emit_jne(0);
5120   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
5121   load_all_regs(branch_regs[i].regmap);
5122   emit_jmp(stubs[n][2]); // return address
5123   set_jump_target(jne_instr,(int)out);
5124   emit_readword((int)&pcaddr,EAX);
5125   // Call get_addr_ht instead of doing the hash table here.
5126   // This code is executed infrequently and takes up a lot of space
5127   // so smaller is better.
5128   emit_storereg(CCREG,HOST_CCREG);
5129   emit_pushreg(EAX);
5130   emit_call((int)get_addr_ht);
5131   emit_loadreg(CCREG,HOST_CCREG);
5132   emit_addimm(ESP,4,ESP);
5133   emit_jmpreg(EAX);*/
5134 }
5135
5136 add_to_linker(int addr,int target,int ext)
5137 {
5138   link_addr[linkcount][0]=addr;
5139   link_addr[linkcount][1]=target;
5140   link_addr[linkcount][2]=ext;  
5141   linkcount++;
5142 }
5143
5144 void ujump_assemble(int i,struct regstat *i_regs)
5145 {
5146   signed char *i_regmap=i_regs->regmap;
5147   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5148   address_generation(i+1,i_regs,regs[i].regmap_entry);
5149   #ifdef REG_PREFETCH
5150   int temp=get_reg(branch_regs[i].regmap,PTEMP);
5151   if(rt1[i]==31&&temp>=0) 
5152   {
5153     int return_address=start+i*4+8;
5154     if(get_reg(branch_regs[i].regmap,31)>0) 
5155     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5156   }
5157   #endif
5158   if(rt1[i]==31) {
5159     int rt;
5160     unsigned int return_address;
5161     rt=get_reg(branch_regs[i].regmap,31);
5162     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5163     //assert(rt>=0);
5164     return_address=start+i*4+8;
5165     if(rt>=0) {
5166       #ifdef USE_MINI_HT
5167       if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
5168         int temp=-1; // note: must be ds-safe
5169         #ifdef HOST_TEMPREG
5170         temp=HOST_TEMPREG;
5171         #endif
5172         if(temp>=0) do_miniht_insert(return_address,rt,temp);
5173         else emit_movimm(return_address,rt);
5174       }
5175       else
5176       #endif
5177       {
5178         #ifdef REG_PREFETCH
5179         if(temp>=0) 
5180         {
5181           if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5182         }
5183         #endif
5184         emit_movimm(return_address,rt); // PC into link register
5185         #ifdef IMM_PREFETCH
5186         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5187         #endif
5188       }
5189     }
5190   }
5191   ds_assemble(i+1,i_regs);
5192   uint64_t bc_unneeded=branch_regs[i].u;
5193   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5194   bc_unneeded|=1|(1LL<<rt1[i]);
5195   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5196   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5197                 bc_unneeded,bc_unneeded_upper);
5198   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5199   int cc,adj;
5200   cc=get_reg(branch_regs[i].regmap,CCREG);
5201   assert(cc==HOST_CCREG);
5202   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5203   #ifdef REG_PREFETCH
5204   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5205   #endif
5206   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5207   if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5208   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5209   if(internal_branch(branch_regs[i].is32,ba[i]))
5210     assem_debug("branch: internal\n");
5211   else
5212     assem_debug("branch: external\n");
5213   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
5214     ds_assemble_entry(i);
5215   }
5216   else {
5217     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
5218     emit_jmp(0);
5219   }
5220 }
5221
5222 void rjump_assemble(int i,struct regstat *i_regs)
5223 {
5224   signed char *i_regmap=i_regs->regmap;
5225   int temp;
5226   int rs,cc,adj;
5227   rs=get_reg(branch_regs[i].regmap,rs1[i]);
5228   assert(rs>=0);
5229   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5230     // Delay slot abuse, make a copy of the branch address register
5231     temp=get_reg(branch_regs[i].regmap,RTEMP);
5232     assert(temp>=0);
5233     assert(regs[i].regmap[temp]==RTEMP);
5234     emit_mov(rs,temp);
5235     rs=temp;
5236   }
5237   address_generation(i+1,i_regs,regs[i].regmap_entry);
5238   #ifdef REG_PREFETCH
5239   if(rt1[i]==31) 
5240   {
5241     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
5242       int return_address=start+i*4+8;
5243       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5244     }
5245   }
5246   #endif
5247   #ifdef USE_MINI_HT
5248   if(rs1[i]==31) {
5249     int rh=get_reg(regs[i].regmap,RHASH);
5250     if(rh>=0) do_preload_rhash(rh);
5251   }
5252   #endif
5253   ds_assemble(i+1,i_regs);
5254   uint64_t bc_unneeded=branch_regs[i].u;
5255   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5256   bc_unneeded|=1|(1LL<<rt1[i]);
5257   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5258   bc_unneeded&=~(1LL<<rs1[i]);
5259   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5260                 bc_unneeded,bc_unneeded_upper);
5261   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
5262   if(rt1[i]!=0) {
5263     int rt,return_address;
5264     assert(rt1[i+1]!=rt1[i]);
5265     assert(rt2[i+1]!=rt1[i]);
5266     rt=get_reg(branch_regs[i].regmap,rt1[i]);
5267     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5268     assert(rt>=0);
5269     return_address=start+i*4+8;
5270     #ifdef REG_PREFETCH
5271     if(temp>=0) 
5272     {
5273       if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5274     }
5275     #endif
5276     emit_movimm(return_address,rt); // PC into link register
5277     #ifdef IMM_PREFETCH
5278     emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5279     #endif
5280   }
5281   cc=get_reg(branch_regs[i].regmap,CCREG);
5282   assert(cc==HOST_CCREG);
5283   #ifdef USE_MINI_HT
5284   int rh=get_reg(branch_regs[i].regmap,RHASH);
5285   int ht=get_reg(branch_regs[i].regmap,RHTBL);
5286   if(rs1[i]==31) {
5287     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
5288     do_preload_rhtbl(ht);
5289     do_rhash(rs,rh);
5290   }
5291   #endif
5292   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5293   #ifdef DESTRUCTIVE_WRITEBACK
5294   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
5295     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
5296       emit_loadreg(rs1[i],rs);
5297     }
5298   }
5299   #endif
5300   #ifdef REG_PREFETCH
5301   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5302   #endif
5303   #ifdef USE_MINI_HT
5304   if(rs1[i]==31) {
5305     do_miniht_load(ht,rh);
5306   }
5307   #endif
5308   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
5309   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
5310   //assert(adj==0);
5311   emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5312   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
5313   emit_jns(0);
5314   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5315   #ifdef USE_MINI_HT
5316   if(rs1[i]==31) {
5317     do_miniht_jump(rs,rh,ht);
5318   }
5319   else
5320   #endif
5321   {
5322     //if(rs!=EAX) emit_mov(rs,EAX);
5323     //emit_jmp((int)jump_vaddr_eax);
5324     emit_jmp(jump_vaddr_reg[rs]);
5325   }
5326   /* Check hash table
5327   temp=!rs;
5328   emit_mov(rs,temp);
5329   emit_shrimm(rs,16,rs);
5330   emit_xor(temp,rs,rs);
5331   emit_movzwl_reg(rs,rs);
5332   emit_shlimm(rs,4,rs);
5333   emit_cmpmem_indexed((int)hash_table,rs,temp);
5334   emit_jne((int)out+14);
5335   emit_readword_indexed((int)hash_table+4,rs,rs);
5336   emit_jmpreg(rs);
5337   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
5338   emit_addimm_no_flags(8,rs);
5339   emit_jeq((int)out-17);
5340   // No hit on hash table, call compiler
5341   emit_pushreg(temp);
5342 //DEBUG >
5343 #ifdef DEBUG_CYCLE_COUNT
5344   emit_readword((int)&last_count,ECX);
5345   emit_add(HOST_CCREG,ECX,HOST_CCREG);
5346   emit_readword((int)&next_interupt,ECX);
5347   emit_writeword(HOST_CCREG,(int)&Count);
5348   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
5349   emit_writeword(ECX,(int)&last_count);
5350 #endif
5351 //DEBUG <
5352   emit_storereg(CCREG,HOST_CCREG);
5353   emit_call((int)get_addr);
5354   emit_loadreg(CCREG,HOST_CCREG);
5355   emit_addimm(ESP,4,ESP);
5356   emit_jmpreg(EAX);*/
5357   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5358   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5359   #endif
5360 }
5361
5362 void cjump_assemble(int i,struct regstat *i_regs)
5363 {
5364   signed char *i_regmap=i_regs->regmap;
5365   int cc;
5366   int match;
5367   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5368   assem_debug("match=%d\n",match);
5369   int s1h,s1l,s2h,s2l;
5370   int prev_cop1_usable=cop1_usable;
5371   int unconditional=0,nop=0;
5372   int only32=0;
5373   int invert=0;
5374   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5375   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5376   if(!match) invert=1;
5377   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5378   if(i>(ba[i]-start)>>2) invert=1;
5379   #endif
5380   
5381   if(ooo[i]) {
5382     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5383     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5384     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5385     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5386   }
5387   else {
5388     s1l=get_reg(i_regmap,rs1[i]);
5389     s1h=get_reg(i_regmap,rs1[i]|64);
5390     s2l=get_reg(i_regmap,rs2[i]);
5391     s2h=get_reg(i_regmap,rs2[i]|64);
5392   }
5393   if(rs1[i]==0&&rs2[i]==0)
5394   {
5395     if(opcode[i]&1) nop=1;
5396     else unconditional=1;
5397     //assert(opcode[i]!=5);
5398     //assert(opcode[i]!=7);
5399     //assert(opcode[i]!=0x15);
5400     //assert(opcode[i]!=0x17);
5401   }
5402   else if(rs1[i]==0)
5403   {
5404     s1l=s2l;s1h=s2h;
5405     s2l=s2h=-1;
5406     only32=(regs[i].was32>>rs2[i])&1;
5407   }
5408   else if(rs2[i]==0)
5409   {
5410     s2l=s2h=-1;
5411     only32=(regs[i].was32>>rs1[i])&1;
5412   }
5413   else {
5414     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5415   }
5416
5417   if(ooo[i]) {
5418     // Out of order execution (delay slot first)
5419     //printf("OOOE\n");
5420     address_generation(i+1,i_regs,regs[i].regmap_entry);
5421     ds_assemble(i+1,i_regs);
5422     int adj;
5423     uint64_t bc_unneeded=branch_regs[i].u;
5424     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5425     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5426     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5427     bc_unneeded|=1;
5428     bc_unneeded_upper|=1;
5429     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5430                   bc_unneeded,bc_unneeded_upper);
5431     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5432     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5433     cc=get_reg(branch_regs[i].regmap,CCREG);
5434     assert(cc==HOST_CCREG);
5435     if(unconditional) 
5436       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5437     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5438     //assem_debug("cycle count (adj)\n");
5439     if(unconditional) {
5440       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5441       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5442         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5443         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5444         if(internal)
5445           assem_debug("branch: internal\n");
5446         else
5447           assem_debug("branch: external\n");
5448         if(internal&&is_ds[(ba[i]-start)>>2]) {
5449           ds_assemble_entry(i);
5450         }
5451         else {
5452           add_to_linker((int)out,ba[i],internal);
5453           emit_jmp(0);
5454         }
5455         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5456         if(((u_int)out)&7) emit_addnop(0);
5457         #endif
5458       }
5459     }
5460     else if(nop) {
5461       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5462       int jaddr=(int)out;
5463       emit_jns(0);
5464       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5465     }
5466     else {
5467       int taken=0,nottaken=0,nottaken1=0;
5468       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5469       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5470       if(!only32)
5471       {
5472         assert(s1h>=0);
5473         if(opcode[i]==4) // BEQ
5474         {
5475           if(s2h>=0) emit_cmp(s1h,s2h);
5476           else emit_test(s1h,s1h);
5477           nottaken1=(int)out;
5478           emit_jne(1);
5479         }
5480         if(opcode[i]==5) // BNE
5481         {
5482           if(s2h>=0) emit_cmp(s1h,s2h);
5483           else emit_test(s1h,s1h);
5484           if(invert) taken=(int)out;
5485           else add_to_linker((int)out,ba[i],internal);
5486           emit_jne(0);
5487         }
5488         if(opcode[i]==6) // BLEZ
5489         {
5490           emit_test(s1h,s1h);
5491           if(invert) taken=(int)out;
5492           else add_to_linker((int)out,ba[i],internal);
5493           emit_js(0);
5494           nottaken1=(int)out;
5495           emit_jne(1);
5496         }
5497         if(opcode[i]==7) // BGTZ
5498         {
5499           emit_test(s1h,s1h);
5500           nottaken1=(int)out;
5501           emit_js(1);
5502           if(invert) taken=(int)out;
5503           else add_to_linker((int)out,ba[i],internal);
5504           emit_jne(0);
5505         }
5506       } // if(!only32)
5507           
5508       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5509       assert(s1l>=0);
5510       if(opcode[i]==4) // BEQ
5511       {
5512         if(s2l>=0) emit_cmp(s1l,s2l);
5513         else emit_test(s1l,s1l);
5514         if(invert){
5515           nottaken=(int)out;
5516           emit_jne(1);
5517         }else{
5518           add_to_linker((int)out,ba[i],internal);
5519           emit_jeq(0);
5520         }
5521       }
5522       if(opcode[i]==5) // BNE
5523       {
5524         if(s2l>=0) emit_cmp(s1l,s2l);
5525         else emit_test(s1l,s1l);
5526         if(invert){
5527           nottaken=(int)out;
5528           emit_jeq(1);
5529         }else{
5530           add_to_linker((int)out,ba[i],internal);
5531           emit_jne(0);
5532         }
5533       }
5534       if(opcode[i]==6) // BLEZ
5535       {
5536         emit_cmpimm(s1l,1);
5537         if(invert){
5538           nottaken=(int)out;
5539           emit_jge(1);
5540         }else{
5541           add_to_linker((int)out,ba[i],internal);
5542           emit_jl(0);
5543         }
5544       }
5545       if(opcode[i]==7) // BGTZ
5546       {
5547         emit_cmpimm(s1l,1);
5548         if(invert){
5549           nottaken=(int)out;
5550           emit_jl(1);
5551         }else{
5552           add_to_linker((int)out,ba[i],internal);
5553           emit_jge(0);
5554         }
5555       }
5556       if(invert) {
5557         if(taken) set_jump_target(taken,(int)out);
5558         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5559         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5560           if(adj) {
5561             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5562             add_to_linker((int)out,ba[i],internal);
5563           }else{
5564             emit_addnop(13);
5565             add_to_linker((int)out,ba[i],internal*2);
5566           }
5567           emit_jmp(0);
5568         }else
5569         #endif
5570         {
5571           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5572           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5573           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5574           if(internal)
5575             assem_debug("branch: internal\n");
5576           else
5577             assem_debug("branch: external\n");
5578           if(internal&&is_ds[(ba[i]-start)>>2]) {
5579             ds_assemble_entry(i);
5580           }
5581           else {
5582             add_to_linker((int)out,ba[i],internal);
5583             emit_jmp(0);
5584           }
5585         }
5586         set_jump_target(nottaken,(int)out);
5587       }
5588
5589       if(nottaken1) set_jump_target(nottaken1,(int)out);
5590       if(adj) {
5591         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5592       }
5593     } // (!unconditional)
5594   } // if(ooo)
5595   else
5596   {
5597     // In-order execution (branch first)
5598     //if(likely[i]) printf("IOL\n");
5599     //else
5600     //printf("IOE\n");
5601     int taken=0,nottaken=0,nottaken1=0;
5602     if(!unconditional&&!nop) {
5603       if(!only32)
5604       {
5605         assert(s1h>=0);
5606         if((opcode[i]&0x2f)==4) // BEQ
5607         {
5608           if(s2h>=0) emit_cmp(s1h,s2h);
5609           else emit_test(s1h,s1h);
5610           nottaken1=(int)out;
5611           emit_jne(2);
5612         }
5613         if((opcode[i]&0x2f)==5) // BNE
5614         {
5615           if(s2h>=0) emit_cmp(s1h,s2h);
5616           else emit_test(s1h,s1h);
5617           taken=(int)out;
5618           emit_jne(1);
5619         }
5620         if((opcode[i]&0x2f)==6) // BLEZ
5621         {
5622           emit_test(s1h,s1h);
5623           taken=(int)out;
5624           emit_js(1);
5625           nottaken1=(int)out;
5626           emit_jne(2);
5627         }
5628         if((opcode[i]&0x2f)==7) // BGTZ
5629         {
5630           emit_test(s1h,s1h);
5631           nottaken1=(int)out;
5632           emit_js(2);
5633           taken=(int)out;
5634           emit_jne(1);
5635         }
5636       } // if(!only32)
5637           
5638       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5639       assert(s1l>=0);
5640       if((opcode[i]&0x2f)==4) // BEQ
5641       {
5642         if(s2l>=0) emit_cmp(s1l,s2l);
5643         else emit_test(s1l,s1l);
5644         nottaken=(int)out;
5645         emit_jne(2);
5646       }
5647       if((opcode[i]&0x2f)==5) // BNE
5648       {
5649         if(s2l>=0) emit_cmp(s1l,s2l);
5650         else emit_test(s1l,s1l);
5651         nottaken=(int)out;
5652         emit_jeq(2);
5653       }
5654       if((opcode[i]&0x2f)==6) // BLEZ
5655       {
5656         emit_cmpimm(s1l,1);
5657         nottaken=(int)out;
5658         emit_jge(2);
5659       }
5660       if((opcode[i]&0x2f)==7) // BGTZ
5661       {
5662         emit_cmpimm(s1l,1);
5663         nottaken=(int)out;
5664         emit_jl(2);
5665       }
5666     } // if(!unconditional)
5667     int adj;
5668     uint64_t ds_unneeded=branch_regs[i].u;
5669     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5670     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5671     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5672     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5673     ds_unneeded|=1;
5674     ds_unneeded_upper|=1;
5675     // branch taken
5676     if(!nop) {
5677       if(taken) set_jump_target(taken,(int)out);
5678       assem_debug("1:\n");
5679       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5680                     ds_unneeded,ds_unneeded_upper);
5681       // load regs
5682       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5683       address_generation(i+1,&branch_regs[i],0);
5684       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5685       ds_assemble(i+1,&branch_regs[i]);
5686       cc=get_reg(branch_regs[i].regmap,CCREG);
5687       if(cc==-1) {
5688         emit_loadreg(CCREG,cc=HOST_CCREG);
5689         // CHECK: Is the following instruction (fall thru) allocated ok?
5690       }
5691       assert(cc==HOST_CCREG);
5692       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5693       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5694       assem_debug("cycle count (adj)\n");
5695       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5696       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5697       if(internal)
5698         assem_debug("branch: internal\n");
5699       else
5700         assem_debug("branch: external\n");
5701       if(internal&&is_ds[(ba[i]-start)>>2]) {
5702         ds_assemble_entry(i);
5703       }
5704       else {
5705         add_to_linker((int)out,ba[i],internal);
5706         emit_jmp(0);
5707       }
5708     }
5709     // branch not taken
5710     cop1_usable=prev_cop1_usable;
5711     if(!unconditional) {
5712       if(nottaken1) set_jump_target(nottaken1,(int)out);
5713       set_jump_target(nottaken,(int)out);
5714       assem_debug("2:\n");
5715       if(!likely[i]) {
5716         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5717                       ds_unneeded,ds_unneeded_upper);
5718         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5719         address_generation(i+1,&branch_regs[i],0);
5720         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5721         ds_assemble(i+1,&branch_regs[i]);
5722       }
5723       cc=get_reg(branch_regs[i].regmap,CCREG);
5724       if(cc==-1&&!likely[i]) {
5725         // Cycle count isn't in a register, temporarily load it then write it out
5726         emit_loadreg(CCREG,HOST_CCREG);
5727         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5728         int jaddr=(int)out;
5729         emit_jns(0);
5730         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5731         emit_storereg(CCREG,HOST_CCREG);
5732       }
5733       else{
5734         cc=get_reg(i_regmap,CCREG);
5735         assert(cc==HOST_CCREG);
5736         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5737         int jaddr=(int)out;
5738         emit_jns(0);
5739         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5740       }
5741     }
5742   }
5743 }
5744
/*
 * sjump_assemble - emit host code for the branch at instruction index i whose
 * condition is selected by opcode2[i] (the comments below name the BLTZ/BGEZ
 * family, including the AL and likely forms), plus its delay slot at i+1.
 *
 *   i       index of the branch inside the block being compiled
 *   i_regs  register state at the branch; its regmap is consulted for rs1 and
 *           CCREG when the branch is emitted before the delay slot
 *
 * Returns nothing; all output goes through the emit_*, add_to_linker and
 * add_stub helpers.
 *
 * Two layouts are generated, chosen by ooo[i]:
 *   ooo[i] != 0  the delay slot is assembled first, then the test and jump
 *   ooo[i] == 0  test and jump first; the delay slot is then assembled on the
 *                taken path and (unless likely[i]) again on the not-taken path
 */
5745 void sjump_assemble(int i,struct regstat *i_regs)
5746 {
5747   signed char *i_regmap=i_regs->regmap;
5748   int cc;
5749   int match;
5750   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5751   assem_debug("smatch=%d\n",match);
5752   int s1h,s1l;
5753   int prev_cop1_usable=cop1_usable;
5754   int unconditional=0,nevertaken=0;
5755   int only32=0;
5756   int invert=0;
5757   int internal=internal_branch(branch_regs[i].is32,ba[i]);
       // A branch whose target is its own index is only logged here.
5758   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
       // invert: emit the opposite condition so that it jumps over the
       // taken-path code (register write-back/reload followed by the jump).
       // Used whenever match_bt() reported a mismatch.
5759   if(!match) invert=1;
5760   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
       // With this hack, branches whose target index is below i are always inverted.
5761   if(i>(ba[i]-start)>>2) invert=1;
5762   #endif
5763
5764   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5765   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5766
       // Find the host registers holding the low (s1l) and upper (s1h, rs1|64)
       // halves of rs1, using the register map that is live where the test
       // is emitted.
5767   if(ooo[i]) {
5768     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5769     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5770   }
5771   else {
5772     s1l=get_reg(i_regmap,rs1[i]);
5773     s1h=get_reg(i_regmap,rs1[i]|64);
5774   }
       // rs1 is register 0: the outcome is fixed at compile time.
       // Odd opcode2 -> always taken, even opcode2 -> never taken.
5775   if(rs1[i]==0)
5776   {
5777     if(opcode2[i]&1) unconditional=1;
5778     else nevertaken=1;
5779     // These are never taken (r0 is never less than zero)
5780     //assert(opcode2[i]!=0);
5781     //assert(opcode2[i]!=2);
5782     //assert(opcode2[i]!=0x10);
5783     //assert(opcode2[i]!=0x12);
5784   }
5785   else {
         // only32: the was32 bit for rs1 is set, so the test below uses the
         // low half (s1l) instead of the upper half (s1h).
5786     only32=(regs[i].was32>>rs1[i])&1;
5787   }
5788
5789   if(ooo[i]) {
5790     // Out of order execution (delay slot first)
5791     //printf("OOOE\n");
5792     address_generation(i+1,i_regs,regs[i].regmap_entry);
5793     ds_assemble(i+1,i_regs);
5794     int adj;
         // bc_unneeded(_upper): branch_regs[i].u/.uu with the bits of the
         // branch's own source registers cleared and bit 0 forced on; handed
         // to wb_invalidate below.
5795     uint64_t bc_unneeded=branch_regs[i].u;
5796     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5797     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5798     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5799     bc_unneeded|=1;
5800     bc_unneeded_upper|=1;
5801     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5802                   bc_unneeded,bc_unneeded_upper);
         // Bring rs1 and the cycle counter into the branch_regs mapping.
5803     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5804     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
         // Register 31 is a destination: write start+i*4+8 into its host
         // register, if it has one in the branch_regs mapping.
5805     if(rt1[i]==31) {
5806       int rt,return_address;
5807       rt=get_reg(branch_regs[i].regmap,31);
5808       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5809       if(rt>=0) {
5810         // Save the PC even if the branch is not taken
5811         return_address=start+i*4+8;
5812         emit_movimm(return_address,rt); // PC into link register
5813         #ifdef IMM_PREFETCH
5814         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5815         #endif
5816       }
5817     }
         // The cycle counter must be in HOST_CCREG from here on.
5818     cc=get_reg(branch_regs[i].regmap,CCREG);
5819     assert(cc==HOST_CCREG);
5820     if(unconditional) 
5821       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5822     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5823     assem_debug("cycle count (adj)\n");
5824     if(unconditional) {
           // Always taken: cycle check, then a direct jump to the target.
5825       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
           // Nothing more is emitted when the branch targets itself and the
           // delay slot word is 0 (the "idle loop" case logged above).
           // NOTE(review): presumably do_cc already emitted everything needed
           // for that case - confirm against do_cc.
5826       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5827         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5828         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5829         if(internal)
5830           assem_debug("branch: internal\n");
5831         else
5832           assem_debug("branch: external\n");
             // If the target is itself flagged in is_ds[], go through
             // ds_assemble_entry instead of a linked jump.
5833         if(internal&&is_ds[(ba[i]-start)>>2]) {
5834           ds_assemble_entry(i);
5835         }
5836         else {
5837           add_to_linker((int)out,ba[i],internal);
5838           emit_jmp(0);
5839         }
5840         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5841         if(((u_int)out)&7) emit_addnop(0);
5842         #endif
5843       }
5844     }
5845     else if(nevertaken) {
           // Never taken: only add the cycle count and emit the CC_STUB check.
5846       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5847       int jaddr=(int)out;
5848       emit_jns(0);
5849       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5850     }
5851     else {
           // Conditional branch. nottaken records the location of the inverted
           // jump so it can be patched once the taken-path code is emitted.
5852       int nottaken=0;
5853       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
           // When not inverting, cc gets CLOCK_DIVIDER*(ccadj[i]+2-adj) added
           // before the jump and CLOCK_DIVIDER*adj added after it on the
           // fall-through path (see the end of this block).
5854       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5855       if(!only32)
5856       {
             // Test the upper half of rs1.
5857         assert(s1h>=0);
5858         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5859         {
5860           emit_test(s1h,s1h);
5861           if(invert){
5862             nottaken=(int)out;
5863             emit_jns(1);
5864           }else{
5865             add_to_linker((int)out,ba[i],internal);
5866             emit_js(0);
5867           }
5868         }
5869         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5870         {
5871           emit_test(s1h,s1h);
5872           if(invert){
5873             nottaken=(int)out;
5874             emit_js(1);
5875           }else{
5876             add_to_linker((int)out,ba[i],internal);
5877             emit_jns(0);
5878           }
5879         }
5880       } // if(!only32)
5881       else
5882       {
             // rs1 was 32-bit: test the low half.
5883         assert(s1l>=0);
5884         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5885         {
5886           emit_test(s1l,s1l);
5887           if(invert){
5888             nottaken=(int)out;
5889             emit_jns(1);
5890           }else{
5891             add_to_linker((int)out,ba[i],internal);
5892             emit_js(0);
5893           }
5894         }
5895         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5896         {
5897           emit_test(s1l,s1l);
5898           if(invert){
5899             nottaken=(int)out;
5900             emit_js(1);
5901           }else{
5902             add_to_linker((int)out,ba[i],internal);
5903             emit_jns(0);
5904           }
5905         }
5906       } // else (only32)
5907           
           // Inverted form: the code below is the taken path, reached by
           // falling through the inverted jump emitted above.
5908       if(invert) {
5909         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5910         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5911           if(adj) {
5912             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5913             add_to_linker((int)out,ba[i],internal);
5914           }else{
5915             emit_addnop(13);
5916             add_to_linker((int)out,ba[i],internal*2);
5917           }
5918           emit_jmp(0);
5919         }else
5920         #endif
5921         {
5922           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5923           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5924           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5925           if(internal)
5926             assem_debug("branch: internal\n");
5927           else
5928             assem_debug("branch: external\n");
5929           if(internal&&is_ds[(ba[i]-start)>>2]) {
5930             ds_assemble_entry(i);
5931           }
5932           else {
5933             add_to_linker((int)out,ba[i],internal);
5934             emit_jmp(0);
5935           }
5936         }
             // Patch the inverted jump to land here (not-taken path).
5937         set_jump_target(nottaken,(int)out);
5938       }
5939
5940       if(adj) {
5941         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5942       }
5943     } // else (conditional branch)
5944   } // if(ooo)
5945   else
5946   {
5947     // In-order execution (branch first)
5948     //printf("IOE\n");
5949     int nottaken=0;
         // Register 31 is a destination: write start+i*4+8 into its host
         // register, if it has one in the branch_regs mapping.
5950     if(rt1[i]==31) {
5951       int rt,return_address;
5952       rt=get_reg(branch_regs[i].regmap,31);
5953       if(rt>=0) {
5954         // Save the PC even if the branch is not taken
5955         return_address=start+i*4+8;
5956         emit_movimm(return_address,rt); // PC into link register
5957         #ifdef IMM_PREFETCH
5958         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5959         #endif
5960       }
5961     }
         // Emit the opposite condition as a jump to the not-taken path; its
         // location is kept in nottaken and patched further down.
5962     if(!unconditional) {
5963       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5964       if(!only32)
5965       {
5966         assert(s1h>=0);
5967         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5968         {
5969           emit_test(s1h,s1h);
5970           nottaken=(int)out;
5971           emit_jns(1);
5972         }
5973         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5974         {
5975           emit_test(s1h,s1h);
5976           nottaken=(int)out;
5977           emit_js(1);
5978         }
5979       } // if(!only32)
5980       else
5981       {
5982         assert(s1l>=0);
5983         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5984         {
5985           emit_test(s1l,s1l);
5986           nottaken=(int)out;
5987           emit_jns(1);
5988         }
5989         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5990         {
5991           emit_test(s1l,s1l);
5992           nottaken=(int)out;
5993           emit_js(1);
5994         }
5995       }
5996     } // if(!unconditional)
5997     int adj;
         // ds_unneeded(_upper): branch_regs[i].u/.uu with the bits of the
         // delay slot's source registers cleared (and, if the upper half of
         // the delay slot's rt1 is needed, dep1/dep2 as well); bit 0 forced on.
5998     uint64_t ds_unneeded=branch_regs[i].u;
5999     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6000     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6001     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6002     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6003     ds_unneeded|=1;
6004     ds_unneeded_upper|=1;
6005     // branch taken
6006     if(!nevertaken) {
6007       //assem_debug("1:\n");
6008       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6009                     ds_unneeded,ds_unneeded_upper);
6010       // load regs
6011       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6012       address_generation(i+1,&branch_regs[i],0);
6013       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
           // Delay slot, taken-path copy.
6014       ds_assemble(i+1,&branch_regs[i]);
6015       cc=get_reg(branch_regs[i].regmap,CCREG);
6016       if(cc==-1) {
6017         emit_loadreg(CCREG,cc=HOST_CCREG);
6018         // CHECK: Is the following instruction (fall thru) allocated ok?
6019       }
6020       assert(cc==HOST_CCREG);
6021       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6022       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6023       assem_debug("cycle count (adj)\n");
6024       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6025       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6026       if(internal)
6027         assem_debug("branch: internal\n");
6028       else
6029         assem_debug("branch: external\n");
6030       if(internal&&is_ds[(ba[i]-start)>>2]) {
6031         ds_assemble_entry(i);
6032       }
6033       else {
6034         add_to_linker((int)out,ba[i],internal);
6035         emit_jmp(0);
6036       }
6037     }
6038     // branch not taken
         // Put cop1_usable back to its value on entry before assembling the
         // not-taken path.
6039     cop1_usable=prev_cop1_usable;
6040     if(!unconditional) {
           // The conditional jump emitted at the top lands here.
6041       set_jump_target(nottaken,(int)out);
6042       assem_debug("1:\n");
           // Non-likely branches get a second copy of the delay slot on the
           // not-taken path; likely[i] branches skip it.
6043       if(!likely[i]) {
6044         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6045                       ds_unneeded,ds_unneeded_upper);
6046         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6047         address_generation(i+1,&branch_regs[i],0);
6048         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6049         ds_assemble(i+1,&branch_regs[i]);
6050       }
6051       cc=get_reg(branch_regs[i].regmap,CCREG);
6052       if(cc==-1&&!likely[i]) {
6053         // Cycle count isn't in a register, temporarily load it then write it out
6054         emit_loadreg(CCREG,HOST_CCREG);
6055         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6056         int jaddr=(int)out;
6057         emit_jns(0);
6058         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6059         emit_storereg(CCREG,HOST_CCREG);
6060       }
6061       else{
             // Otherwise the counter is looked up in the pre-branch map and
             // must be in HOST_CCREG; likely branches use the NULLDS stub
             // type, others NOTTAKEN.
6062         cc=get_reg(i_regmap,CCREG);
6063         assert(cc==HOST_CCREG);
6064         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
6065         int jaddr=(int)out;
6066         emit_jns(0);
6067         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6068       }
6069     }
6070   }
6071 }
6072
/*
 * fjump_assemble - emit host code for the COP1 condition branch at index i
 * (BC1T when source[i]&0x10000 is set, BC1F otherwise, per the comments
 * below) and its delay slot at i+1.
 *
 *   i       index of the branch inside the block being compiled
 *   i_regs  register state at the branch; its address is also passed to the
 *           FP_STUB
 *
 * Returns nothing; all output goes through the emit_*, add_to_linker and
 * add_stub helpers.
 *
 * The branch condition is bit 0x800000 of the host register mapped to FSREG.
 * If cop1_usable is clear on entry, a test of bit 0x20000000 in the host
 * register mapped to CSREG is emitted first, with an FP_STUB attached, and
 * cop1_usable is set to 1.
 *
 * Layout is chosen by ooo[i]: delay slot first when set, branch first (with
 * the delay slot assembled on both paths, unless likely[i]) when clear.
 */
6073 void fjump_assemble(int i,struct regstat *i_regs)
6074 {
6075   signed char *i_regmap=i_regs->regmap;
6076   int cc;
6077   int match;
6078   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6079   assem_debug("fmatch=%d\n",match);
6080   int fs,cs;
6081   int eaddr;
6082   int invert=0;
6083   int internal=internal_branch(branch_regs[i].is32,ba[i]);
6084   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
       // invert: emit the opposite condition so that it jumps over the
       // taken-path code; used whenever match_bt() reported a mismatch.
6085   if(!match) invert=1;
6086   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6087   if(i>(ba[i]-start)>>2) invert=1;
6088   #endif
6089
       // Locate the host register holding FSREG in the map that is live where
       // the test is emitted. In the out-of-order case the delay slot's
       // address generation is done here, before the cop1 check.
6090   if(ooo[i]) {
6091     fs=get_reg(branch_regs[i].regmap,FSREG);
6092     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
6093   }
6094   else {
6095     fs=get_reg(i_regmap,FSREG);
6096   }
6097
6098   // Check cop1 unusable
6099   if(!cop1_usable) {
6100     cs=get_reg(i_regmap,CSREG);
6101     assert(cs>=0);
         // Test bit 0x20000000 of the CSREG host register; the emit_jeq is
         // attached to an FP_STUB (presumably taken when the bit is clear -
         // confirm against emit_jeq).
6102     emit_testimm(cs,0x20000000);
6103     eaddr=(int)out;
6104     emit_jeq(0);
6105     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
6106     cop1_usable=1;
6107   }
6108
6109   if(ooo[i]) {
6110     // Out of order execution (delay slot first)
6111     //printf("OOOE\n");
6112     ds_assemble(i+1,i_regs);
6113     int adj;
         // bc_unneeded(_upper): branch_regs[i].u/.uu with the bits of the
         // branch's own source registers cleared and bit 0 forced on; handed
         // to wb_invalidate below.
6114     uint64_t bc_unneeded=branch_regs[i].u;
6115     uint64_t bc_unneeded_upper=branch_regs[i].uu;
6116     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6117     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
6118     bc_unneeded|=1;
6119     bc_unneeded_upper|=1;
6120     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6121                   bc_unneeded,bc_unneeded_upper);
6122     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
6123     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
         // The cycle counter must be in HOST_CCREG from here on.
6124     cc=get_reg(branch_regs[i].regmap,CCREG);
6125     assert(cc==HOST_CCREG);
6126     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
6127     assem_debug("cycle count (adj)\n");
6128     if(1) {
           // nottaken records the location of the inverted jump so it can be
           // patched once the taken-path code has been emitted.
6129       int nottaken=0;
           // When not inverting, cc gets CLOCK_DIVIDER*(ccadj[i]+2-adj) added
           // before the jump and CLOCK_DIVIDER*adj added after it on the
           // fall-through path (see the end of this block).
6130       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6131       if(1) {
6132         assert(fs>=0);
6133         emit_testimm(fs,0x800000);
6134         if(source[i]&0x10000) // BC1T
6135         {
6136           if(invert){
6137             nottaken=(int)out;
6138             emit_jeq(1);
6139           }else{
6140             add_to_linker((int)out,ba[i],internal);
6141             emit_jne(0);
6142           }
6143         }
6144         else // BC1F
6145           if(invert){
6146             nottaken=(int)out;
6147             emit_jne(1);
6148           }else{
6149             add_to_linker((int)out,ba[i],internal);
6150             emit_jeq(0);
6151           }
             // NOTE(review): the braces below form an empty block; the else
             // above binds directly to the if/else statement, so behaviour is
             // as intended but the braces look misplaced.
6152         {
6153         }
6154       } // if(1)
6155           
           // Inverted form: the code below is the taken path, reached by
           // falling through the inverted jump emitted above.
6156       if(invert) {
6157         if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
6158         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6159         else if(match) emit_addnop(13);
6160         #endif
6161         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6162         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6163         if(internal)
6164           assem_debug("branch: internal\n");
6165         else
6166           assem_debug("branch: external\n");
             // If the target is itself flagged in is_ds[], go through
             // ds_assemble_entry instead of a linked jump.
6167         if(internal&&is_ds[(ba[i]-start)>>2]) {
6168           ds_assemble_entry(i);
6169         }
6170         else {
6171           add_to_linker((int)out,ba[i],internal);
6172           emit_jmp(0);
6173         }
             // Patch the inverted jump to land here (not-taken path).
6174         set_jump_target(nottaken,(int)out);
6175       }
6176
6177       if(adj) {
6178         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
6179       }
6180     } // if(1)
6181   } // if(ooo)
6182   else
6183   {
6184     // In-order execution (branch first)
6185     //printf("IOE\n");
6186     int nottaken=0;
         // Emit the opposite condition as a jump to the not-taken path; its
         // location is kept in nottaken and patched further down.
6187     if(1) {
6188       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6189       if(1) {
6190         assert(fs>=0);
6191         emit_testimm(fs,0x800000);
6192         if(source[i]&0x10000) // BC1T
6193         {
6194           nottaken=(int)out;
6195           emit_jeq(1);
6196         }
6197         else // BC1F
6198         {
6199           nottaken=(int)out;
6200           emit_jne(1);
6201         }
6202       }
6203     } // if(1)
6204     int adj;
         // ds_unneeded(_upper): branch_regs[i].u/.uu with the bits of the
         // delay slot's source registers cleared (and, if the upper half of
         // the delay slot's rt1 is needed, dep1/dep2 as well); bit 0 forced on.
6205     uint64_t ds_unneeded=branch_regs[i].u;
6206     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6207     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6208     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6209     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6210     ds_unneeded|=1;
6211     ds_unneeded_upper|=1;
6212     // branch taken
6213     //assem_debug("1:\n");
6214     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6215                   ds_unneeded,ds_unneeded_upper);
6216     // load regs
6217     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6218     address_generation(i+1,&branch_regs[i],0);
6219     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
         // Delay slot, taken-path copy.
6220     ds_assemble(i+1,&branch_regs[i]);
6221     cc=get_reg(branch_regs[i].regmap,CCREG);
6222     if(cc==-1) {
6223       emit_loadreg(CCREG,cc=HOST_CCREG);
6224       // CHECK: Is the following instruction (fall thru) allocated ok?
6225     }
6226     assert(cc==HOST_CCREG);
6227     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6228     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6229     assem_debug("cycle count (adj)\n");
6230     if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6231     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6232     if(internal)
6233       assem_debug("branch: internal\n");
6234     else
6235       assem_debug("branch: external\n");
6236     if(internal&&is_ds[(ba[i]-start)>>2]) {
6237       ds_assemble_entry(i);
6238     }
6239     else {
6240       add_to_linker((int)out,ba[i],internal);
6241       emit_jmp(0);
6242     }
6243
6244     // branch not taken
6245     if(1) { // <- FIXME (don't need this)
           // The conditional jump emitted at the top lands here.
6246       set_jump_target(nottaken,(int)out);
6247       assem_debug("1:\n");
           // Non-likely branches get a second copy of the delay slot on the
           // not-taken path; likely[i] branches skip it.
6248       if(!likely[i]) {
6249         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6250                       ds_unneeded,ds_unneeded_upper);
6251         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6252         address_generation(i+1,&branch_regs[i],0);
6253         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6254         ds_assemble(i+1,&branch_regs[i]);
6255       }
6256       cc=get_reg(branch_regs[i].regmap,CCREG);
6257       if(cc==-1&&!likely[i]) {
6258         // Cycle count isn't in a register, temporarily load it then write it out
6259         emit_loadreg(CCREG,HOST_CCREG);
6260         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6261         int jaddr=(int)out;
6262         emit_jns(0);
6263         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6264         emit_storereg(CCREG,HOST_CCREG);
6265       }
6266       else{
             // Otherwise the counter is looked up in the pre-branch map and
             // must be in HOST_CCREG; likely branches use the NULLDS stub
             // type, others NOTTAKEN.
6267         cc=get_reg(i_regmap,CCREG);
6268         assert(cc==HOST_CCREG);
6269         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
6270         int jaddr=(int)out;
6271         emit_jns(0);
6272         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6273       }
6274     }
6275   }
6276 }
6277
/*
 * pagespan_assemble - emit host code for the branch/jump at index i without
 * assembling its delay slot here (pagespan_ds() assembles the delay slot).
 *
 *   i       index of the branch inside the block being compiled
 *   i_regs  register state at the branch
 *
 * Returns nothing. The emitted code
 *   1. adds CLOCK_DIVIDER*(ccadj[i]+2) to HOST_CCREG,
 *   2. evaluates the branch: for the non-likely conditional forms the
 *      resulting address (ba[i] or start+i*4+8) is selected into a scratch
 *      host register with conditional moves; the likely forms emit a
 *      conditional jump to a separate not-taken exit instead,
 *   3. writes back dirty registers (wb_dirtys) and places the target address
 *      in HOST_BTREG,
 *   4. jumps to whatever check_addr() returns for address start+i*4+5, or to
 *      the emit_extjump_ds stub when check_addr() returns NULL.
 * For likely[i] branches a second exit, to start+i*4+8, is emitted for the
 * not-taken path.
 */
6278 static void pagespan_assemble(int i,struct regstat *i_regs)
6279 {
       // Host registers holding the low/upper (|64) halves of rs1 and rs2.
6280   int s1l=get_reg(i_regs->regmap,rs1[i]);
6281   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
6282   int s2l=get_reg(i_regs->regmap,rs2[i]);
6283   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
6284   void *nt_branch=NULL;
6285   int taken=0;
6286   int nottaken=0;
6287   int unconditional=0;
       // If one operand is register 0, keep the other one in s1 and set s2 to
       // -1; the compare code below then uses emit_test(s1,s1) instead of
       // emit_cmp(s1,s2).
6288   if(rs1[i]==0)
6289   {
6290     s1l=s2l;s1h=s2h;
6291     s2l=s2h=-1;
6292   }
6293   else if(rs2[i]==0)
6294   {
6295     s2l=s2h=-1;
6296   }
       // Both operands flagged in is32: skip the upper-half comparison.
6297   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
6298     s1h=s2h=-1;
6299   }
       // Pick scratch host registers that do not hold rs1/rs2 and are neither
       // EXCLUDE_REG nor HOST_CCREG:
       //   addr   - HOST_BTREG if it is unmapped, otherwise the first candidate
       //   alt    - the next candidate (never HOST_BTREG)
       //   ntaddr - a third one, searched only for BLEZ/BGTZ
       // NOTE(review): addr/alt/ntaddr have no initializer; correctness relies
       // on the assert after the searches.
6300   int hr=0;
6301   int addr,alt,ntaddr;
6302   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
6303   else {
6304     while(hr<HOST_REGS)
6305     {
6306       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
6307          (i_regs->regmap[hr]&63)!=rs1[i] &&
6308          (i_regs->regmap[hr]&63)!=rs2[i] )
6309       {
6310         addr=hr++;break;
6311       }
6312       hr++;
6313     }
6314   }
6315   while(hr<HOST_REGS)
6316   {
6317     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6318        (i_regs->regmap[hr]&63)!=rs1[i] &&
6319        (i_regs->regmap[hr]&63)!=rs2[i] )
6320     {
6321       alt=hr++;break;
6322     }
6323     hr++;
6324   }
6325   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
6326   {
6327     while(hr<HOST_REGS)
6328     {
6329       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6330          (i_regs->regmap[hr]&63)!=rs1[i] &&
6331          (i_regs->regmap[hr]&63)!=rs2[i] )
6332       {
6333         ntaddr=hr;break;
6334       }
6335       hr++;
6336     }
6337   }
       // NOTE(review): hr is post-incremented when alt is chosen, so for
       // opcodes other than BLEZ/BGTZ this also fires when alt was found in the
       // last host register (HOST_REGS-1) - confirm that is intended.
6338   assert(hr<HOST_REGS);
6339   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6340     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6341   }
       // Account for the cycles up to and including this branch.
6342   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6343   if(opcode[i]==2) // J
6344   {
6345     unconditional=1;
6346   }
6347   if(opcode[i]==3) // JAL
6348   {
6349     // TODO: mini_ht
         // Return address goes to the host register mapped to 31.
         // NOTE(review): rt is not checked for -1 here - confirm register 31
         // is always allocated for JAL on this path.
6350     int rt=get_reg(i_regs->regmap,31);
6351     emit_movimm(start+i*4+8,rt);
6352     unconditional=1;
6353   }
6354   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6355   {
         // Register target: copy rs1 into addr (moved to HOST_BTREG later).
6356     emit_mov(s1l,addr);
6357     if(opcode2[i]==9) // JALR
6358     {
           // NOTE(review): as for JAL, rt is not checked for -1 here.
6359       int rt=get_reg(i_regs->regmap,rt1[i]);
6360       emit_movimm(start+i*4+8,rt);
6361     }
6362   }
6363   if((opcode[i]&0x3f)==4) // BEQ
6364   {
         // Comparing a register with itself: always taken.
6365     if(rs1[i]==rs2[i])
6366     {
6367       unconditional=1;
6368     }
6369     else
6370     #ifdef HAVE_CMOV_IMM
6371     if(s1h<0) {
6372       if(s2l>=0) emit_cmp(s1l,s2l);
6373       else emit_test(s1l,s1l);
6374       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6375     }
6376     else
6377     #endif
6378     {
           // addr=ba[i] (taken), alt=start+i*4+8 (not taken); addr is
           // replaced by alt if either half compares not-equal.
6379       assert(s1l>=0);
6380       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6381       if(s1h>=0) {
6382         if(s2h>=0) emit_cmp(s1h,s2h);
6383         else emit_test(s1h,s1h);
6384         emit_cmovne_reg(alt,addr);
6385       }
6386       if(s2l>=0) emit_cmp(s1l,s2l);
6387       else emit_test(s1l,s1l);
6388       emit_cmovne_reg(alt,addr);
6389     }
6390   }
6391   if((opcode[i]&0x3f)==5) // BNE
6392   {
6393     #ifdef HAVE_CMOV_IMM
6394     if(s1h<0) {
6395       if(s2l>=0) emit_cmp(s1l,s2l);
6396       else emit_test(s1l,s1l);
6397       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6398     }
6399     else
6400     #endif
6401     {
           // addr=start+i*4+8 (not taken), alt=ba[i] (taken); addr is
           // replaced by alt if either half compares not-equal.
6402       assert(s1l>=0);
6403       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6404       if(s1h>=0) {
6405         if(s2h>=0) emit_cmp(s1h,s2h);
6406         else emit_test(s1h,s1h);
6407         emit_cmovne_reg(alt,addr);
6408       }
6409       if(s2l>=0) emit_cmp(s1l,s2l);
6410       else emit_test(s1l,s1l);
6411       emit_cmovne_reg(alt,addr);
6412     }
6413   }
6414   if((opcode[i]&0x3f)==0x14) // BEQL
6415   {
6416     if(s1h>=0) {
6417       if(s2h>=0) emit_cmp(s1h,s2h);
6418       else emit_test(s1h,s1h);
6419       nottaken=(int)out;
6420       emit_jne(0);
6421     }
6422     if(s2l>=0) emit_cmp(s1l,s2l);
6423     else emit_test(s1l,s1l);
         // The upper-half jne (if any) is pointed at the low-half jne emitted
         // next, so only one not-taken jump is left to patch later.
6424     if(nottaken) set_jump_target(nottaken,(int)out);
6425     nottaken=(int)out;
6426     emit_jne(0);
6427   }
6428   if((opcode[i]&0x3f)==0x15) // BNEL
6429   {
6430     if(s1h>=0) {
6431       if(s2h>=0) emit_cmp(s1h,s2h);
6432       else emit_test(s1h,s1h);
6433       taken=(int)out;
6434       emit_jne(0);
6435     }
6436     if(s2l>=0) emit_cmp(s1l,s2l);
6437     else emit_test(s1l,s1l);
6438     nottaken=(int)out;
6439     emit_jeq(0);
         // The upper-half jne (taken) lands just past the low-half jeq.
6440     if(taken) set_jump_target(taken,(int)out);
6441   }
6442   if((opcode[i]&0x3f)==6) // BLEZ
6443   {
         // alt=ba[i] (taken), addr=start+i*4+8 (not taken); ntaddr keeps a
         // copy of the not-taken address while the upper half is examined.
6444     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6445     emit_cmpimm(s1l,1);
6446     if(s1h>=0) emit_mov(addr,ntaddr);
6447     emit_cmovl_reg(alt,addr);
6448     if(s1h>=0) {
6449       emit_test(s1h,s1h);
6450       emit_cmovne_reg(ntaddr,addr);
6451       emit_cmovs_reg(alt,addr);
6452     }
6453   }
6454   if((opcode[i]&0x3f)==7) // BGTZ
6455   {
         // addr=ba[i] (taken), ntaddr=start+i*4+8 (not taken); alt keeps a
         // copy of the taken address while the upper half is examined.
6456     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6457     emit_cmpimm(s1l,1);
6458     if(s1h>=0) emit_mov(addr,alt);
6459     emit_cmovl_reg(ntaddr,addr);
6460     if(s1h>=0) {
6461       emit_test(s1h,s1h);
6462       emit_cmovne_reg(alt,addr);
6463       emit_cmovs_reg(ntaddr,addr);
6464     }
6465   }
       // BLEZL, BGTZL and opcode 1 (BLTZ/BGEZ) are not handled here: each of
       // the asserts below fails if such an opcode reaches this function.
6466   if((opcode[i]&0x3f)==0x16) // BLEZL
6467   {
6468     assert((opcode[i]&0x3f)!=0x16);
6469   }
6470   if((opcode[i]&0x3f)==0x17) // BGTZL
6471   {
6472     assert((opcode[i]&0x3f)!=0x17);
6473   }
6474   assert(opcode[i]!=1); // BLTZ/BGEZ
6475
6476   //FIXME: Check CSREG
       // COP1 condition branches: bit 0x800000 of the register in s1l is the
       // condition; source[i]&0x30000 selects BC1F/BC1T/BC1FL/BC1TL.
6477   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6478     if((source[i]&0x30000)==0) // BC1F
6479     {
6480       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6481       emit_testimm(s1l,0x800000);
6482       emit_cmovne_reg(alt,addr);
6483     }
6484     if((source[i]&0x30000)==0x10000) // BC1T
6485     {
6486       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6487       emit_testimm(s1l,0x800000);
6488       emit_cmovne_reg(alt,addr);
6489     }
6490     if((source[i]&0x30000)==0x20000) // BC1FL
6491     {
6492       emit_testimm(s1l,0x800000);
6493       nottaken=(int)out;
6494       emit_jne(0);
6495     }
6496     if((source[i]&0x30000)==0x30000) // BC1TL
6497     {
6498       emit_testimm(s1l,0x800000);
6499       nottaken=(int)out;
6500       emit_jeq(0);
6501     }
6502   }
6503
       // Taken / common exit: write back dirty registers and leave the branch
       // target in HOST_BTREG.
6504   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6505   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6506   if(likely[i]||unconditional)
6507   {
6508     emit_movimm(ba[i],HOST_BTREG);
6509   }
6510   else if(addr!=HOST_BTREG)
6511   {
6512     emit_mov(addr,HOST_BTREG);
6513   }
6514   void *branch_addr=out;
6515   emit_jmp(0);
       // NOTE(review): start+i*4+4 would be the delay slot's address; the
       // extra +1 appears to match the start+1 address that pagespan_ds()
       // registers its entry under - confirm.
6516   int target_addr=start+i*4+5;
6517   void *stub=out;
6518   void *compiled_target_addr=check_addr(target_addr);
6519   emit_extjump_ds((int)branch_addr,target_addr);
       // Jump straight to the target if check_addr() found one (and record
       // the link), otherwise jump to the stub just emitted.
6520   if(compiled_target_addr) {
6521     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6522     add_link(target_addr,stub);
6523   }
6524   else set_jump_target((int)branch_addr,(int)stub);
6525   if(likely[i]) {
6526     // Not-taken path
         // Same exit sequence as above, but targeting start+i*4+8. The locals
         // below shadow the outer variables of the same names.
6527     set_jump_target((int)nottaken,(int)out);
6528     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6529     void *branch_addr=out;
6530     emit_jmp(0);
6531     int target_addr=start+i*4+8;
6532     void *stub=out;
6533     void *compiled_target_addr=check_addr(target_addr);
6534     emit_extjump_ds((int)branch_addr,target_addr);
6535     if(compiled_target_addr) {
6536       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6537       add_link(target_addr,stub);
6538     }
6539     else set_jump_target((int)branch_addr,(int)stub);
6540   }
6541 }
6542
6543 // Assemble the delay slot for the above
     // Emits the code for instruction 0 of the current block, treated as the
     // delay slot of a branch that was assembled elsewhere.  The entry point is
     // added to the jump_dirty and jump_in lists under the address start+1.
     // After the slot has been emitted, the saved branch target is compared
     // with start+4: on a match execution continues into the rest of this
     // block, otherwise it leaves through jump_vaddr_reg[].
6544 static void pagespan_ds()
6545 {
6546   assem_debug("initial delay slot:\n");
       // The entry is keyed on start+1, not start.
       // NOTE(review): presumably the odd address distinguishes this
       // delay-slot entry from a normal entry at start -- confirm against the
       // code that looks these lists up.
6547   u_int vaddr=start+1;
6548   u_int page=get_page(vaddr);
6549   u_int vpage=get_vpage(vaddr);
       // The output position is recorded twice: in jump_dirty before
       // do_dirty_stub_ds() emits its code, and in jump_in after it.
6550   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6551   do_dirty_stub_ds();
6552   ll_add(jump_in+page,vaddr,(void *)out);
       // CCREG must be mapped to HOST_CCREG on entry.  If the mapping used
       // while the slot executes does not keep it there, wb_register() is
       // called for it first (presumably a write-back -- confirm).
6553   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6554   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6555     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
       // Likewise, if HOST_BTREG will not keep BTREG, save the host register
       // to the branch_target variable; it is read back after the switch.
6556   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6557     emit_writeword(HOST_BTREG,(int)&branch_target);
6558   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6559   address_generation(0,&regs[0],regs[0].regmap_entry);
       // Stores additionally load INVCP.  The 0x3b mask ignores opcode bit 2,
       // so the last two tests match opcodes 0x39/0x3d and 0x3a/0x3e
       // (SWC1/SDC1 and SWC2/SDC2 in the MIPS encoding).
6560   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6561     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6562   cop1_usable=0;
6563   is_delayslot=0;
       // Dispatch to the per-type assembler for instruction 0.  All jump-like
       // types share the final body, which only prints a diagnostic.
6564   switch(itype[0]) {
6565     case ALU:
6566       alu_assemble(0,&regs[0]);break;
6567     case IMM16:
6568       imm16_assemble(0,&regs[0]);break;
6569     case SHIFT:
6570       shift_assemble(0,&regs[0]);break;
6571     case SHIFTIMM:
6572       shiftimm_assemble(0,&regs[0]);break;
6573     case LOAD:
6574       load_assemble(0,&regs[0]);break;
6575     case LOADLR:
6576       loadlr_assemble(0,&regs[0]);break;
6577     case STORE:
6578       store_assemble(0,&regs[0]);break;
6579     case STORELR:
6580       storelr_assemble(0,&regs[0]);break;
6581     case COP0:
6582       cop0_assemble(0,&regs[0]);break;
6583     case COP1:
6584       cop1_assemble(0,&regs[0]);break;
6585     case C1LS:
6586       c1ls_assemble(0,&regs[0]);break;
6587     case COP2:
6588       cop2_assemble(0,&regs[0]);break;
6589     case C2LS:
6590       c2ls_assemble(0,&regs[0]);break;
6591     case C2OP:
6592       c2op_assemble(0,&regs[0]);break;
6593     case FCONV:
6594       fconv_assemble(0,&regs[0]);break;
6595     case FLOAT:
6596       float_assemble(0,&regs[0]);break;
6597     case FCOMP:
6598       fcomp_assemble(0,&regs[0]);break;
6599     case MULTDIV:
6600       multdiv_assemble(0,&regs[0]);break;
6601     case MOV:
6602       mov_assemble(0,&regs[0]);break;
6603     case SYSCALL:
6604     case HLECALL:
6605     case INTCALL:
6606     case SPAN:
6607     case UJUMP:
6608     case RJUMP:
6609     case CJUMP:
6610     case SJUMP:
6611     case FJUMP:
6612       printf("Jump in the delay slot.  This is probably a bug.\n");
6613   }
       // Find the host register that holds BTREG.  If there is none, pick a
       // host register via get_reg(...,-1) (presumably an unmapped one) and
       // read the target back from branch_target.
6614   int btaddr=get_reg(regs[0].regmap,BTREG);
6615   if(btaddr<0) {
6616     btaddr=get_reg(regs[0].regmap,-1);
6617     emit_readword((int)&branch_target,btaddr);
6618   }
6619   assert(btaddr!=HOST_CCREG);
6620   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
       // Compare the branch target with start+4.  HOST_IMM8 builds load the
       // constant into HOST_TEMPREG instead of using an immediate compare.
6621 #ifdef HOST_IMM8
6622   emit_movimm(start+4,HOST_TEMPREG);
6623   emit_cmp(btaddr,HOST_TEMPREG);
6624 #else
6625   emit_cmpimm(btaddr,start+4);
6626 #endif
       // Remember where the jeq is so its destination can be set below.
6627   int branch=(int)out;
6628   emit_jeq(0);
       // Target differs from start+4: store registers (address argument -1)
       // and leave through the jump_vaddr_reg[] entry for btaddr.
6629   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6630   emit_jmp(jump_vaddr_reg[btaddr]);
       // Target equals start+4: the jeq lands here and the register state is
       // stored/loaded for entry at start+4.
6631   set_jump_target(branch,(int)out);
6632   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6633   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6634 }
6635
6636 // Basic liveness analysis for MIPS registers
     // Walks instructions iend down to istart and maintains two bitmasks:
     //   u  - bit n set: the value of register n is not needed from here on
     //   uu - the same for what unneeded_reg_upper[] tracks (by its name, the
     //        upper half of the register); it uses us1/us2/dep1/dep2 rather
     //        than rs1/rs2 as the "read" sets
     // The masks after processing instruction i are saved in unneeded_reg[i]
     // and unneeded_reg_upper[i]; for branches the masks that apply at the
     // branch itself are saved in branch_unneeded_reg[i] and
     // branch_unneeded_reg_upper[i].  Bit 0 (r0) is always set.  As a side
     // effect bt[] is set for return addresses and internal branch targets.
     // r is the recursion depth used for backward branches (see below).
6637 void unneeded_registers(int istart,int iend,int r)
6638 {
6639   int i;
6640   uint64_t u,uu,b,bu;
6641   uint64_t temp_u,temp_uu;
6642   uint64_t tdep;
       // Both branches end with u=uu=1 (everything except r0 is needed); the
       // two loads in the else branch are overwritten immediately.
6643   if(iend==slen-1) {
6644     u=1;uu=1;
6645   }else{
6646     u=unneeded_reg[iend+1];
6647     uu=unneeded_reg_upper[iend+1];
6648     u=1;uu=1;
6649   }
6650   for (i=iend;i>=istart;i--)
6651   {
6652     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6653     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6654     {
6655       // If subroutine call, flag return address as a possible branch target
6656       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6657       
6658       if(ba[i]<start || ba[i]>=(start+slen*4))
6659       {
6660         // Branch out of this block, flush all regs
6661         u=1;
6662         uu=1;
6663         /* Hexagon hack 
6664         if(itype[i]==UJUMP&&rt1[i]==31)
6665         {
6666           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6667         }
6668         if(itype[i]==RJUMP&&rs1[i]==31)
6669         {
6670           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6671         }
6672         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6673           if(itype[i]==UJUMP&&rt1[i]==31)
6674           {
6675             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6676             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6677           }
6678           if(itype[i]==RJUMP&&rs1[i]==31)
6679           {
6680             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6681             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6682           }
6683         }*/
6684         branch_unneeded_reg[i]=u;
6685         branch_unneeded_reg_upper[i]=uu;
6686         // Merge in delay slot
                 // tdep is 1 when the uu bit of the delay slot's rt1 is clear
                 // (still needed); dep1/dep2 are then kept needed as well.
6687         tdep=(~uu>>rt1[i+1])&1;
6688         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6689         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6690         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6691         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6692         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6693         u|=1;uu|=1;
6694         // If branch is "likely" (and conditional)
6695         // then we skip the delay slot on the fall-thru path
                 // NOTE(review): the guard i<slen-1 admits i==slen-2, in which
                 // case index i+2 equals slen.  The bt[i+2] guard above uses
                 // i<slen-2 -- confirm unneeded_reg[slen] is meaningful here.
6696         if(likely[i]) {
6697           if(i<slen-1) {
6698             u&=unneeded_reg[i+2];
6699             uu&=unneeded_reg_upper[i+2];
6700           }
6701           else
6702           {
6703             u=1;
6704             uu=1;
6705           }
6706         }
6707       }
6708       else
6709       {
6710         // Internal branch, flag target
6711         bt[(ba[i]-start)>>2]=1;
6712         if(ba[i]<=start+i*4) {
6713           // Backward branch
                   // (source[i]>>16)==0x1000 is BEQ r0,r0, i.e. always taken.
6714           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6715           {
6716             // Unconditional branch
6717             temp_u=1;temp_uu=1;
6718           } else {
6719             // Conditional branch (not taken case)
6720             temp_u=unneeded_reg[i+2];
6721             temp_uu=unneeded_reg_upper[i+2];
6722           }
6723           // Merge in delay slot
6724           tdep=(~temp_uu>>rt1[i+1])&1;
6725           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6726           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6727           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6728           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6729           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6730           temp_u|=1;temp_uu|=1;
6731           // If branch is "likely" (and conditional)
6732           // then we skip the delay slot on the fall-thru path
6733           if(likely[i]) {
6734             if(i<slen-1) {
6735               temp_u&=unneeded_reg[i+2];
6736               temp_uu&=unneeded_reg_upper[i+2];
6737             }
6738             else
6739             {
6740               temp_u=1;
6741               temp_uu=1;
6742             }
6743           }
                   // Merge in the branch instruction itself, then store this
                   // provisional result so the recursive pass over the loop
                   // body (target..i-1) starts from unneeded_reg[i].
6744           tdep=(~temp_uu>>rt1[i])&1;
6745           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6746           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6747           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6748           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6749           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6750           temp_u|=1;temp_uu|=1;
6751           unneeded_reg[i]=temp_u;
6752           unneeded_reg_upper[i]=temp_uu;
6753           // Only go three levels deep.  This recursion can take an
6754           // excessive amount of time if there are a lot of nested loops.
6755           if(r<2) {
6756             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6757           }else{
6758             unneeded_reg[(ba[i]-start)>>2]=1;
6759             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6760           }
                 // The else is commented out, so the block below runs for
                 // backward as well as forward internal branches.
6761         } /*else*/ if(1) {
6762           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6763           {
6764             // Unconditional branch
6765             u=unneeded_reg[(ba[i]-start)>>2];
6766             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6767             branch_unneeded_reg[i]=u;
6768             branch_unneeded_reg_upper[i]=uu;
6769         //u=1;
6770         //uu=1;
6771         //branch_unneeded_reg[i]=u;
6772         //branch_unneeded_reg_upper[i]=uu;
6773             // Merge in delay slot
6774             tdep=(~uu>>rt1[i+1])&1;
6775             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6776             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6777             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6778             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6779             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6780             u|=1;uu|=1;
6781           } else {
6782             // Conditional branch
                     // b/bu describe the taken path (state at the target);
                     // u/uu still describe the fall-through path.
6783             b=unneeded_reg[(ba[i]-start)>>2];
6784             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6785             branch_unneeded_reg[i]=b;
6786             branch_unneeded_reg_upper[i]=bu;
6787         //b=1;
6788         //bu=1;
6789         //branch_unneeded_reg[i]=b;
6790         //branch_unneeded_reg_upper[i]=bu;
6791             // Branch delay slot
                     // NOTE(review): tdep is taken from uu here, while the
                     // mask being updated is bu; the other merges derive tdep
                     // from the mask they update.  Confirm this is intended.
6792             tdep=(~uu>>rt1[i+1])&1;
6793             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6794             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6795             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6796             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6797             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6798             b|=1;bu|=1;
6799             // If branch is "likely" then we skip the
6800             // delay slot on the fall-thru path
6801             if(likely[i]) {
6802               u=b;
6803               uu=bu;
6804               if(i<slen-1) {
6805                 u&=unneeded_reg[i+2];
6806                 uu&=unneeded_reg_upper[i+2];
6807         //u=1;
6808         //uu=1;
6809               }
6810             } else {
                       // Not likely: a register is unneeded only if it is
                       // unneeded on both the taken and the fall-through path.
6811               u&=b;
6812               uu&=bu;
6813         //u=1;
6814         //uu=1;
6815             }
6816             if(i<slen-1) {
6817               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6818               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6819         //branch_unneeded_reg[i]=1;
6820         //branch_unneeded_reg_upper[i]=1;
6821             } else {
6822               branch_unneeded_reg[i]=1;
6823               branch_unneeded_reg_upper[i]=1;
6824             }
6825           }
6826         }
6827       }
6828     }
6829     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6830     {
6831       // SYSCALL instruction (software interrupt)
6832       u=1;
6833       uu=1;
6834     }
6835     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6836     {
6837       // ERET instruction (return from interrupt)
6838       u=1;
6839       uu=1;
6840     }
6841     //u=uu=1; // DEBUG
         // Common step for every instruction (for branches this merges the
         // branch instruction itself after the delay slot handled above).
6842     tdep=(~uu>>rt1[i])&1;
6843     // Written registers are unneeded
6844     u|=1LL<<rt1[i];
6845     u|=1LL<<rt2[i];
6846     uu|=1LL<<rt1[i];
6847     uu|=1LL<<rt2[i];
6848     // Accessed registers are needed
6849     u&=~(1LL<<rs1[i]);
6850     u&=~(1LL<<rs2[i]);
6851     uu&=~(1LL<<us1[i]);
6852     uu&=~(1LL<<us2[i]);
6853     // Source-target dependencies
6854     uu&=~(tdep<<dep1[i]);
6855     uu&=~(tdep<<dep2[i]);
6856     // R0 is always unneeded
6857     u|=1;uu|=1;
6858     // Save it
6859     unneeded_reg[i]=u;
6860     unneeded_reg_upper[i]=uu;
6861     /*
6862     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6863     printf("U:");
6864     int r;
6865     for(r=1;r<=CCREG;r++) {
6866       if((unneeded_reg[i]>>r)&1) {
6867         if(r==HIREG) printf(" HI");
6868         else if(r==LOREG) printf(" LO");
6869         else printf(" r%d",r);
6870       }
6871     }
6872     printf(" UU:");
6873     for(r=1;r<=CCREG;r++) {
6874       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6875         if(r==HIREG) printf(" HI");
6876         else if(r==LOREG) printf(" LO");
6877         else printf(" r%d",r);
6878       }
6879     }
6880     printf("\n");*/
6881   }
       // With FORCE32 the upper-half masks computed above are discarded and
       // every bit is set for the processed range.
6882 #ifdef FORCE32
6883   for (i=iend;i>=istart;i--)
6884   {
6885     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6886   }
6887 #endif
6888 }
6889
6890 // Identify registers which are likely to contain 32-bit values
6891 // This is used to predict whether any branches will jump to a
6892 // location with 64-bit values in registers.
6893 static void provisional_32bit()
6894 {
6895   int i,j;
6896   uint64_t is32=1;
6897   uint64_t lastbranch=1;
6898   
6899   for(i=0;i<slen;i++)
6900   {
6901     if(i>0) {
6902       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
6903         if(i>1) is32=lastbranch;
6904         else is32=1;
6905       }
6906     }
6907     if(i>1)
6908     {
6909       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
6910         if(likely[i-2]) {
6911           if(i>2) is32=lastbranch;
6912           else is32=1;
6913         }
6914       }
6915       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
6916       {
6917         if(rs1[i-2]==0||rs2[i-2]==0)
6918         {
6919           if(rs1[i-2]) {
6920             is32|=1LL<<rs1[i-2];
6921           }
6922           if(rs2[i-2]) {
6923             is32|=1LL<<rs2[i-2];
6924           }
6925         }
6926       }
6927     }
6928     // If something jumps here with 64-bit values
6929     // then promote those registers to 64 bits
6930     if(bt[i])
6931     {
6932       uint64_t temp_is32=is32;
6933       for(j=i-1;j>=0;j--)
6934       {
6935         if(ba[j]==start+i*4) 
6936           //temp_is32&=branch_regs[j].is32;
6937           temp_is32&=p32[j];
6938       }
6939       for(j=i;j<slen;j++)
6940       {
6941         if(ba[j]==start+i*4) 
6942           temp_is32=1;
6943       }
6944       is32=temp_is32;
6945     }
6946     int type=itype[i];
6947     int op=opcode[i];
6948     int op2=opcode2[i];
6949     int rt=rt1[i];
6950     int s1=rs1[i];
6951     int s2=rs2[i];
6952     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
6953       // Branches don't write registers, consider the delay slot instead.
6954       type=itype[i+1];
6955       op=opcode[i+1];
6956       op2=opcode2[i+1];
6957       rt=rt1[i+1];
6958       s1=rs1[i+1];
6959       s2=rs2[i+1];
6960       lastbranch=is32;
6961     }
6962     switch(type) {
6963       case LOAD:
6964         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
6965            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
6966           is32&=~(1LL<<rt);
6967         else
6968           is32|=1LL<<rt;
6969         break;
6970       case STORE:
6971       case STORELR:
6972         break;
6973       case LOADLR:
6974         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
6975         if(op==0x22) is32|=1LL<<rt; // LWL
6976         break;
6977       case IMM16:
6978         if (op==0x08||op==0x09|| // ADDI/ADDIU
6979             op==0x0a||op==0x0b|| // SLTI/SLTIU
6980             op==0x0c|| // ANDI
6981             op==0x0f)  // LUI
6982         {
6983           is32|=1LL<<rt;
6984         }
6985         if(op==0x18||op==0x19) { // DADDI/DADDIU
6986           is32&=~(1LL<<rt);
6987           //if(imm[i]==0)
6988           //  is32|=((is32>>s1)&1LL)<<rt;
6989         }
6990         if(op==0x0d||op==0x0e) { // ORI/XORI
6991           uint64_t sr=((is32>>s1)&1LL);
6992           is32&=~(1LL<<rt);
6993           is32|=sr<<rt;
6994         }
6995         break;
6996       case UJUMP:
6997         break;
6998       case RJUMP:
6999         break;
7000       case CJUMP:
7001         break;
7002       case SJUMP:
7003         break;
7004       case FJUMP:
7005         break;
7006       case ALU:
7007         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
7008           is32|=1LL<<rt;
7009         }
7010         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7011           is32|=1LL<<rt;
7012         }
7013         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7014           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
7015           is32&=~(1LL<<rt);
7016           is32|=sr<<rt;
7017         }
7018         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
7019           if(s1==0&&s2==0) {
7020             is32|=1LL<<rt;
7021           }
7022           else if(s2==0) {
7023             uint64_t sr=((is32>>s1)&1LL);
7024             is32&=~(1LL<<rt);
7025             is32|=sr<<rt;
7026           }
7027           else if(s1==0) {
7028             uint64_t sr=((is32>>s2)&1LL);
7029             is32&=~(1LL<<rt);
7030             is32|=sr<<rt;
7031           }
7032           else {
7033             is32&=~(1LL<<rt);
7034           }
7035         }
7036         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
7037           if(s1==0&&s2==0) {
7038             is32|=1LL<<rt;
7039           }
7040           else if(s2==0) {
7041             uint64_t sr=((is32>>s1)&1LL);
7042             is32&=~(1LL<<rt);
7043             is32|=sr<<rt;
7044           }
7045           else {
7046             is32&=~(1LL<<rt);
7047           }
7048         }
7049         break;
7050       case MULTDIV:
7051         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7052           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
7053         }
7054         else {
7055           is32|=(1LL<<HIREG)|(1LL<<LOREG);
7056         }
7057         break;
7058       case MOV:
7059         {
7060           uint64_t sr=((is32>>s1)&1LL);
7061           is32&=~(1LL<<rt);
7062           is32|=sr<<rt;
7063         }
7064         break;
7065       case SHIFT:
7066         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
7067         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
7068         break;
7069       case SHIFTIMM:
7070         is32|=1LL<<rt;
7071         // DSLL/DSRL/DSRA/DSLL32/DSRL32 but not DSRA32 have 64-bit result
7072         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
7073         break;
7074       case COP0:
7075         if(op2==0) is32|=1LL<<rt; // MFC0
7076         break;
7077       case COP1:
7078       case COP2:
7079         if(op2==0) is32|=1LL<<rt; // MFC1
7080         if(op2==1) is32&=~(1LL<<rt); // DMFC1
7081         if(op2==2) is32|=1LL<<rt; // CFC1
7082         break;
7083       case C1LS:
7084       case C2LS:
7085         break;
7086       case FLOAT:
7087       case FCONV:
7088         break;
7089       case FCOMP:
7090         break;
7091       case C2OP:
7092       case SYSCALL:
7093       case HLECALL:
7094         break;
7095       default:
7096         break;
7097     }
7098     is32|=1;
7099     p32[i]=is32;
7100
7101     if(i>0)
7102     {
7103       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
7104       {
7105         if(rt1[i-1]==31) // JAL/JALR
7106         {
7107           // Subroutine call will return here, don't alloc any registers
7108           is32=1;
7109         }
7110         else if(i+1<slen)
7111         {
7112           // Internal branch will jump here, match registers to caller
7113           is32=0x3FFFFFFFFLL;
7114         }
7115       }
7116     }
7117   }
7118 }
7119
7120 // Identify registers which may be assumed to contain 32-bit values
7121 // and where optimizations will rely on this.
7122 // This is used to determine whether backward branches can safely
7123 // jump to a location with 64-bit values in registers.
     //
     // Walks instructions slen-1 down to 0, maintaining the mask r32, and
     // stores the result for each instruction in pr32[i].  A bit is only ever
     // set for a register whose regs[i].was32 bit is set.
     // NOTE(review): r32 is declared u_int while its bits are produced with
     // 1LL<<n.  If u_int is 32 bits wide, bits for register numbers >= 32 are
     // lost when assigned to r32; the final loop ORs into pr32[i] directly and
     // is not affected.  Confirm whether the narrowing is intended.
7124 static void provisional_r32()
7125 {
7126   u_int r32=0;
7127   int i;
7128   
7129   for (i=slen-1;i>=0;i--)
7130   {
7131     int hr;
7132     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7133     {
7134       if(ba[i]<start || ba[i]>=(start+slen*4))
7135       {
7136         // Branch out of this block, don't need anything
7137         r32=0;
7138       }
7139       else
7140       {
7141         // Internal branch
7142         // Need whatever matches the target
7143         // (and doesn't get overwritten by the delay slot instruction)
7144         r32=0;
7145         int t=(ba[i]-start)>>2;
7146         if(ba[i]>start+i*4) {
7147           // Forward branch
                   // pr32[t] is already available because the loop runs from
                   // the end of the block.  It is only taken over when every
                   // register it names is 32-bit at the branch (was32).
7148           //if(!(requires_32bit[t]&~regs[i].was32))
7149           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7150           if(!(pr32[t]&~regs[i].was32))
7151             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7152         }else{
7153           // Backward branch
                   // pr32[t] has not been computed for this pass yet, so the
                   // requirement is taken from regs[t].was32 masked with
                   // ~unneeded_reg_upper[t] instead.
7154           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
7155             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7156         }
7157       }
7158       // Conditional branch may need registers for following instructions
             // ((source[i]>>16)==0x1000 is BEQ r0,r0, treated as unconditional)
7159       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7160       {
7161         if(i<slen-2) {
7162           //r32|=requires_32bit[i+2];
7163           r32|=pr32[i+2];
7164           r32&=regs[i].was32;
7165           // Mark this address as a branch target since it may be called
7166           // upon return from interrupt
7167           //bt[i+2]=1;
7168         }
7169       }
7170       // Merge in delay slot
7171       if(!likely[i]) {
7172         // These are overwritten unless the branch is "likely"
7173         // and the delay slot is nullified if not taken
7174         r32&=~(1LL<<rt1[i+1]);
7175         r32&=~(1LL<<rt2[i+1]);
7176       }
7177       // Assume these are needed (delay slot)
7178       if(us1[i+1]>0)
7179       {
7180         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
7181       }
7182       if(us2[i+1]>0)
7183       {
7184         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
7185       }
             // dep1/dep2 of the delay slot only count when their
             // unneeded_reg_upper bit at the branch is clear.
7186       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
7187       {
7188         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
7189       }
7190       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
7191       {
7192         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
7193       }
7194     }
7195     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7196     {
7197       // SYSCALL instruction (software interrupt)
7198       r32=0;
7199     }
7200     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7201     {
7202       // ERET instruction (return from interrupt)
7203       r32=0;
7204     }
7205     // Check 32 bits
         // Registers written by instruction i drop out of the mask; registers
         // it reads through us1/us2 (and dep1/dep2 when their upper half is
         // not unneeded) are added if they are 32-bit here.
7206     r32&=~(1LL<<rt1[i]);
7207     r32&=~(1LL<<rt2[i]);
7208     if(us1[i]>0)
7209     {
7210       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
7211     }
7212     if(us2[i]>0)
7213     {
7214       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
7215     }
7216     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
7217     {
7218       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
7219     }
7220     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
7221     {
7222       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
7223     }
7224     //requires_32bit[i]=r32;
7225     pr32[i]=r32;
7226     
7227     // Dirty registers which are 32-bit, require 32-bit input
7228     // as they will be written as 32-bit values
         // These bits go into pr32[i] only; r32 carried to the previous
         // instruction is not changed by this loop.
7229     for(hr=0;hr<HOST_REGS;hr++)
7230     {
7231       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
7232         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
7233           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
7234           pr32[i]|=1LL<<regs[i].regmap_entry[hr];
7235           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
7236         }
7237       }
7238     }
7239   }
7240 }
7241
7242 // Write back dirty registers as soon as we will no longer modify them,
7243 // so that we don't end up with lots of writes at the branches.
7244 void clean_registers(int istart,int iend,int wr)
7245 {
7246   int i;
7247   int r;
7248   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
7249   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
7250   if(iend==slen-1) {
7251     will_dirty_i=will_dirty_next=0;
7252     wont_dirty_i=wont_dirty_next=0;
7253   }else{
7254     will_dirty_i=will_dirty_next=will_dirty[iend+1];
7255     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
7256   }
7257   for (i=iend;i>=istart;i--)
7258   {
7259     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7260     {
7261       if(ba[i]<start || ba[i]>=(start+slen*4))
7262       {
7263         // Branch out of this block, flush all regs
7264         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7265         {
7266           // Unconditional branch
7267           will_dirty_i=0;
7268           wont_dirty_i=0;
7269           // Merge in delay slot (will dirty)
7270           for(r=0;r<HOST_REGS;r++) {
7271             if(r!=EXCLUDE_REG) {
7272               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7273               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7274               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7275               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7276               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7277               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7278               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7279               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7280               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7281               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7282               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7283               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7284               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7285               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7286             }
7287           }
7288         }
7289         else
7290         {
7291           // Conditional branch
7292           will_dirty_i=0;
7293           wont_dirty_i=wont_dirty_next;
7294           // Merge in delay slot (will dirty)
7295           for(r=0;r<HOST_REGS;r++) {
7296             if(r!=EXCLUDE_REG) {
7297               if(!likely[i]) {
7298                 // Might not dirty if likely branch is not taken
7299                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7300                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7301                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7302                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7303                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7304                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
7305                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7306                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7307                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7308                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7309                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7310                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7311                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7312                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7313               }
7314             }
7315           }
7316         }
7317         // Merge in delay slot (wont dirty)
7318         for(r=0;r<HOST_REGS;r++) {
7319           if(r!=EXCLUDE_REG) {
7320             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7321             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7322             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7323             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7324             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7325             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7326             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7327             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7328             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7329             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7330           }
7331         }
7332         if(wr) {
7333           #ifndef DESTRUCTIVE_WRITEBACK
7334           branch_regs[i].dirty&=wont_dirty_i;
7335           #endif
7336           branch_regs[i].dirty|=will_dirty_i;
7337         }
7338       }
7339       else
7340       {
7341         // Internal branch
7342         if(ba[i]<=start+i*4) {
7343           // Backward branch
7344           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7345           {
7346             // Unconditional branch
7347             temp_will_dirty=0;
7348             temp_wont_dirty=0;
7349             // Merge in delay slot (will dirty)
7350             for(r=0;r<HOST_REGS;r++) {
7351               if(r!=EXCLUDE_REG) {
7352                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7353                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7354                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7355                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7356                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7357                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7358                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7359                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7360                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7361                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7362                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7363                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7364                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7365                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7366               }
7367             }
7368           } else {
7369             // Conditional branch (not taken case)
7370             temp_will_dirty=will_dirty_next;
7371             temp_wont_dirty=wont_dirty_next;
7372             // Merge in delay slot (will dirty)
7373             for(r=0;r<HOST_REGS;r++) {
7374               if(r!=EXCLUDE_REG) {
7375                 if(!likely[i]) {
7376                   // Will not dirty if likely branch is not taken
7377                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7378                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7379                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7380                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7381                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7382                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7383                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7384                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7385                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7386                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7387                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7388                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7389                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7390                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7391                 }
7392               }
7393             }
7394           }
7395           // Merge in delay slot (wont dirty)
7396           for(r=0;r<HOST_REGS;r++) {
7397             if(r!=EXCLUDE_REG) {
7398               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7399               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7400               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7401               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7402               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7403               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7404               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7405               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7406               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7407               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7408             }
7409           }
7410           // Deal with changed mappings
7411           if(i<iend) {
7412             for(r=0;r<HOST_REGS;r++) {
7413               if(r!=EXCLUDE_REG) {
7414                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7415                   temp_will_dirty&=~(1<<r);
7416                   temp_wont_dirty&=~(1<<r);
7417                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7418                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7419                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7420                   } else {
7421                     temp_will_dirty|=1<<r;
7422                     temp_wont_dirty|=1<<r;
7423                   }
7424                 }
7425               }
7426             }
7427           }
7428           if(wr) {
7429             will_dirty[i]=temp_will_dirty;
7430             wont_dirty[i]=temp_wont_dirty;
7431             clean_registers((ba[i]-start)>>2,i-1,0);
7432           }else{
7433             // Limit recursion.  It can take an excessive amount
7434             // of time if there are a lot of nested loops.
7435             will_dirty[(ba[i]-start)>>2]=0;
7436             wont_dirty[(ba[i]-start)>>2]=-1;
7437           }
7438         }
7439         /*else*/ if(1)
7440         {
7441           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7442           {
7443             // Unconditional branch
7444             will_dirty_i=0;
7445             wont_dirty_i=0;
7446           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7447             for(r=0;r<HOST_REGS;r++) {
7448               if(r!=EXCLUDE_REG) {
7449                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7450                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7451                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7452                 }
7453               }
7454             }
7455           //}
7456             // Merge in delay slot
7457             for(r=0;r<HOST_REGS;r++) {
7458               if(r!=EXCLUDE_REG) {
7459                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7460                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7461                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7462                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7463                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7464                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7465                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7466                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7467                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7468                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7469                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7470                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7471                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7472                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7473               }
7474             }
7475           } else {
7476             // Conditional branch
7477             will_dirty_i=will_dirty_next;
7478             wont_dirty_i=wont_dirty_next;
7479           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7480             for(r=0;r<HOST_REGS;r++) {
7481               if(r!=EXCLUDE_REG) {
7482                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7483                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7484                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7485                 }
7486                 else
7487                 {
7488                   will_dirty_i&=~(1<<r);
7489                 }
7490                 // Treat delay slot as part of branch too
7491                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7492                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7493                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7494                 }
7495                 else
7496                 {
7497                   will_dirty[i+1]&=~(1<<r);
7498                 }*/
7499               }
7500             }
7501           //}
7502             // Merge in delay slot
7503             for(r=0;r<HOST_REGS;r++) {
7504               if(r!=EXCLUDE_REG) {
7505                 if(!likely[i]) {
7506                   // Might not dirty if likely branch is not taken
7507                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7508                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7509                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7510                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7511                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7512                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7513                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7514                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7515                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7516                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7517                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7518                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7519                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7520                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7521                 }
7522               }
7523             }
7524           }
7525           // Merge in delay slot
7526           for(r=0;r<HOST_REGS;r++) {
7527             if(r!=EXCLUDE_REG) {
7528               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7529               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7530               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7531               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7532               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7533               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7534               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7535               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7536               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7537               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7538             }
7539           }
7540           if(wr) {
7541             #ifndef DESTRUCTIVE_WRITEBACK
7542             branch_regs[i].dirty&=wont_dirty_i;
7543             #endif
7544             branch_regs[i].dirty|=will_dirty_i;
7545           }
7546         }
7547       }
7548     }
7549     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7550     {
7551       // SYSCALL instruction (software interrupt)
7552       will_dirty_i=0;
7553       wont_dirty_i=0;
7554     }
7555     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7556     {
7557       // ERET instruction (return from interrupt)
7558       will_dirty_i=0;
7559       wont_dirty_i=0;
7560     }
7561     will_dirty_next=will_dirty_i;
7562     wont_dirty_next=wont_dirty_i;
7563     for(r=0;r<HOST_REGS;r++) {
7564       if(r!=EXCLUDE_REG) {
7565         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7566         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7567         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7568         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7569         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7570         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7571         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7572         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7573         if(i>istart) {
7574           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7575           {
7576             // Don't store a register immediately after writing it,
7577             // may prevent dual-issue.
7578             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7579             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7580           }
7581         }
7582       }
7583     }
7584     // Save it
7585     will_dirty[i]=will_dirty_i;
7586     wont_dirty[i]=wont_dirty_i;
7587     // Mark registers that won't be dirtied as not dirty
7588     if(wr) {
7589       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7590       for(r=0;r<HOST_REGS;r++) {
7591         if((will_dirty_i>>r)&1) {
7592           printf(" r%d",r);
7593         }
7594       }
7595       printf("\n");*/
7596
7597       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7598         regs[i].dirty|=will_dirty_i;
7599         #ifndef DESTRUCTIVE_WRITEBACK
7600         regs[i].dirty&=wont_dirty_i;
7601         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7602         {
7603           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7604             for(r=0;r<HOST_REGS;r++) {
7605               if(r!=EXCLUDE_REG) {
7606                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7607                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7608                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);/*assert(!((wont_dirty_i>>r)&1));*/}
7609               }
7610             }
7611           }
7612         }
7613         else
7614         {
7615           if(i<iend) {
7616             for(r=0;r<HOST_REGS;r++) {
7617               if(r!=EXCLUDE_REG) {
7618                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7619                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7620                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);/*assert(!((wont_dirty_i>>r)&1));*/}
7621               }
7622             }
7623           }
7624         }
7625         #endif
7626       //}
7627     }
7628     // Deal with changed mappings
7629     temp_will_dirty=will_dirty_i;
7630     temp_wont_dirty=wont_dirty_i;
7631     for(r=0;r<HOST_REGS;r++) {
7632       if(r!=EXCLUDE_REG) {
7633         int nr;
7634         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7635           if(wr) {
7636             #ifndef DESTRUCTIVE_WRITEBACK
7637             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7638             #endif
7639             regs[i].wasdirty|=will_dirty_i&(1<<r);
7640           }
7641         }
7642         else if((nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7643           // Register moved to a different register
7644           will_dirty_i&=~(1<<r);
7645           wont_dirty_i&=~(1<<r);
7646           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7647           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7648           if(wr) {
7649             #ifndef DESTRUCTIVE_WRITEBACK
7650             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7651             #endif
7652             regs[i].wasdirty|=will_dirty_i&(1<<r);
7653           }
7654         }
7655         else {
7656           will_dirty_i&=~(1<<r);
7657           wont_dirty_i&=~(1<<r);
7658           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7659             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7660             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7661           } else {
7662             wont_dirty_i|=1<<r;
7663             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);/*assert(!((will_dirty>>r)&1));*/
7664           }
7665         }
7666       }
7667     }
7668   }
7669 }
7670
7671   /* disassembly */
7672 void disassemble_inst(int i)
7673 {
7674     if (bt[i]) printf("*"); else printf(" ");
7675     switch(itype[i]) {
7676       case UJUMP:
7677         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7678       case CJUMP:
7679         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7680       case SJUMP:
7681         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7682       case FJUMP:
7683         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7684       case RJUMP:
7685         if (opcode[i]==0x9&&rt1[i]!=31)
7686           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
7687         else
7688           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7689         break;
7690       case SPAN:
7691         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7692       case IMM16:
7693         if(opcode[i]==0xf) //LUI
7694           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7695         else
7696           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7697         break;
7698       case LOAD:
7699       case LOADLR:
7700         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7701         break;
7702       case STORE:
7703       case STORELR:
7704         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7705         break;
7706       case ALU:
7707       case SHIFT:
7708         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7709         break;
7710       case MULTDIV:
7711         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7712         break;
7713       case SHIFTIMM:
7714         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7715         break;
7716       case MOV:
7717         if((opcode2[i]&0x1d)==0x10)
7718           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7719         else if((opcode2[i]&0x1d)==0x11)
7720           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7721         else
7722           printf (" %x: %s\n",start+i*4,insn[i]);
7723         break;
7724       case COP0:
7725         if(opcode2[i]==0)
7726           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7727         else if(opcode2[i]==4)
7728           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7729         else printf (" %x: %s\n",start+i*4,insn[i]);
7730         break;
7731       case COP1:
7732         if(opcode2[i]<3)
7733           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7734         else if(opcode2[i]>3)
7735           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7736         else printf (" %x: %s\n",start+i*4,insn[i]);
7737         break;
7738       case COP2:
7739         if(opcode2[i]<3)
7740           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7741         else if(opcode2[i]>3)
7742           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7743         else printf (" %x: %s\n",start+i*4,insn[i]);
7744         break;
7745       case C1LS:
7746         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7747         break;
7748       case C2LS:
7749         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7750         break;
7751       case INTCALL:
7752         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7753         break;
7754       default:
7755         //printf (" %s %8x\n",insn[i],source[i]);
7756         printf (" %x: %s\n",start+i*4,insn[i]);
7757     }
7758 }
7759
7760 // clear the state completely, instead of just marking
7761 // things invalid like invalidate_all_pages() does
7762 void new_dynarec_clear_full()
7763 {
7764   int n;
7765   out=(u_char *)BASE_ADDR;
7766   memset(invalid_code,1,sizeof(invalid_code));
7767   memset(hash_table,0xff,sizeof(hash_table));
7768   memset(mini_ht,-1,sizeof(mini_ht));
7769   memset(restore_candidate,0,sizeof(restore_candidate));
7770   memset(shadow,0,sizeof(shadow));
7771   copy=shadow;
7772   expirep=16384; // Expiry pointer, +2 blocks
7773   pending_exception=0;
7774   literalcount=0;
7775   stop_after_jal=0;
7776   // TLB
7777 #ifndef DISABLE_TLB
7778   using_tlb=0;
7779 #endif
7780   sp_in_mirror=0;
7781   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7782     memory_map[n]=-1;
7783   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7784     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7785   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7786     memory_map[n]=-1;
7787   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7788   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7789   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7790 }
7791
7792 void new_dynarec_init()
7793 {
7794   printf("Init new dynarec\n");
7795   out=(u_char *)BASE_ADDR;
7796   if (mmap (out, 1<<TARGET_SIZE_2,
7797             PROT_READ | PROT_WRITE | PROT_EXEC,
7798             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7799             -1, 0) <= 0) {printf("mmap() failed\n");}
7800 #ifdef MUPEN64
7801   rdword=&readmem_dword;
7802   fake_pc.f.r.rs=&readmem_dword;
7803   fake_pc.f.r.rt=&readmem_dword;
7804   fake_pc.f.r.rd=&readmem_dword;
7805 #endif
7806   int n;
7807   new_dynarec_clear_full();
7808 #ifdef HOST_IMM8
7809   // Copy this into local area so we don't have to put it in every literal pool
7810   invc_ptr=invalid_code;
7811 #endif
7812 #ifdef MUPEN64
7813   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
7814     writemem[n] = write_nomem_new;
7815     writememb[n] = write_nomemb_new;
7816     writememh[n] = write_nomemh_new;
7817 #ifndef FORCE32
7818     writememd[n] = write_nomemd_new;
7819 #endif
7820     readmem[n] = read_nomem_new;
7821     readmemb[n] = read_nomemb_new;
7822     readmemh[n] = read_nomemh_new;
7823 #ifndef FORCE32
7824     readmemd[n] = read_nomemd_new;
7825 #endif
7826   }
7827   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
7828     writemem[n] = write_rdram_new;
7829     writememb[n] = write_rdramb_new;
7830     writememh[n] = write_rdramh_new;
7831 #ifndef FORCE32
7832     writememd[n] = write_rdramd_new;
7833 #endif
7834   }
7835   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
7836     writemem[n] = write_nomem_new;
7837     writememb[n] = write_nomemb_new;
7838     writememh[n] = write_nomemh_new;
7839 #ifndef FORCE32
7840     writememd[n] = write_nomemd_new;
7841 #endif
7842     readmem[n] = read_nomem_new;
7843     readmemb[n] = read_nomemb_new;
7844     readmemh[n] = read_nomemh_new;
7845 #ifndef FORCE32
7846     readmemd[n] = read_nomemd_new;
7847 #endif
7848   }
7849 #endif
7850   tlb_hacks();
7851   arch_init();
7852 }
7853
7854 void new_dynarec_cleanup()
7855 {
7856   int n;
7857   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
7858   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7859   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7860   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7861   #ifdef ROM_COPY
7862   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
7863   #endif
7864 }
7865
7866 int new_recompile_block(int addr)
7867 {
7868 /*
7869   if(addr==0x800cd050) {
7870     int block;
7871     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
7872     int n;
7873     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
7874   }
7875 */
7876   //if(Count==365117028) tracedebug=1;
7877   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7878   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7879   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7880   //if(debug) 
7881   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7882   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7883   /*if(Count>=312978186) {
7884     rlist();
7885   }*/
7886   //rlist();
7887   start = (u_int)addr&~3;
7888   //assert(((u_int)addr&1)==0);
7889 #ifdef PCSX
7890   if(!sp_in_mirror&&(signed int)(psxRegs.GPR.n.sp&0xffe00000)>0x80200000&&
7891      0x10000<=psxRegs.GPR.n.sp&&(psxRegs.GPR.n.sp&~0xe0e00000)<RAM_SIZE) {
7892     printf("SP hack enabled (%08x), @%08x\n", psxRegs.GPR.n.sp, psxRegs.pc);
7893     sp_in_mirror=1;
7894   }
7895   if (Config.HLE && start == 0x80001000) // hlecall
7896   {
7897     // XXX: is this enough? Maybe check hleSoftCall?
7898     u_int beginning=(u_int)out;
7899     u_int page=get_page(start);
7900     invalid_code[start>>12]=0;
7901     emit_movimm(start,0);
7902     emit_writeword(0,(int)&pcaddr);
7903     emit_jmp((int)new_dyna_leave);
7904 #ifdef __arm__
7905     __clear_cache((void *)beginning,out);
7906 #endif
7907     ll_add(jump_in+page,start,(void *)beginning);
7908     return 0;
7909   }
7910   else if ((u_int)addr < 0x00200000 ||
7911     (0xa0000000 <= addr && addr < 0xa0200000)) {
7912     // used for BIOS calls mostly?
7913     source = (u_int *)((u_int)rdram+(start&0x1fffff));
7914     pagelimit = (addr&0xa0000000)|0x00200000;
7915   }
7916   else if (!Config.HLE && (
7917 /*    (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7918     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7919     // BIOS
7920     source = (u_int *)((u_int)psxR+(start&0x7ffff));
7921     pagelimit = (addr&0xfff00000)|0x80000;
7922   }
7923   else
7924 #endif
7925 #ifdef MUPEN64
7926   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
7927     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
7928     pagelimit = 0xa4001000;
7929   }
7930   else
7931 #endif
7932   if ((int)addr >= 0x80000000 && (int)addr < 0x80000000+RAM_SIZE) {
7933     source = (u_int *)((u_int)rdram+start-0x80000000);
7934     pagelimit = 0x80000000+RAM_SIZE;
7935   }
7936 #ifndef DISABLE_TLB
7937   else if ((signed int)addr >= (signed int)0xC0000000) {
7938     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
7939     //if(tlb_LUT_r[start>>12])
7940       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
7941     if((signed int)memory_map[start>>12]>=0) {
7942       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
7943       pagelimit=(start+4096)&0xFFFFF000;
7944       int map=memory_map[start>>12];
7945       int i;
7946       for(i=0;i<5;i++) {
7947         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
7948         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
7949       }
7950       assem_debug("pagelimit=%x\n",pagelimit);
7951       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
7952     }
7953     else {
7954       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
7955       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
7956       return -1; // Caller will invoke exception handler
7957     }
7958     //printf("source= %x\n",(int)source);
7959   }
7960 #endif
7961   else {
7962     printf("Compile at bogus memory address: %x \n", (int)addr);
7963     exit(1);
7964   }
7965
7966   /* Pass 1: disassemble */
7967   /* Pass 2: register dependencies, branch targets */
7968   /* Pass 3: register allocation */
7969   /* Pass 4: branch dependencies */
7970   /* Pass 5: pre-alloc */
7971   /* Pass 6: optimize clean/dirty state */
7972   /* Pass 7: flag 32-bit registers */
7973   /* Pass 8: assembly */
7974   /* Pass 9: linker */
7975   /* Pass 10: garbage collection / free memory */
7976
7977   int i,j;
7978   int done=0;
7979   unsigned int type,op,op2;
7980
7981   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7982   
7983   /* Pass 1 disassembly */
7984
7985   for(i=0;!done;i++) {
7986     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7987     minimum_free_regs[i]=0;
7988     opcode[i]=op=source[i]>>26;
7989     switch(op)
7990     {
7991       case 0x00: strcpy(insn[i],"special"); type=NI;
7992         op2=source[i]&0x3f;
7993         switch(op2)
7994         {
7995           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7996           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7997           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7998           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7999           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
8000           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
8001           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
8002           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
8003           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
8004           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
8005           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
8006           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
8007           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
8008           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
8009           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
8010           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
8011           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
8012           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
8013           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
8014           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
8015           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
8016           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
8017           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
8018           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
8019           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
8020           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
8021           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
8022           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
8023           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
8024           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
8025           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
8026           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
8027           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
8028           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
8029           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
8030 #ifndef FORCE32
8031           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
8032           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
8033           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
8034           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
8035           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
8036           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
8037           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
8038           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
8039           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
8040           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
8041           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
8042           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
8043           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
8044           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
8045           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
8046           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
8047           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
8048 #endif
8049         }
8050         break;
8051       case 0x01: strcpy(insn[i],"regimm"); type=NI;
8052         op2=(source[i]>>16)&0x1f;
8053         switch(op2)
8054         {
8055           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
8056           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
8057           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
8058           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
8059           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
8060           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
8061           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
8062           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
8063           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
8064           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
8065           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
8066           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
8067           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
8068           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
8069         }
8070         break;
8071       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
8072       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
8073       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
8074       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
8075       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
8076       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
8077       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
8078       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
8079       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
8080       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
8081       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
8082       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
8083       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
8084       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
8085       case 0x10: strcpy(insn[i],"cop0"); type=NI;
8086         op2=(source[i]>>21)&0x1f;
8087         switch(op2)
8088         {
8089           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
8090           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
8091           case 0x10: strcpy(insn[i],"tlb"); type=NI;
8092           switch(source[i]&0x3f)
8093           {
8094             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
8095             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
8096             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
8097             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
8098 #ifdef PCSX
8099             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
8100 #else
8101             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
8102 #endif
8103           }
8104         }
8105         break;
8106       case 0x11: strcpy(insn[i],"cop1"); type=NI;
8107         op2=(source[i]>>21)&0x1f;
8108         switch(op2)
8109         {
8110           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
8111           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
8112           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
8113           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
8114           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
8115           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
8116           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
8117           switch((source[i]>>16)&0x3)
8118           {
8119             case 0x00: strcpy(insn[i],"BC1F"); break;
8120             case 0x01: strcpy(insn[i],"BC1T"); break;
8121             case 0x02: strcpy(insn[i],"BC1FL"); break;
8122             case 0x03: strcpy(insn[i],"BC1TL"); break;
8123           }
8124           break;
8125           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
8126           switch(source[i]&0x3f)
8127           {
8128             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
8129             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
8130             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
8131             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
8132             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
8133             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
8134             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
8135             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
8136             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
8137             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
8138             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
8139             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
8140             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
8141             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
8142             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
8143             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
8144             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
8145             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
8146             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
8147             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
8148             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
8149             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
8150             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
8151             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
8152             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
8153             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
8154             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
8155             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
8156             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
8157             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
8158             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
8159             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
8160             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
8161             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
8162             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
8163           }
8164           break;
8165           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
8166           switch(source[i]&0x3f)
8167           {
8168             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
8169             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
8170             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
8171             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
8172             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
8173             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
8174             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
8175             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
8176             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
8177             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
8178             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
8179             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
8180             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
8181             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
8182             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
8183             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
8184             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
8185             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
8186             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
8187             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
8188             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
8189             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
8190             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
8191             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
8192             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
8193             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
8194             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
8195             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
8196             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
8197             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
8198             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
8199             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
8200             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
8201             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
8202             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
8203           }
8204           break;
8205           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
8206           switch(source[i]&0x3f)
8207           {
8208             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
8209             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
8210           }
8211           break;
8212           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
8213           switch(source[i]&0x3f)
8214           {
8215             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
8216             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
8217           }
8218           break;
8219         }
8220         break;
8221 #ifndef FORCE32
8222       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
8223       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
8224       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
8225       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
8226       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
8227       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
8228       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
8229       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
8230 #endif
8231       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
8232       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
8233       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
8234       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
8235       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
8236       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
8237       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
8238       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
8239       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
8240       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
8241       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
8242       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
8243 #ifndef FORCE32
8244       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
8245       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
8246 #endif
8247       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
8248       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
8249       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
8250       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
8251 #ifndef FORCE32
8252       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
8253       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
8254       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
8255 #endif
8256       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
8257       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
8258 #ifndef FORCE32
8259       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
8260       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
8261       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
8262 #endif
8263 #ifdef PCSX
8264       case 0x12: strcpy(insn[i],"COP2"); type=NI;
8265         // note: COP MIPS-1 encoding differs from MIPS32
8266         op2=(source[i]>>21)&0x1f;
8267         if (source[i]&0x3f) {
8268           if (gte_handlers[source[i]&0x3f]!=NULL) {
8269             snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
8270             type=C2OP;
8271           }
8272         }
8273         else switch(op2)
8274         {
8275           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
8276           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
8277           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
8278           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
8279         }
8280         break;
8281       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
8282       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
8283       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
8284 #endif
8285       default: strcpy(insn[i],"???"); type=NI;
8286         printf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
8287         break;
8288     }
8289     itype[i]=type;
8290     opcode2[i]=op2;
8291     /* Get registers/immediates */
8292     lt1[i]=0;
8293     us1[i]=0;
8294     us2[i]=0;
8295     dep1[i]=0;
8296     dep2[i]=0;
8297     switch(type) {
8298       case LOAD:
8299         rs1[i]=(source[i]>>21)&0x1f;
8300         rs2[i]=0;
8301         rt1[i]=(source[i]>>16)&0x1f;
8302         rt2[i]=0;
8303         imm[i]=(short)source[i];
8304         break;
8305       case STORE:
8306       case STORELR:
8307         rs1[i]=(source[i]>>21)&0x1f;
8308         rs2[i]=(source[i]>>16)&0x1f;
8309         rt1[i]=0;
8310         rt2[i]=0;
8311         imm[i]=(short)source[i];
8312         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
8313         break;
8314       case LOADLR:
8315         // LWL/LWR only load part of the register,
8316         // therefore the target register must be treated as a source too
8317         rs1[i]=(source[i]>>21)&0x1f;
8318         rs2[i]=(source[i]>>16)&0x1f;
8319         rt1[i]=(source[i]>>16)&0x1f;
8320         rt2[i]=0;
8321         imm[i]=(short)source[i];
8322         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
8323         if(op==0x26) dep1[i]=rt1[i]; // LWR
8324         break;
8325       case IMM16:
8326         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
8327         else rs1[i]=(source[i]>>21)&0x1f;
8328         rs2[i]=0;
8329         rt1[i]=(source[i]>>16)&0x1f;
8330         rt2[i]=0;
8331         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
8332           imm[i]=(unsigned short)source[i];
8333         }else{
8334           imm[i]=(short)source[i];
8335         }
8336         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
8337         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
8338         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
8339         break;
8340       case UJUMP:
8341         rs1[i]=0;
8342         rs2[i]=0;
8343         rt1[i]=0;
8344         rt2[i]=0;
8345         // The JAL instruction writes to r31.
8346         if (op&1) {
8347           rt1[i]=31;
8348         }
8349         rs2[i]=CCREG;
8350         break;
8351       case RJUMP:
8352         rs1[i]=(source[i]>>21)&0x1f;
8353         rs2[i]=0;
8354         rt1[i]=0;
8355         rt2[i]=0;
8356         // The JALR instruction writes to rd.
8357         if (op2&1) {
8358           rt1[i]=(source[i]>>11)&0x1f;
8359         }
8360         rs2[i]=CCREG;
8361         break;
8362       case CJUMP:
8363         rs1[i]=(source[i]>>21)&0x1f;
8364         rs2[i]=(source[i]>>16)&0x1f;
8365         rt1[i]=0;
8366         rt2[i]=0;
8367         if(op&2) { // BGTZ/BLEZ
8368           rs2[i]=0;
8369         }
8370         us1[i]=rs1[i];
8371         us2[i]=rs2[i];
8372         likely[i]=op>>4;
8373         break;
8374       case SJUMP:
8375         rs1[i]=(source[i]>>21)&0x1f;
8376         rs2[i]=CCREG;
8377         rt1[i]=0;
8378         rt2[i]=0;
8379         us1[i]=rs1[i];
8380         if(op2&0x10) { // BxxAL
8381           rt1[i]=31;
8382           // NOTE: If the branch is not taken, r31 is still overwritten
8383         }
8384         likely[i]=(op2&2)>>1;
8385         break;
8386       case FJUMP:
8387         rs1[i]=FSREG;
8388         rs2[i]=CSREG;
8389         rt1[i]=0;
8390         rt2[i]=0;
8391         likely[i]=((source[i])>>17)&1;
8392         break;
8393       case ALU:
8394         rs1[i]=(source[i]>>21)&0x1f; // source
8395         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
8396         rt1[i]=(source[i]>>11)&0x1f; // destination
8397         rt2[i]=0;
8398         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
8399           us1[i]=rs1[i];us2[i]=rs2[i];
8400         }
8401         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
8402           dep1[i]=rs1[i];dep2[i]=rs2[i];
8403         }
8404         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
8405           dep1[i]=rs1[i];dep2[i]=rs2[i];
8406         }
8407         break;
8408       case MULTDIV:
8409         rs1[i]=(source[i]>>21)&0x1f; // source
8410         rs2[i]=(source[i]>>16)&0x1f; // divisor
8411         rt1[i]=HIREG;
8412         rt2[i]=LOREG;
8413         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
8414           us1[i]=rs1[i];us2[i]=rs2[i];
8415         }
8416         break;
8417       case MOV:
8418         rs1[i]=0;
8419         rs2[i]=0;
8420         rt1[i]=0;
8421         rt2[i]=0;
8422         if(op2==0x10) rs1[i]=HIREG; // MFHI
8423         if(op2==0x11) rt1[i]=HIREG; // MTHI
8424         if(op2==0x12) rs1[i]=LOREG; // MFLO
8425         if(op2==0x13) rt1[i]=LOREG; // MTLO
8426         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
8427         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
8428         dep1[i]=rs1[i];
8429         break;
8430       case SHIFT:
8431         rs1[i]=(source[i]>>16)&0x1f; // target of shift
8432         rs2[i]=(source[i]>>21)&0x1f; // shift amount
8433         rt1[i]=(source[i]>>11)&0x1f; // destination
8434         rt2[i]=0;
8435         // DSLLV/DSRLV/DSRAV are 64-bit
8436         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
8437         break;
8438       case SHIFTIMM:
8439         rs1[i]=(source[i]>>16)&0x1f;
8440         rs2[i]=0;
8441         rt1[i]=(source[i]>>11)&0x1f;
8442         rt2[i]=0;
8443         imm[i]=(source[i]>>6)&0x1f;
8444         // DSxx32 instructions
8445         if(op2>=0x3c) imm[i]|=0x20;
8446         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
8447         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
8448         break;
8449       case COP0:
8450         rs1[i]=0;
8451         rs2[i]=0;
8452         rt1[i]=0;
8453         rt2[i]=0;
8454         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
8455         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
8456         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8457         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8458         break;
8459       case COP1:
8460       case COP2:
8461         rs1[i]=0;
8462         rs2[i]=0;
8463         rt1[i]=0;
8464         rt2[i]=0;
8465         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8466         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8467         if(op2==5) us1[i]=rs1[i]; // DMTC1
8468         rs2[i]=CSREG;
8469         break;
8470       case C1LS:
8471         rs1[i]=(source[i]>>21)&0x1F;
8472         rs2[i]=CSREG;
8473         rt1[i]=0;
8474         rt2[i]=0;
8475         imm[i]=(short)source[i];
8476         break;
8477       case C2LS:
8478         rs1[i]=(source[i]>>21)&0x1F;
8479         rs2[i]=0;
8480         rt1[i]=0;
8481         rt2[i]=0;
8482         imm[i]=(short)source[i];
8483         break;
8484       case FLOAT:
8485       case FCONV:
8486         rs1[i]=0;
8487         rs2[i]=CSREG;
8488         rt1[i]=0;
8489         rt2[i]=0;
8490         break;
8491       case FCOMP:
8492         rs1[i]=FSREG;
8493         rs2[i]=CSREG;
8494         rt1[i]=FSREG;
8495         rt2[i]=0;
8496         break;
8497       case SYSCALL:
8498       case HLECALL:
8499       case INTCALL:
8500         rs1[i]=CCREG;
8501         rs2[i]=0;
8502         rt1[i]=0;
8503         rt2[i]=0;
8504         break;
8505       default:
8506         rs1[i]=0;
8507         rs2[i]=0;
8508         rt1[i]=0;
8509         rt2[i]=0;
8510     }
8511     /* Calculate branch target addresses */
8512     if(type==UJUMP)
8513       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8514     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8515       ba[i]=start+i*4+8; // Ignore never taken branch
8516     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8517       ba[i]=start+i*4+8; // Ignore never taken branch
8518     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8519       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8520     else ba[i]=-1;
8521 #ifdef PCSX
8522     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
8523       int do_in_intrp=0;
8524       // branch in delay slot?
8525       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
8526         // don't handle first branch and call interpreter if it's hit
8527         printf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
8528         do_in_intrp=1;
8529       }
8530       // basic load delay detection
8531       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
8532         int t=(ba[i-1]-start)/4;
8533         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
8534           // jump target wants DS result - potential load delay effect
8535           printf("load delay @%08x (%08x)\n", addr + i*4, addr);
8536           do_in_intrp=1;
8537           bt[t+1]=1; // expected return from interpreter
8538         }
8539         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
8540               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
8541           // v0 overwrite like this is a sign of trouble, bail out
8542           printf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
8543           do_in_intrp=1;
8544         }
8545       }
8546       if(do_in_intrp) {
8547         rs1[i-1]=CCREG;
8548         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
8549         ba[i-1]=-1;
8550         itype[i-1]=INTCALL;
8551         done=2;
8552         i--; // don't compile the DS
8553       }
8554     }
8555 #endif
8556     /* Is this the end of the block? */
8557     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8558       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
8559         done=2;
8560       }
8561       else {
8562         if(stop_after_jal) done=1;
8563         // Stop on BREAK
8564         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8565       }
8566       // Don't recompile stuff that's already compiled
8567       if(check_addr(start+i*4+4)) done=1;
8568       // Don't get too close to the limit
8569       if(i>MAXBLOCK/2) done=1;
8570     }
8571     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
8572     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
8573     if(done==2) {
8574       // Does the block continue due to a branch?
8575       for(j=i-1;j>=0;j--)
8576       {
8577         if(ba[j]==start+i*4+4) done=j=0;
8578         if(ba[j]==start+i*4+8) done=j=0;
8579       }
8580     }
8581     //assert(i<MAXBLOCK-1);
8582     if(start+i*4==pagelimit-4) done=1;
8583     assert(start+i*4<pagelimit);
8584     if (i==MAXBLOCK-1) done=1;
8585     // Stop if we're compiling junk
8586     if(itype[i]==NI&&opcode[i]==0x11) {
8587       done=stop_after_jal=1;
8588       printf("Disabled speculative precompilation\n");
8589     }
8590   }
8591   slen=i;
8592   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8593     if(start+i*4==pagelimit) {
8594       itype[i-1]=SPAN;
8595     }
8596   }
8597   assert(slen>0);
8598
8599   /* Pass 2 - Register dependencies and branch targets */
8600
8601   unneeded_registers(0,slen-1,0);
8602   
8603   /* Pass 3 - Register allocation */
8604
8605   struct regstat current; // Current register allocations/status
8606   current.is32=1;
8607   current.dirty=0;
8608   current.u=unneeded_reg[0];
8609   current.uu=unneeded_reg_upper[0];
8610   clear_all_regs(current.regmap);
8611   alloc_reg(&current,0,CCREG);
8612   dirty_reg(&current,CCREG);
8613   current.isconst=0;
8614   current.wasconst=0;
8615   int ds=0;
8616   int cc=0;
8617   int hr=-1;
8618
8619 #ifndef FORCE32
8620   provisional_32bit();
8621 #endif
8622   if((u_int)addr&1) {
8623     // First instruction is delay slot
8624     cc=-1;
8625     bt[1]=1;
8626     ds=1;
8627     unneeded_reg[0]=1;
8628     unneeded_reg_upper[0]=1;
8629     current.regmap[HOST_BTREG]=BTREG;
8630   }
8631   
8632   for(i=0;i<slen;i++)
8633   {
8634     if(bt[i])
8635     {
8636       int hr;
8637       for(hr=0;hr<HOST_REGS;hr++)
8638       {
8639         // Is this really necessary?
8640         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8641       }
8642       current.isconst=0;
8643     }
8644     if(i>1)
8645     {
8646       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8647       {
8648         if(rs1[i-2]==0||rs2[i-2]==0)
8649         {
8650           if(rs1[i-2]) {
8651             current.is32|=1LL<<rs1[i-2];
8652             int hr=get_reg(current.regmap,rs1[i-2]|64);
8653             if(hr>=0) current.regmap[hr]=-1;
8654           }
8655           if(rs2[i-2]) {
8656             current.is32|=1LL<<rs2[i-2];
8657             int hr=get_reg(current.regmap,rs2[i-2]|64);
8658             if(hr>=0) current.regmap[hr]=-1;
8659           }
8660         }
8661       }
8662     }
8663 #ifndef FORCE32
8664     // If something jumps here with 64-bit values
8665     // then promote those registers to 64 bits
8666     if(bt[i])
8667     {
8668       uint64_t temp_is32=current.is32;
8669       for(j=i-1;j>=0;j--)
8670       {
8671         if(ba[j]==start+i*4) 
8672           temp_is32&=branch_regs[j].is32;
8673       }
8674       for(j=i;j<slen;j++)
8675       {
8676         if(ba[j]==start+i*4) 
8677           //temp_is32=1;
8678           temp_is32&=p32[j];
8679       }
8680       if(temp_is32!=current.is32) {
8681         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8682         #ifdef DESTRUCTIVE_WRITEBACK
8683         for(hr=0;hr<HOST_REGS;hr++)
8684         {
8685           int r=current.regmap[hr];
8686           if(r>0&&r<64)
8687           {
8688             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8689               temp_is32|=1LL<<r;
8690               //printf("restore %d\n",r);
8691             }
8692           }
8693         }
8694         #endif
8695         current.is32=temp_is32;
8696       }
8697     }
8698 #else
8699     current.is32=-1LL;
8700 #endif
8701
8702     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8703     regs[i].wasconst=current.isconst;
8704     regs[i].was32=current.is32;
8705     regs[i].wasdirty=current.dirty;
8706     #if defined(DESTRUCTIVE_WRITEBACK) && !defined(FORCE32)
8707     // To change a dirty register from 32 to 64 bits, we must write
8708     // it out during the previous cycle (for branches, 2 cycles)
8709     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8710     {
8711       uint64_t temp_is32=current.is32;
8712       for(j=i-1;j>=0;j--)
8713       {
8714         if(ba[j]==start+i*4+4) 
8715           temp_is32&=branch_regs[j].is32;
8716       }
8717       for(j=i;j<slen;j++)
8718       {
8719         if(ba[j]==start+i*4+4) 
8720           //temp_is32=1;
8721           temp_is32&=p32[j];
8722       }
8723       if(temp_is32!=current.is32) {
8724         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8725         for(hr=0;hr<HOST_REGS;hr++)
8726         {
8727           int r=current.regmap[hr];
8728           if(r>0)
8729           {
8730             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8731               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
8732               {
8733                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
8734                 {
8735                   //printf("dump %d/r%d\n",hr,r);
8736                   current.regmap[hr]=-1;
8737                   if(get_reg(current.regmap,r|64)>=0) 
8738                     current.regmap[get_reg(current.regmap,r|64)]=-1;
8739                 }
8740               }
8741             }
8742           }
8743         }
8744       }
8745     }
8746     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
8747     {
8748       uint64_t temp_is32=current.is32;
8749       for(j=i-1;j>=0;j--)
8750       {
8751         if(ba[j]==start+i*4+8) 
8752           temp_is32&=branch_regs[j].is32;
8753       }
8754       for(j=i;j<slen;j++)
8755       {
8756         if(ba[j]==start+i*4+8) 
8757           //temp_is32=1;
8758           temp_is32&=p32[j];
8759       }
8760       if(temp_is32!=current.is32) {
8761         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8762         for(hr=0;hr<HOST_REGS;hr++)
8763         {
8764           int r=current.regmap[hr];
8765           if(r>0)
8766           {
8767             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8768               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
8769               {
8770                 //printf("dump %d/r%d\n",hr,r);
8771                 current.regmap[hr]=-1;
8772                 if(get_reg(current.regmap,r|64)>=0) 
8773                   current.regmap[get_reg(current.regmap,r|64)]=-1;
8774               }
8775             }
8776           }
8777         }
8778       }
8779     }
8780     #endif
8781     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8782       if(i+1<slen) {
8783         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8784         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8785         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8786         current.u|=1;
8787         current.uu|=1;
8788       } else {
8789         current.u=1;
8790         current.uu=1;
8791       }
8792     } else {
8793       if(i+1<slen) {
8794         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8795         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8796         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8797         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8798         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8799         current.u|=1;
8800         current.uu|=1;
8801       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
8802     }
8803     is_ds[i]=ds;
8804     if(ds) {
8805       ds=0; // Skip delay slot, already allocated as part of branch
8806       // ...but we need to alloc it in case something jumps here
8807       if(i+1<slen) {
8808         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8809         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8810       }else{
8811         current.u=branch_unneeded_reg[i-1];
8812         current.uu=branch_unneeded_reg_upper[i-1];
8813       }
8814       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8815       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8816       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8817       current.u|=1;
8818       current.uu|=1;
8819       struct regstat temp;
8820       memcpy(&temp,&current,sizeof(current));
8821       temp.wasdirty=temp.dirty;
8822       temp.was32=temp.is32;
8823       // TODO: Take into account unconditional branches, as below
8824       delayslot_alloc(&temp,i);
8825       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8826       regs[i].wasdirty=temp.wasdirty;
8827       regs[i].was32=temp.was32;
8828       regs[i].dirty=temp.dirty;
8829       regs[i].is32=temp.is32;
8830       regs[i].isconst=0;
8831       regs[i].wasconst=0;
8832       current.isconst=0;
8833       // Create entry (branch target) regmap
8834       for(hr=0;hr<HOST_REGS;hr++)
8835       {
8836         int r=temp.regmap[hr];
8837         if(r>=0) {
8838           if(r!=regmap_pre[i][hr]) {
8839             regs[i].regmap_entry[hr]=-1;
8840           }
8841           else
8842           {
8843             if(r<64){
8844               if((current.u>>r)&1) {
8845                 regs[i].regmap_entry[hr]=-1;
8846                 regs[i].regmap[hr]=-1;
8847                 //Don't clear regs in the delay slot as the branch might need them
8848                 //current.regmap[hr]=-1;
8849               }else
8850                 regs[i].regmap_entry[hr]=r;
8851             }
8852             else {
8853               if((current.uu>>(r&63))&1) {
8854                 regs[i].regmap_entry[hr]=-1;
8855                 regs[i].regmap[hr]=-1;
8856                 //Don't clear regs in the delay slot as the branch might need them
8857                 //current.regmap[hr]=-1;
8858               }else
8859                 regs[i].regmap_entry[hr]=r;
8860             }
8861           }
8862         } else {
8863           // First instruction expects CCREG to be allocated
8864           if(i==0&&hr==HOST_CCREG) 
8865             regs[i].regmap_entry[hr]=CCREG;
8866           else
8867             regs[i].regmap_entry[hr]=-1;
8868         }
8869       }
8870     }
8871     else { // Not delay slot
8872       switch(itype[i]) {
8873         case UJUMP:
8874           //current.isconst=0; // DEBUG
8875           //current.wasconst=0; // DEBUG
8876           //regs[i].wasconst=0; // DEBUG
8877           clear_const(&current,rt1[i]);
8878           alloc_cc(&current,i);
8879           dirty_reg(&current,CCREG);
8880           ooo[i]=1;
8881           delayslot_alloc(&current,i+1);
8882           if (rt1[i]==31) {
8883             alloc_reg(&current,i,31);
8884             dirty_reg(&current,31);
8885             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8886             //assert(rt1[i+1]!=rt1[i]);
8887             #ifdef REG_PREFETCH
8888             alloc_reg(&current,i,PTEMP);
8889             #endif
8890             //current.is32|=1LL<<rt1[i];
8891           }
8892           //current.isconst=0; // DEBUG
8893           ds=1;
8894           //printf("i=%d, isconst=%x\n",i,current.isconst);
8895           break;
8896         case RJUMP:
8897           //current.isconst=0;
8898           //current.wasconst=0;
8899           //regs[i].wasconst=0;
8900           clear_const(&current,rs1[i]);
8901           clear_const(&current,rt1[i]);
8902           alloc_cc(&current,i);
8903           dirty_reg(&current,CCREG);
8904           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8905             alloc_reg(&current,i,rs1[i]);
8906             if (rt1[i]!=0) {
8907               alloc_reg(&current,i,rt1[i]);
8908               dirty_reg(&current,rt1[i]);
8909               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8910               assert(rt1[i+1]!=rt1[i]);
8911               #ifdef REG_PREFETCH
8912               alloc_reg(&current,i,PTEMP);
8913               #endif
8914             }
8915             #ifdef USE_MINI_HT
8916             if(rs1[i]==31) { // JALR
8917               alloc_reg(&current,i,RHASH);
8918               #ifndef HOST_IMM_ADDR32
8919               alloc_reg(&current,i,RHTBL);
8920               #endif
8921             }
8922             #endif
8923             delayslot_alloc(&current,i+1);
8924           } else {
8925             // The delay slot overwrites our source register,
8926             // allocate a temporary register to hold the old value.
8927             current.isconst=0;
8928             current.wasconst=0;
8929             regs[i].wasconst=0;
8930             delayslot_alloc(&current,i+1);
8931             current.isconst=0;
8932             alloc_reg(&current,i,RTEMP);
8933           }
8934           //current.isconst=0; // DEBUG
8935           ooo[i]=1;
8936           ds=1;
8937           break;
8938         case CJUMP:
8939           //current.isconst=0;
8940           //current.wasconst=0;
8941           //regs[i].wasconst=0;
8942           clear_const(&current,rs1[i]);
8943           clear_const(&current,rs2[i]);
8944           if((opcode[i]&0x3E)==4) // BEQ/BNE
8945           {
8946             alloc_cc(&current,i);
8947             dirty_reg(&current,CCREG);
8948             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8949             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8950             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8951             {
8952               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8953               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8954             }
8955             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8956                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8957               // The delay slot overwrites one of our conditions.
8958               // Allocate the branch condition registers instead.
8959               current.isconst=0;
8960               current.wasconst=0;
8961               regs[i].wasconst=0;
8962               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8963               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8964               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8965               {
8966                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8967                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8968               }
8969             }
8970             else
8971             {
8972               ooo[i]=1;
8973               delayslot_alloc(&current,i+1);
8974             }
8975           }
8976           else
8977           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8978           {
8979             alloc_cc(&current,i);
8980             dirty_reg(&current,CCREG);
8981             alloc_reg(&current,i,rs1[i]);
8982             if(!(current.is32>>rs1[i]&1))
8983             {
8984               alloc_reg64(&current,i,rs1[i]);
8985             }
8986             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8987               // The delay slot overwrites one of our conditions.
8988               // Allocate the branch condition registers instead.
8989               current.isconst=0;
8990               current.wasconst=0;
8991               regs[i].wasconst=0;
8992               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8993               if(!((current.is32>>rs1[i])&1))
8994               {
8995                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8996               }
8997             }
8998             else
8999             {
9000               ooo[i]=1;
9001               delayslot_alloc(&current,i+1);
9002             }
9003           }
9004           else
9005           // Don't alloc the delay slot yet because we might not execute it
9006           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
9007           {
9008             current.isconst=0;
9009             current.wasconst=0;
9010             regs[i].wasconst=0;
9011             alloc_cc(&current,i);
9012             dirty_reg(&current,CCREG);
9013             alloc_reg(&current,i,rs1[i]);
9014             alloc_reg(&current,i,rs2[i]);
9015             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9016             {
9017               alloc_reg64(&current,i,rs1[i]);
9018               alloc_reg64(&current,i,rs2[i]);
9019             }
9020           }
9021           else
9022           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
9023           {
9024             current.isconst=0;
9025             current.wasconst=0;
9026             regs[i].wasconst=0;
9027             alloc_cc(&current,i);
9028             dirty_reg(&current,CCREG);
9029             alloc_reg(&current,i,rs1[i]);
9030             if(!(current.is32>>rs1[i]&1))
9031             {
9032               alloc_reg64(&current,i,rs1[i]);
9033             }
9034           }
9035           ds=1;
9036           //current.isconst=0;
9037           break;
9038         case SJUMP:
9039           //current.isconst=0;
9040           //current.wasconst=0;
9041           //regs[i].wasconst=0;
9042           clear_const(&current,rs1[i]);
9043           clear_const(&current,rt1[i]);
9044           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
9045           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
9046           {
9047             alloc_cc(&current,i);
9048             dirty_reg(&current,CCREG);
9049             alloc_reg(&current,i,rs1[i]);
9050             if(!(current.is32>>rs1[i]&1))
9051             {
9052               alloc_reg64(&current,i,rs1[i]);
9053             }
9054             if (rt1[i]==31) { // BLTZAL/BGEZAL
9055               alloc_reg(&current,i,31);
9056               dirty_reg(&current,31);
9057               //#ifdef REG_PREFETCH
9058               //alloc_reg(&current,i,PTEMP);
9059               //#endif
9060               //current.is32|=1LL<<rt1[i];
9061             }
9062             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
9063                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
9064               // Allocate the branch condition registers instead.
9065               current.isconst=0;
9066               current.wasconst=0;
9067               regs[i].wasconst=0;
9068               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9069               if(!((current.is32>>rs1[i])&1))
9070               {
9071                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9072               }
9073             }
9074             else
9075             {
9076               ooo[i]=1;
9077               delayslot_alloc(&current,i+1);
9078             }
9079           }
9080           else
9081           // Don't alloc the delay slot yet because we might not execute it
9082           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
9083           {
9084             current.isconst=0;
9085             current.wasconst=0;
9086             regs[i].wasconst=0;
9087             alloc_cc(&current,i);
9088             dirty_reg(&current,CCREG);
9089             alloc_reg(&current,i,rs1[i]);
9090             if(!(current.is32>>rs1[i]&1))
9091             {
9092               alloc_reg64(&current,i,rs1[i]);
9093             }
9094           }
9095           ds=1;
9096           //current.isconst=0;
9097           break;
9098         case FJUMP:
9099           current.isconst=0;
9100           current.wasconst=0;
9101           regs[i].wasconst=0;
9102           if(likely[i]==0) // BC1F/BC1T
9103           {
9104             // TODO: Theoretically we can run out of registers here on x86.
9105             // The delay slot can allocate up to six, and we need to check
9106             // CSREG before executing the delay slot.  Possibly we can drop
9107             // the cycle count and then reload it after checking that the
9108             // FPU is in a usable state, or don't do out-of-order execution.
9109             alloc_cc(&current,i);
9110             dirty_reg(&current,CCREG);
9111             alloc_reg(&current,i,FSREG);
9112             alloc_reg(&current,i,CSREG);
9113             if(itype[i+1]==FCOMP) {
9114               // The delay slot overwrites the branch condition.
9115               // Allocate the branch condition registers instead.
9116               alloc_cc(&current,i);
9117               dirty_reg(&current,CCREG);
9118               alloc_reg(&current,i,CSREG);
9119               alloc_reg(&current,i,FSREG);
9120             }
9121             else {
9122               ooo[i]=1;
9123               delayslot_alloc(&current,i+1);
9124               alloc_reg(&current,i+1,CSREG);
9125             }
9126           }
9127           else
9128           // Don't alloc the delay slot yet because we might not execute it
9129           if(likely[i]) // BC1FL/BC1TL
9130           {
9131             alloc_cc(&current,i);
9132             dirty_reg(&current,CCREG);
9133             alloc_reg(&current,i,CSREG);
9134             alloc_reg(&current,i,FSREG);
9135           }
9136           ds=1;
9137           current.isconst=0;
9138           break;
9139         case IMM16:
9140           imm16_alloc(&current,i);
9141           break;
9142         case LOAD:
9143         case LOADLR:
9144           load_alloc(&current,i);
9145           break;
9146         case STORE:
9147         case STORELR:
9148           store_alloc(&current,i);
9149           break;
9150         case ALU:
9151           alu_alloc(&current,i);
9152           break;
9153         case SHIFT:
9154           shift_alloc(&current,i);
9155           break;
9156         case MULTDIV:
9157           multdiv_alloc(&current,i);
9158           break;
9159         case SHIFTIMM:
9160           shiftimm_alloc(&current,i);
9161           break;
9162         case MOV:
9163           mov_alloc(&current,i);
9164           break;
9165         case COP0:
9166           cop0_alloc(&current,i);
9167           break;
9168         case COP1:
9169         case COP2:
9170           cop1_alloc(&current,i);
9171           break;
9172         case C1LS:
9173           c1ls_alloc(&current,i);
9174           break;
9175         case C2LS:
9176           c2ls_alloc(&current,i);
9177           break;
9178         case C2OP:
9179           c2op_alloc(&current,i);
9180           break;
9181         case FCONV:
9182           fconv_alloc(&current,i);
9183           break;
9184         case FLOAT:
9185           float_alloc(&current,i);
9186           break;
9187         case FCOMP:
9188           fcomp_alloc(&current,i);
9189           break;
9190         case SYSCALL:
9191         case HLECALL:
9192         case INTCALL:
9193           syscall_alloc(&current,i);
9194           break;
9195         case SPAN:
9196           pagespan_alloc(&current,i);
9197           break;
9198       }
9199       
9200       // Drop the upper half of registers that have become 32-bit
9201       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
9202       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9203         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9204         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9205         current.uu|=1;
9206       } else {
9207         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
9208         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9209         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9210         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9211         current.uu|=1;
9212       }
9213
9214       // Create entry (branch target) regmap
9215       for(hr=0;hr<HOST_REGS;hr++)
9216       {
9217         int r,or,er;
9218         r=current.regmap[hr];
9219         if(r>=0) {
9220           if(r!=regmap_pre[i][hr]) {
9221             // TODO: delay slot (?)
9222             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
9223             if(or<0||(r&63)>=TEMPREG){
9224               regs[i].regmap_entry[hr]=-1;
9225             }
9226             else
9227             {
9228               // Just move it to a different register
9229               regs[i].regmap_entry[hr]=r;
9230               // If it was dirty before, it's still dirty
9231               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
9232             }
9233           }
9234           else
9235           {
9236             // Unneeded
9237             if(r==0){
9238               regs[i].regmap_entry[hr]=0;
9239             }
9240             else
9241             if(r<64){
9242               if((current.u>>r)&1) {
9243                 regs[i].regmap_entry[hr]=-1;
9244                 //regs[i].regmap[hr]=-1;
9245                 current.regmap[hr]=-1;
9246               }else
9247                 regs[i].regmap_entry[hr]=r;
9248             }
9249             else {
9250               if((current.uu>>(r&63))&1) {
9251                 regs[i].regmap_entry[hr]=-1;
9252                 //regs[i].regmap[hr]=-1;
9253                 current.regmap[hr]=-1;
9254               }else
9255                 regs[i].regmap_entry[hr]=r;
9256             }
9257           }
9258         } else {
9259           // Branches expect CCREG to be allocated at the target
9260           if(regmap_pre[i][hr]==CCREG) 
9261             regs[i].regmap_entry[hr]=CCREG;
9262           else
9263             regs[i].regmap_entry[hr]=-1;
9264         }
9265       }
9266       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
9267     }
9268     /* Branch post-alloc */
9269     if(i>0)
9270     {
9271       current.was32=current.is32;
9272       current.wasdirty=current.dirty;
9273       switch(itype[i-1]) {
9274         case UJUMP:
9275           memcpy(&branch_regs[i-1],&current,sizeof(current));
9276           branch_regs[i-1].isconst=0;
9277           branch_regs[i-1].wasconst=0;
9278           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9279           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9280           alloc_cc(&branch_regs[i-1],i-1);
9281           dirty_reg(&branch_regs[i-1],CCREG);
9282           if(rt1[i-1]==31) { // JAL
9283             alloc_reg(&branch_regs[i-1],i-1,31);
9284             dirty_reg(&branch_regs[i-1],31);
9285             branch_regs[i-1].is32|=1LL<<31;
9286           }
9287           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9288           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9289           break;
9290         case RJUMP:
9291           memcpy(&branch_regs[i-1],&current,sizeof(current));
9292           branch_regs[i-1].isconst=0;
9293           branch_regs[i-1].wasconst=0;
9294           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9295           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9296           alloc_cc(&branch_regs[i-1],i-1);
9297           dirty_reg(&branch_regs[i-1],CCREG);
9298           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
9299           if(rt1[i-1]!=0) { // JALR
9300             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
9301             dirty_reg(&branch_regs[i-1],rt1[i-1]);
9302             branch_regs[i-1].is32|=1LL<<rt1[i-1];
9303           }
9304           #ifdef USE_MINI_HT
9305           if(rs1[i-1]==31) { // JALR
9306             alloc_reg(&branch_regs[i-1],i-1,RHASH);
9307             #ifndef HOST_IMM_ADDR32
9308             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
9309             #endif
9310           }
9311           #endif
9312           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9313           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9314           break;
9315         case CJUMP:
9316           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
9317           {
9318             alloc_cc(&current,i-1);
9319             dirty_reg(&current,CCREG);
9320             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
9321                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
9322               // The delay slot overwrote one of our conditions
9323               // Delay slot goes after the test (in order)
9324               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9325               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9326               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9327               current.u|=1;
9328               current.uu|=1;
9329               delayslot_alloc(&current,i);
9330               current.isconst=0;
9331             }
9332             else
9333             {
9334               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9335               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9336               // Alloc the branch condition registers
9337               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
9338               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
9339               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
9340               {
9341                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
9342                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
9343               }
9344             }
9345             memcpy(&branch_regs[i-1],&current,sizeof(current));
9346             branch_regs[i-1].isconst=0;
9347             branch_regs[i-1].wasconst=0;
9348             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9349             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9350           }
9351           else
9352           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
9353           {
9354             alloc_cc(&current,i-1);
9355             dirty_reg(&current,CCREG);
9356             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9357               // The delay slot overwrote the branch condition
9358               // Delay slot goes after the test (in order)
9359               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9360               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9361               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9362               current.u|=1;
9363               current.uu|=1;
9364               delayslot_alloc(&current,i);
9365               current.isconst=0;
9366             }
9367             else
9368             {
9369               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9370               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9371               // Alloc the branch condition register
9372               alloc_reg(&current,i-1,rs1[i-1]);
9373               if(!(current.is32>>rs1[i-1]&1))
9374               {
9375                 alloc_reg64(&current,i-1,rs1[i-1]);
9376               }
9377             }
9378             memcpy(&branch_regs[i-1],&current,sizeof(current));
9379             branch_regs[i-1].isconst=0;
9380             branch_regs[i-1].wasconst=0;
9381             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9382             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9383           }
9384           else
9385           // Alloc the delay slot in case the branch is taken
9386           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
9387           {
9388             memcpy(&branch_regs[i-1],&current,sizeof(current));
9389             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9390             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9391             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9392             alloc_cc(&branch_regs[i-1],i);
9393             dirty_reg(&branch_regs[i-1],CCREG);
9394             delayslot_alloc(&branch_regs[i-1],i);
9395             branch_regs[i-1].isconst=0;
9396             alloc_reg(&current,i,CCREG); // Not taken path
9397             dirty_reg(&current,CCREG);
9398             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9399           }
9400           else
9401           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
9402           {
9403             memcpy(&branch_regs[i-1],&current,sizeof(current));
9404             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9405             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9406             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9407             alloc_cc(&branch_regs[i-1],i);
9408             dirty_reg(&branch_regs[i-1],CCREG);
9409             delayslot_alloc(&branch_regs[i-1],i);
9410             branch_regs[i-1].isconst=0;
9411             alloc_reg(&current,i,CCREG); // Not taken path
9412             dirty_reg(&current,CCREG);
9413             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9414           }
9415           break;
9416         case SJUMP:
9417           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
9418           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
9419           {
9420             alloc_cc(&current,i-1);
9421             dirty_reg(&current,CCREG);
9422             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9423               // The delay slot overwrote the branch condition
9424               // Delay slot goes after the test (in order)
9425               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9426               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9427               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9428               current.u|=1;
9429               current.uu|=1;
9430               delayslot_alloc(&current,i);
9431               current.isconst=0;
9432             }
9433             else
9434             {
9435               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9436               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9437               // Alloc the branch condition register
9438               alloc_reg(&current,i-1,rs1[i-1]);
9439               if(!(current.is32>>rs1[i-1]&1))
9440               {
9441                 alloc_reg64(&current,i-1,rs1[i-1]);
9442               }
9443             }
9444             memcpy(&branch_regs[i-1],&current,sizeof(current));
9445             branch_regs[i-1].isconst=0;
9446             branch_regs[i-1].wasconst=0;
9447             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9448             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9449           }
9450           else
9451           // Alloc the delay slot in case the branch is taken
9452           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
9453           {
9454             memcpy(&branch_regs[i-1],&current,sizeof(current));
9455             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9456             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9457             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9458             alloc_cc(&branch_regs[i-1],i);
9459             dirty_reg(&branch_regs[i-1],CCREG);
9460             delayslot_alloc(&branch_regs[i-1],i);
9461             branch_regs[i-1].isconst=0;
9462             alloc_reg(&current,i,CCREG); // Not taken path
9463             dirty_reg(&current,CCREG);
9464             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9465           }
9466           // FIXME: BLTZAL/BGEZAL
9467           if(opcode2[i-1]&0x10) { // BxxZAL
9468             alloc_reg(&branch_regs[i-1],i-1,31);
9469             dirty_reg(&branch_regs[i-1],31);
9470             branch_regs[i-1].is32|=1LL<<31;
9471           }
9472           break;
9473         case FJUMP:
9474           if(likely[i-1]==0) // BC1F/BC1T
9475           {
9476             alloc_cc(&current,i-1);
9477             dirty_reg(&current,CCREG);
9478             if(itype[i]==FCOMP) {
9479               // The delay slot overwrote the branch condition
9480               // Delay slot goes after the test (in order)
9481               delayslot_alloc(&current,i);
9482               current.isconst=0;
9483             }
9484             else
9485             {
9486               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9487               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9488               // Alloc the branch condition register
9489               alloc_reg(&current,i-1,FSREG);
9490             }
9491             memcpy(&branch_regs[i-1],&current,sizeof(current));
9492             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9493           }
9494           else // BC1FL/BC1TL
9495           {
9496             // Alloc the delay slot in case the branch is taken
9497             memcpy(&branch_regs[i-1],&current,sizeof(current));
9498             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9499             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9500             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9501             alloc_cc(&branch_regs[i-1],i);
9502             dirty_reg(&branch_regs[i-1],CCREG);
9503             delayslot_alloc(&branch_regs[i-1],i);
9504             branch_regs[i-1].isconst=0;
9505             alloc_reg(&current,i,CCREG); // Not taken path
9506             dirty_reg(&current,CCREG);
9507             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9508           }
9509           break;
9510       }
9511
9512       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
9513       {
9514         if(rt1[i-1]==31) // JAL/JALR
9515         {
9516           // Subroutine call will return here, don't alloc any registers
9517           current.is32=1;
9518           current.dirty=0;
9519           clear_all_regs(current.regmap);
9520           alloc_reg(&current,i,CCREG);
9521           dirty_reg(&current,CCREG);
9522         }
9523         else if(i+1<slen)
9524         {
9525           // Internal branch will jump here, match registers to caller
9526           current.is32=0x3FFFFFFFFLL;
9527           current.dirty=0;
9528           clear_all_regs(current.regmap);
9529           alloc_reg(&current,i,CCREG);
9530           dirty_reg(&current,CCREG);
9531           for(j=i-1;j>=0;j--)
9532           {
9533             if(ba[j]==start+i*4+4) {
9534               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9535               current.is32=branch_regs[j].is32;
9536               current.dirty=branch_regs[j].dirty;
9537               break;
9538             }
9539           }
9540           while(j>=0) {
9541             if(ba[j]==start+i*4+4) {
9542               for(hr=0;hr<HOST_REGS;hr++) {
9543                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9544                   current.regmap[hr]=-1;
9545                 }
9546                 current.is32&=branch_regs[j].is32;
9547                 current.dirty&=branch_regs[j].dirty;
9548               }
9549             }
9550             j--;
9551           }
9552         }
9553       }
9554     }
9555
9556     // Count cycles in between branches
9557     ccadj[i]=cc;
9558     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
9559     {
9560       cc=0;
9561     }
9562 #ifdef PCSX
9563     else if(/*itype[i]==LOAD||*/itype[i]==STORE||itype[i]==C1LS) // load causes weird timing issues
9564     {
9565       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
9566     }
9567     else if(itype[i]==C2LS)
9568     {
9569       cc+=4;
9570     }
9571 #endif
9572     else
9573     {
9574       cc++;
9575     }
9576
9577     flush_dirty_uppers(&current);
9578     if(!is_ds[i]) {
9579       regs[i].is32=current.is32;
9580       regs[i].dirty=current.dirty;
9581       regs[i].isconst=current.isconst;
9582       memcpy(constmap[i],current.constmap,sizeof(current.constmap));
9583     }
9584     for(hr=0;hr<HOST_REGS;hr++) {
9585       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9586         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9587           regs[i].wasconst&=~(1<<hr);
9588         }
9589       }
9590     }
9591     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9592   }
9593   
9594   /* Pass 4 - Cull unused host registers */
9595   
9596   uint64_t nr=0;
9597   
9598   for (i=slen-1;i>=0;i--)
9599   {
9600     int hr;
9601     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9602     {
9603       if(ba[i]<start || ba[i]>=(start+slen*4))
9604       {
9605         // Branch out of this block, don't need anything
9606         nr=0;
9607       }
9608       else
9609       {
9610         // Internal branch
9611         // Need whatever matches the target
9612         nr=0;
9613         int t=(ba[i]-start)>>2;
9614         for(hr=0;hr<HOST_REGS;hr++)
9615         {
9616           if(regs[i].regmap_entry[hr]>=0) {
9617             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9618           }
9619         }
9620       }
9621       // Conditional branch may need registers for following instructions
9622       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9623       {
9624         if(i<slen-2) {
9625           nr|=needed_reg[i+2];
9626           for(hr=0;hr<HOST_REGS;hr++)
9627           {
9628             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9629             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9630           }
9631         }
9632       }
9633       // Don't need stuff which is overwritten
9634       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9635       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9636       // Merge in delay slot
9637       for(hr=0;hr<HOST_REGS;hr++)
9638       {
9639         if(!likely[i]) {
9640           // These are overwritten unless the branch is "likely"
9641           // and the delay slot is nullified if not taken
9642           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9643           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9644         }
9645         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9646         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9647         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9648         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9649         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9650         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9651         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9652         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9653         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9654           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9655           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9656         }
9657         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9658           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9659           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9660         }
9661         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
9662           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9663           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9664         }
9665       }
9666     }
9667     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
9668     {
9669       // SYSCALL instruction (software interrupt)
9670       nr=0;
9671     }
9672     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9673     {
9674       // ERET instruction (return from interrupt)
9675       nr=0;
9676     }
9677     else // Non-branch
9678     {
9679       if(i<slen-1) {
9680         for(hr=0;hr<HOST_REGS;hr++) {
9681           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9682           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9683           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9684           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9685         }
9686       }
9687     }
9688     for(hr=0;hr<HOST_REGS;hr++)
9689     {
9690       // Overwritten registers are not needed
9691       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9692       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9693       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9694       // Source registers are needed
9695       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9696       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9697       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9698       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9699       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9700       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9701       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9702       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9703       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9704         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9705         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9706       }
9707       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9708         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9709         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9710       }
9711       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
9712         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9713         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9714       }
9715       // Don't store a register immediately after writing it,
9716       // may prevent dual-issue.
9717       // But do so if this is a branch target, otherwise we
9718       // might have to load the register before the branch.
9719       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9720         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9721            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9722           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9723           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9724         }
9725         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9726            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9727           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9728           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9729         }
9730       }
9731     }
9732     // Cycle count is needed at branches.  Assume it is needed at the target too.
9733     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9734       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9735       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9736     }
9737     // Save it
9738     needed_reg[i]=nr;
9739     
9740     // Deallocate unneeded registers
9741     for(hr=0;hr<HOST_REGS;hr++)
9742     {
9743       if(!((nr>>hr)&1)) {
9744         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9745         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9746            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9747            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9748         {
9749           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9750           {
9751             if(likely[i]) {
9752               regs[i].regmap[hr]=-1;
9753               regs[i].isconst&=~(1<<hr);
9754               if(i<slen-2) regmap_pre[i+2][hr]=-1;
9755             }
9756           }
9757         }
9758         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9759         {
9760           int d1=0,d2=0,map=0,temp=0;
9761           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9762           {
9763             d1=dep1[i+1];
9764             d2=dep2[i+1];
9765           }
9766           if(using_tlb) {
9767             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
9768                itype[i+1]==STORE || itype[i+1]==STORELR ||
9769                itype[i+1]==C1LS || itype[i+1]==C2LS)
9770             map=TLREG;
9771           } else
9772           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
9773              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9774             map=INVCP;
9775           }
9776           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9777              itype[i+1]==C1LS || itype[i+1]==C2LS)
9778             temp=FTEMP;
9779           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9780              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9781              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9782              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9783              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9784              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9785              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9786              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9787              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9788              regs[i].regmap[hr]!=map )
9789           {
9790             regs[i].regmap[hr]=-1;
9791             regs[i].isconst&=~(1<<hr);
9792             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9793                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9794                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9795                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9796                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9797                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9798                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9799                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9800                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9801                branch_regs[i].regmap[hr]!=map)
9802             {
9803               branch_regs[i].regmap[hr]=-1;
9804               branch_regs[i].regmap_entry[hr]=-1;
9805               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9806               {
9807                 if(!likely[i]&&i<slen-2) {
9808                   regmap_pre[i+2][hr]=-1;
9809                 }
9810               }
9811             }
9812           }
9813         }
9814         else
9815         {
9816           // Non-branch
9817           if(i>0)
9818           {
9819             int d1=0,d2=0,map=-1,temp=-1;
9820             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9821             {
9822               d1=dep1[i];
9823               d2=dep2[i];
9824             }
9825             if(using_tlb) {
9826               if(itype[i]==LOAD || itype[i]==LOADLR ||
9827                  itype[i]==STORE || itype[i]==STORELR ||
9828                  itype[i]==C1LS || itype[i]==C2LS)
9829               map=TLREG;
9830             } else if(itype[i]==STORE || itype[i]==STORELR ||
9831                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9832               map=INVCP;
9833             }
9834             if(itype[i]==LOADLR || itype[i]==STORELR ||
9835                itype[i]==C1LS || itype[i]==C2LS)
9836               temp=FTEMP;
9837             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9838                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9839                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9840                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9841                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9842                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9843             {
9844               if(i<slen-1&&!is_ds[i]) {
9845                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9846                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9847                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9848                 {
9849                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9850                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9851                 }
9852                 regmap_pre[i+1][hr]=-1;
9853                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9854               }
9855               regs[i].regmap[hr]=-1;
9856               regs[i].isconst&=~(1<<hr);
9857             }
9858           }
9859         }
9860       }
9861     }
9862   }
9863   
9864   /* Pass 5 - Pre-allocate registers */
9865   
9866   // If a register is allocated during a loop, try to allocate it for the
9867   // entire loop, if possible.  This avoids loading/storing registers
9868   // inside of the loop.
9869
9870   signed char f_regmap[HOST_REGS];
9871   clear_all_regs(f_regmap);
9872   for(i=0;i<slen-1;i++)
9873   {
9874     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9875     {
9876       if(ba[i]>=start && ba[i]<(start+i*4)) 
9877       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9878       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9879       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9880       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9881       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9882       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9883       {
9884         int t=(ba[i]-start)>>2;
9885         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9886         if(t<2||(itype[t-2]!=UJUMP)) // call/ret assumes no registers allocated
9887         for(hr=0;hr<HOST_REGS;hr++)
9888         {
9889           if(regs[i].regmap[hr]>64) {
9890             if(!((regs[i].dirty>>hr)&1))
9891               f_regmap[hr]=regs[i].regmap[hr];
9892             else f_regmap[hr]=-1;
9893           }
9894           else if(regs[i].regmap[hr]>=0) {
9895             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9896               // dealloc old register
9897               int n;
9898               for(n=0;n<HOST_REGS;n++)
9899               {
9900                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9901               }
9902               // and alloc new one
9903               f_regmap[hr]=regs[i].regmap[hr];
9904             }
9905           }
9906           if(branch_regs[i].regmap[hr]>64) {
9907             if(!((branch_regs[i].dirty>>hr)&1))
9908               f_regmap[hr]=branch_regs[i].regmap[hr];
9909             else f_regmap[hr]=-1;
9910           }
9911           else if(branch_regs[i].regmap[hr]>=0) {
9912             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9913               // dealloc old register
9914               int n;
9915               for(n=0;n<HOST_REGS;n++)
9916               {
9917                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9918               }
9919               // and alloc new one
9920               f_regmap[hr]=branch_regs[i].regmap[hr];
9921             }
9922           }
9923           if(ooo[i]) {
9924             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) 
9925               f_regmap[hr]=branch_regs[i].regmap[hr];
9926           }else{
9927             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) 
9928               f_regmap[hr]=branch_regs[i].regmap[hr];
9929           }
9930           // Avoid dirty->clean transition
9931           #ifdef DESTRUCTIVE_WRITEBACK
9932           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9933           #endif
9934           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9935           // case above, however it's always a good idea.  We can't hoist the
9936           // load if the register was already allocated, so there's no point
9937           // wasting time analyzing most of these cases.  It only "succeeds"
9938           // when the mapping was different and the load can be replaced with
9939           // a mov, which is of negligible benefit.  So such cases are
9940           // skipped below.
9941           if(f_regmap[hr]>0) {
9942             if(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0) {
9943               int r=f_regmap[hr];
9944               for(j=t;j<=i;j++)
9945               {
9946                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9947                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9948                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9949                 if(r>63) {
9950                   // NB This can exclude the case where the upper-half
9951                   // register is lower numbered than the lower-half
9952                   // register.  Not sure if it's worth fixing...
9953                   if(get_reg(regs[j].regmap,r&63)<0) break;
9954                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9955                   if(regs[j].is32&(1LL<<(r&63))) break;
9956                 }
9957                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9958                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9959                   int k;
9960                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9961                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9962                     if(r>63) {
9963                       if(get_reg(regs[i].regmap,r&63)<0) break;
9964                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9965                     }
9966                     k=i;
9967                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9968                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9969                         //printf("no free regs for store %x\n",start+(k-1)*4);
9970                         break;
9971                       }
9972                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9973                         //printf("no-match due to different register\n");
9974                         break;
9975                       }
9976                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9977                         //printf("no-match due to branch\n");
9978                         break;
9979                       }
9980                       // call/ret fast path assumes no registers allocated
9981                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)) {
9982                         break;
9983                       }
9984                       if(r>63) {
9985                         // NB This can exclude the case where the upper-half
9986                         // register is lower numbered than the lower-half
9987                         // register.  Not sure if it's worth fixing...
9988                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9989                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9990                       }
9991                       k--;
9992                     }
9993                     if(i<slen-1) {
9994                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9995                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9996                         //printf("bad match after branch\n");
9997                         break;
9998                       }
9999                     }
10000                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
10001                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
10002                       while(k<i) {
10003                         regs[k].regmap_entry[hr]=f_regmap[hr];
10004                         regs[k].regmap[hr]=f_regmap[hr];
10005                         regmap_pre[k+1][hr]=f_regmap[hr];
10006                         regs[k].wasdirty&=~(1<<hr);
10007                         regs[k].dirty&=~(1<<hr);
10008                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
10009                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
10010                         regs[k].wasconst&=~(1<<hr);
10011                         regs[k].isconst&=~(1<<hr);
10012                         k++;
10013                       }
10014                     }
10015                     else {
10016                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
10017                       break;
10018                     }
10019                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
10020                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
10021                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
10022                       regs[i].regmap_entry[hr]=f_regmap[hr];
10023                       regs[i].regmap[hr]=f_regmap[hr];
10024                       regs[i].wasdirty&=~(1<<hr);
10025                       regs[i].dirty&=~(1<<hr);
10026                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
10027                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
10028                       regs[i].wasconst&=~(1<<hr);
10029                       regs[i].isconst&=~(1<<hr);
10030                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
10031                       branch_regs[i].wasdirty&=~(1<<hr);
10032                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
10033                       branch_regs[i].regmap[hr]=f_regmap[hr];
10034                       branch_regs[i].dirty&=~(1<<hr);
10035                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
10036                       branch_regs[i].wasconst&=~(1<<hr);
10037                       branch_regs[i].isconst&=~(1<<hr);
10038                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
10039                         regmap_pre[i+2][hr]=f_regmap[hr];
10040                         regs[i+2].wasdirty&=~(1<<hr);
10041                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
10042                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
10043                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
10044                       }
10045                     }
10046                   }
10047                   for(k=t;k<j;k++) {
10048                     // Alloc register clean at beginning of loop,
10049                     // but may dirty it in pass 6
10050                     regs[k].regmap_entry[hr]=f_regmap[hr];
10051                     regs[k].regmap[hr]=f_regmap[hr];
10052                     regs[k].dirty&=~(1<<hr);
10053                     regs[k].wasconst&=~(1<<hr);
10054                     regs[k].isconst&=~(1<<hr);
10055                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
10056                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
10057                       branch_regs[k].regmap[hr]=f_regmap[hr];
10058                       branch_regs[k].dirty&=~(1<<hr);
10059                       branch_regs[k].wasconst&=~(1<<hr);
10060                       branch_regs[k].isconst&=~(1<<hr);
10061                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
10062                         regmap_pre[k+2][hr]=f_regmap[hr];
10063                         regs[k+2].wasdirty&=~(1<<hr);
10064                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
10065                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
10066                       }
10067                     }
10068                     else
10069                     {
10070                       regmap_pre[k+1][hr]=f_regmap[hr];
10071                       regs[k+1].wasdirty&=~(1<<hr);
10072                     }
10073                   }
10074                   if(regs[j].regmap[hr]==f_regmap[hr])
10075                     regs[j].regmap_entry[hr]=f_regmap[hr];
10076                   break;
10077                 }
10078                 if(j==i) break;
10079                 if(regs[j].regmap[hr]>=0)
10080                   break;
10081                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
10082                   //printf("no-match due to different register\n");
10083                   break;
10084                 }
10085                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
10086                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
10087                   break;
10088                 }
10089                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10090                 {
10091                   // Stop on unconditional branch
10092                   break;
10093                 }
10094                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
10095                 {
10096                   if(ooo[j]) {
10097                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) 
10098                       break;
10099                   }else{
10100                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) 
10101                       break;
10102                   }
10103                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
10104                     //printf("no-match due to different register (branch)\n");
10105                     break;
10106                   }
10107                 }
10108                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10109                   //printf("No free regs for store %x\n",start+j*4);
10110                   break;
10111                 }
10112                 if(f_regmap[hr]>=64) {
10113                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
10114                     break;
10115                   }
10116                   else
10117                   {
10118                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
10119                       break;
10120                     }
10121                   }
10122                 }
10123               }
10124             }
10125           }
10126         }
10127       }
10128     }else{
10129       int count=0;
10130       for(hr=0;hr<HOST_REGS;hr++)
10131       {
10132         if(hr!=EXCLUDE_REG) {
10133           if(regs[i].regmap[hr]>64) {
10134             if(!((regs[i].dirty>>hr)&1))
10135               f_regmap[hr]=regs[i].regmap[hr];
10136           }
10137           else if(regs[i].regmap[hr]>=0) {
10138             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10139               // dealloc old register
10140               int n;
10141               for(n=0;n<HOST_REGS;n++)
10142               {
10143                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10144               }
10145               // and alloc new one
10146               f_regmap[hr]=regs[i].regmap[hr];
10147             }
10148           }
10149           else if(regs[i].regmap[hr]<0) count++;
10150         }
10151       }
10152       // Try to restore cycle count at branch targets
10153       if(bt[i]) {
10154         for(j=i;j<slen-1;j++) {
10155           if(regs[j].regmap[HOST_CCREG]!=-1) break;
10156           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10157             //printf("no free regs for store %x\n",start+j*4);
10158             break;
10159           }
10160         }
10161         if(regs[j].regmap[HOST_CCREG]==CCREG) {
10162           int k=i;
10163           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
10164           while(k<j) {
10165             regs[k].regmap_entry[HOST_CCREG]=CCREG;
10166             regs[k].regmap[HOST_CCREG]=CCREG;
10167             regmap_pre[k+1][HOST_CCREG]=CCREG;
10168             regs[k+1].wasdirty|=1<<HOST_CCREG;
10169             regs[k].dirty|=1<<HOST_CCREG;
10170             regs[k].wasconst&=~(1<<HOST_CCREG);
10171             regs[k].isconst&=~(1<<HOST_CCREG);
10172             k++;
10173           }
10174           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
10175         }
10176         // Work backwards from the branch target
10177         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
10178         {
10179           //printf("Extend backwards\n");
10180           int k;
10181           k=i;
10182           while(regs[k-1].regmap[HOST_CCREG]==-1) {
10183             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10184               //printf("no free regs for store %x\n",start+(k-1)*4);
10185               break;
10186             }
10187             k--;
10188           }
10189           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
10190             //printf("Extend CC, %x ->\n",start+k*4);
10191             while(k<=i) {
10192               regs[k].regmap_entry[HOST_CCREG]=CCREG;
10193               regs[k].regmap[HOST_CCREG]=CCREG;
10194               regmap_pre[k+1][HOST_CCREG]=CCREG;
10195               regs[k+1].wasdirty|=1<<HOST_CCREG;
10196               regs[k].dirty|=1<<HOST_CCREG;
10197               regs[k].wasconst&=~(1<<HOST_CCREG);
10198               regs[k].isconst&=~(1<<HOST_CCREG);
10199               k++;
10200             }
10201           }
10202           else {
10203             //printf("Fail Extend CC, %x ->\n",start+k*4);
10204           }
10205         }
10206       }
10207       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
10208          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
10209          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
10210          itype[i]!=FCONV&&itype[i]!=FCOMP)
10211       {
10212         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
10213       }
10214     }
10215   }
10216   
10217   // This allocates registers (if possible) one instruction prior
10218   // to use, which can avoid a load-use penalty on certain CPUs.
10219   for(i=0;i<slen-1;i++)
10220   {
10221     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
10222     {
10223       if(!bt[i+1])
10224       {
10225         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
10226            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
10227         {
10228           if(rs1[i+1]) {
10229             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
10230             {
10231               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10232               {
10233                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10234                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10235                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10236                 regs[i].isconst&=~(1<<hr);
10237                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10238                 constmap[i][hr]=constmap[i+1][hr];
10239                 regs[i+1].wasdirty&=~(1<<hr);
10240                 regs[i].dirty&=~(1<<hr);
10241               }
10242             }
10243           }
10244           if(rs2[i+1]) {
10245             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
10246             {
10247               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10248               {
10249                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10250                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10251                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10252                 regs[i].isconst&=~(1<<hr);
10253                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10254                 constmap[i][hr]=constmap[i+1][hr];
10255                 regs[i+1].wasdirty&=~(1<<hr);
10256                 regs[i].dirty&=~(1<<hr);
10257               }
10258             }
10259           }
10260           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10261             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10262             {
10263               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10264               {
10265                 regs[i].regmap[hr]=rs1[i+1];
10266                 regmap_pre[i+1][hr]=rs1[i+1];
10267                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10268                 regs[i].isconst&=~(1<<hr);
10269                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10270                 constmap[i][hr]=constmap[i+1][hr];
10271                 regs[i+1].wasdirty&=~(1<<hr);
10272                 regs[i].dirty&=~(1<<hr);
10273               }
10274             }
10275           }
10276           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10277             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10278             {
10279               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10280               {
10281                 regs[i].regmap[hr]=rs1[i+1];
10282                 regmap_pre[i+1][hr]=rs1[i+1];
10283                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10284                 regs[i].isconst&=~(1<<hr);
10285                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10286                 constmap[i][hr]=constmap[i+1][hr];
10287                 regs[i+1].wasdirty&=~(1<<hr);
10288                 regs[i].dirty&=~(1<<hr);
10289               }
10290             }
10291           }
10292           #ifndef HOST_IMM_ADDR32
10293           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
10294             hr=get_reg(regs[i+1].regmap,TLREG);
10295             if(hr>=0) {
10296               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
10297               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
10298                 int nr;
10299                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10300                 {
10301                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
10302                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
10303                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
10304                   regs[i].isconst&=~(1<<hr);
10305                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10306                   constmap[i][hr]=constmap[i+1][hr];
10307                   regs[i+1].wasdirty&=~(1<<hr);
10308                   regs[i].dirty&=~(1<<hr);
10309                 }
10310                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10311                 {
10312                   // move it to another register
10313                   regs[i+1].regmap[hr]=-1;
10314                   regmap_pre[i+2][hr]=-1;
10315                   regs[i+1].regmap[nr]=TLREG;
10316                   regmap_pre[i+2][nr]=TLREG;
10317                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
10318                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
10319                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
10320                   regs[i].isconst&=~(1<<nr);
10321                   regs[i+1].isconst&=~(1<<nr);
10322                   regs[i].dirty&=~(1<<nr);
10323                   regs[i+1].wasdirty&=~(1<<nr);
10324                   regs[i+1].dirty&=~(1<<nr);
10325                   regs[i+2].wasdirty&=~(1<<nr);
10326                 }
10327               }
10328             }
10329           }
10330           #endif
10331           if(itype[i+1]==STORE||itype[i+1]==STORELR
10332              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
10333             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10334               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
10335               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10336               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
10337               assert(hr>=0);
10338               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10339               {
10340                 regs[i].regmap[hr]=rs1[i+1];
10341                 regmap_pre[i+1][hr]=rs1[i+1];
10342                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10343                 regs[i].isconst&=~(1<<hr);
10344                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10345                 constmap[i][hr]=constmap[i+1][hr];
10346                 regs[i+1].wasdirty&=~(1<<hr);
10347                 regs[i].dirty&=~(1<<hr);
10348               }
10349             }
10350           }
10351           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
10352             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10353               int nr;
10354               hr=get_reg(regs[i+1].regmap,FTEMP);
10355               assert(hr>=0);
10356               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10357               {
10358                 regs[i].regmap[hr]=rs1[i+1];
10359                 regmap_pre[i+1][hr]=rs1[i+1];
10360                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10361                 regs[i].isconst&=~(1<<hr);
10362                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10363                 constmap[i][hr]=constmap[i+1][hr];
10364                 regs[i+1].wasdirty&=~(1<<hr);
10365                 regs[i].dirty&=~(1<<hr);
10366               }
10367               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10368               {
10369                 // move it to another register
10370                 regs[i+1].regmap[hr]=-1;
10371                 regmap_pre[i+2][hr]=-1;
10372                 regs[i+1].regmap[nr]=FTEMP;
10373                 regmap_pre[i+2][nr]=FTEMP;
10374                 regs[i].regmap[nr]=rs1[i+1];
10375                 regmap_pre[i+1][nr]=rs1[i+1];
10376                 regs[i+1].regmap_entry[nr]=rs1[i+1];
10377                 regs[i].isconst&=~(1<<nr);
10378                 regs[i+1].isconst&=~(1<<nr);
10379                 regs[i].dirty&=~(1<<nr);
10380                 regs[i+1].wasdirty&=~(1<<nr);
10381                 regs[i+1].dirty&=~(1<<nr);
10382                 regs[i+2].wasdirty&=~(1<<nr);
10383               }
10384             }
10385           }
10386           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||||itype[i+1]==C2LS*/) {
10387             if(itype[i+1]==LOAD) 
10388               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
10389             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
10390               hr=get_reg(regs[i+1].regmap,FTEMP);
10391             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
10392               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
10393               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10394             }
10395             if(hr>=0&&regs[i].regmap[hr]<0) {
10396               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
10397               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
10398                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
10399                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
10400                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
10401                 regs[i].isconst&=~(1<<hr);
10402                 regs[i+1].wasdirty&=~(1<<hr);
10403                 regs[i].dirty&=~(1<<hr);
10404               }
10405             }
10406           }
10407         }
10408       }
10409     }
10410   }
10411   
10412   /* Pass 6 - Optimize clean/dirty state */
10413   clean_registers(0,slen-1,1);
10414   
10415   /* Pass 7 - Identify 32-bit registers */
10416 #ifndef FORCE32
10417   provisional_r32();
10418
10419   u_int r32=0;
10420   
10421   for (i=slen-1;i>=0;i--)
10422   {
10423     int hr;
10424     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10425     {
10426       if(ba[i]<start || ba[i]>=(start+slen*4))
10427       {
10428         // Branch out of this block, don't need anything
10429         r32=0;
10430       }
10431       else
10432       {
10433         // Internal branch
10434         // Need whatever matches the target
10435         // (and doesn't get overwritten by the delay slot instruction)
10436         r32=0;
10437         int t=(ba[i]-start)>>2;
10438         if(ba[i]>start+i*4) {
10439           // Forward branch
10440           if(!(requires_32bit[t]&~regs[i].was32))
10441             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10442         }else{
10443           // Backward branch
10444           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
10445           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10446           if(!(pr32[t]&~regs[i].was32))
10447             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10448         }
10449       }
10450       // Conditional branch may need registers for following instructions
10451       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10452       {
10453         if(i<slen-2) {
10454           r32|=requires_32bit[i+2];
10455           r32&=regs[i].was32;
10456           // Mark this address as a branch target since it may be called
10457           // upon return from interrupt
10458           bt[i+2]=1;
10459         }
10460       }
10461       // Merge in delay slot
10462       if(!likely[i]) {
10463         // These are overwritten unless the branch is "likely"
10464         // and the delay slot is nullified if not taken
10465         r32&=~(1LL<<rt1[i+1]);
10466         r32&=~(1LL<<rt2[i+1]);
10467       }
10468       // Assume these are needed (delay slot)
10469       if(us1[i+1]>0)
10470       {
10471         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
10472       }
10473       if(us2[i+1]>0)
10474       {
10475         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
10476       }
10477       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
10478       {
10479         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
10480       }
10481       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
10482       {
10483         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
10484       }
10485     }
10486     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
10487     {
10488       // SYSCALL instruction (software interrupt)
10489       r32=0;
10490     }
10491     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
10492     {
10493       // ERET instruction (return from interrupt)
10494       r32=0;
10495     }
10496     // Check 32 bits
10497     r32&=~(1LL<<rt1[i]);
10498     r32&=~(1LL<<rt2[i]);
10499     if(us1[i]>0)
10500     {
10501       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
10502     }
10503     if(us2[i]>0)
10504     {
10505       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
10506     }
10507     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
10508     {
10509       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
10510     }
10511     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
10512     {
10513       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
10514     }
10515     requires_32bit[i]=r32;
10516     
10517     // Dirty registers which are 32-bit, require 32-bit input
10518     // as they will be written as 32-bit values
10519     for(hr=0;hr<HOST_REGS;hr++)
10520     {
10521       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
10522         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
10523           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
10524           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
10525         }
10526       }
10527     }
10528     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
10529   }
10530 #else
10531   for (i=slen-1;i>=0;i--)
10532   {
10533     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10534     {
10535       // Conditional branch
10536       if((source[i]>>16)!=0x1000&&i<slen-2) {
10537         // Mark this address as a branch target since it may be called
10538         // upon return from interrupt
10539         bt[i+2]=1;
10540       }
10541     }
10542   }
10543 #endif
10544
10545   if(itype[slen-1]==SPAN) {
10546     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
10547   }
10548   
10549   /* Debug/disassembly */
10550   if((void*)assem_debug==(void*)printf) 
10551   for(i=0;i<slen;i++)
10552   {
10553     printf("U:");
10554     int r;
10555     for(r=1;r<=CCREG;r++) {
10556       if((unneeded_reg[i]>>r)&1) {
10557         if(r==HIREG) printf(" HI");
10558         else if(r==LOREG) printf(" LO");
10559         else printf(" r%d",r);
10560       }
10561     }
10562 #ifndef FORCE32
10563     printf(" UU:");
10564     for(r=1;r<=CCREG;r++) {
10565       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
10566         if(r==HIREG) printf(" HI");
10567         else if(r==LOREG) printf(" LO");
10568         else printf(" r%d",r);
10569       }
10570     }
10571     printf(" 32:");
10572     for(r=0;r<=CCREG;r++) {
10573       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10574       if((regs[i].was32>>r)&1) {
10575         if(r==CCREG) printf(" CC");
10576         else if(r==HIREG) printf(" HI");
10577         else if(r==LOREG) printf(" LO");
10578         else printf(" r%d",r);
10579       }
10580     }
10581 #endif
10582     printf("\n");
10583     #if defined(__i386__) || defined(__x86_64__)
10584     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
10585     #endif
10586     #ifdef __arm__
10587     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
10588     #endif
10589     printf("needs: ");
10590     if(needed_reg[i]&1) printf("eax ");
10591     if((needed_reg[i]>>1)&1) printf("ecx ");
10592     if((needed_reg[i]>>2)&1) printf("edx ");
10593     if((needed_reg[i]>>3)&1) printf("ebx ");
10594     if((needed_reg[i]>>5)&1) printf("ebp ");
10595     if((needed_reg[i]>>6)&1) printf("esi ");
10596     if((needed_reg[i]>>7)&1) printf("edi ");
10597     printf("r:");
10598     for(r=0;r<=CCREG;r++) {
10599       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10600       if((requires_32bit[i]>>r)&1) {
10601         if(r==CCREG) printf(" CC");
10602         else if(r==HIREG) printf(" HI");
10603         else if(r==LOREG) printf(" LO");
10604         else printf(" r%d",r);
10605       }
10606     }
10607     printf("\n");
10608     /*printf("pr:");
10609     for(r=0;r<=CCREG;r++) {
10610       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10611       if((pr32[i]>>r)&1) {
10612         if(r==CCREG) printf(" CC");
10613         else if(r==HIREG) printf(" HI");
10614         else if(r==LOREG) printf(" LO");
10615         else printf(" r%d",r);
10616       }
10617     }
10618     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
10619     printf("\n");*/
10620     #if defined(__i386__) || defined(__x86_64__)
10621     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
10622     printf("dirty: ");
10623     if(regs[i].wasdirty&1) printf("eax ");
10624     if((regs[i].wasdirty>>1)&1) printf("ecx ");
10625     if((regs[i].wasdirty>>2)&1) printf("edx ");
10626     if((regs[i].wasdirty>>3)&1) printf("ebx ");
10627     if((regs[i].wasdirty>>5)&1) printf("ebp ");
10628     if((regs[i].wasdirty>>6)&1) printf("esi ");
10629     if((regs[i].wasdirty>>7)&1) printf("edi ");
10630     #endif
10631     #ifdef __arm__
10632     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
10633     printf("dirty: ");
10634     if(regs[i].wasdirty&1) printf("r0 ");
10635     if((regs[i].wasdirty>>1)&1) printf("r1 ");
10636     if((regs[i].wasdirty>>2)&1) printf("r2 ");
10637     if((regs[i].wasdirty>>3)&1) printf("r3 ");
10638     if((regs[i].wasdirty>>4)&1) printf("r4 ");
10639     if((regs[i].wasdirty>>5)&1) printf("r5 ");
10640     if((regs[i].wasdirty>>6)&1) printf("r6 ");
10641     if((regs[i].wasdirty>>7)&1) printf("r7 ");
10642     if((regs[i].wasdirty>>8)&1) printf("r8 ");
10643     if((regs[i].wasdirty>>9)&1) printf("r9 ");
10644     if((regs[i].wasdirty>>10)&1) printf("r10 ");
10645     if((regs[i].wasdirty>>12)&1) printf("r12 ");
10646     #endif
10647     printf("\n");
10648     disassemble_inst(i);
10649     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
10650     #if defined(__i386__) || defined(__x86_64__)
10651     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
10652     if(regs[i].dirty&1) printf("eax ");
10653     if((regs[i].dirty>>1)&1) printf("ecx ");
10654     if((regs[i].dirty>>2)&1) printf("edx ");
10655     if((regs[i].dirty>>3)&1) printf("ebx ");
10656     if((regs[i].dirty>>5)&1) printf("ebp ");
10657     if((regs[i].dirty>>6)&1) printf("esi ");
10658     if((regs[i].dirty>>7)&1) printf("edi ");
10659     #endif
10660     #ifdef __arm__
10661     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
10662     if(regs[i].dirty&1) printf("r0 ");
10663     if((regs[i].dirty>>1)&1) printf("r1 ");
10664     if((regs[i].dirty>>2)&1) printf("r2 ");
10665     if((regs[i].dirty>>3)&1) printf("r3 ");
10666     if((regs[i].dirty>>4)&1) printf("r4 ");
10667     if((regs[i].dirty>>5)&1) printf("r5 ");
10668     if((regs[i].dirty>>6)&1) printf("r6 ");
10669     if((regs[i].dirty>>7)&1) printf("r7 ");
10670     if((regs[i].dirty>>8)&1) printf("r8 ");
10671     if((regs[i].dirty>>9)&1) printf("r9 ");
10672     if((regs[i].dirty>>10)&1) printf("r10 ");
10673     if((regs[i].dirty>>12)&1) printf("r12 ");
10674     #endif
10675     printf("\n");
10676     if(regs[i].isconst) {
10677       printf("constants: ");
10678       #if defined(__i386__) || defined(__x86_64__)
10679       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
10680       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
10681       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
10682       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
10683       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
10684       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
10685       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
10686       #endif
10687       #ifdef __arm__
10688       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
10689       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
10690       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
10691       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
10692       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
10693       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
10694       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
10695       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
10696       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
10697       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
10698       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
10699       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
10700       #endif
10701       printf("\n");
10702     }
10703 #ifndef FORCE32
10704     printf(" 32:");
10705     for(r=0;r<=CCREG;r++) {
10706       if((regs[i].is32>>r)&1) {
10707         if(r==CCREG) printf(" CC");
10708         else if(r==HIREG) printf(" HI");
10709         else if(r==LOREG) printf(" LO");
10710         else printf(" r%d",r);
10711       }
10712     }
10713     printf("\n");
10714 #endif
10715     /*printf(" p32:");
10716     for(r=0;r<=CCREG;r++) {
10717       if((p32[i]>>r)&1) {
10718         if(r==CCREG) printf(" CC");
10719         else if(r==HIREG) printf(" HI");
10720         else if(r==LOREG) printf(" LO");
10721         else printf(" r%d",r);
10722       }
10723     }
10724     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
10725     else printf("\n");*/
10726     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10727       #if defined(__i386__) || defined(__x86_64__)
10728       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10729       if(branch_regs[i].dirty&1) printf("eax ");
10730       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10731       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10732       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10733       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10734       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10735       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10736       #endif
10737       #ifdef __arm__
10738       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10739       if(branch_regs[i].dirty&1) printf("r0 ");
10740       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10741       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10742       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10743       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10744       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10745       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10746       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10747       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10748       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10749       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10750       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10751       #endif
10752 #ifndef FORCE32
10753       printf(" 32:");
10754       for(r=0;r<=CCREG;r++) {
10755         if((branch_regs[i].is32>>r)&1) {
10756           if(r==CCREG) printf(" CC");
10757           else if(r==HIREG) printf(" HI");
10758           else if(r==LOREG) printf(" LO");
10759           else printf(" r%d",r);
10760         }
10761       }
10762       printf("\n");
10763 #endif
10764     }
10765   }
10766
10767   /* Pass 8 - Assembly */
10768   linkcount=0;stubcount=0;
10769   ds=0;is_delayslot=0;
10770   cop1_usable=0;
10771   uint64_t is32_pre=0;
10772   u_int dirty_pre=0;
10773   u_int beginning=(u_int)out;
10774   if((u_int)addr&1) {
10775     ds=1;
10776     pagespan_ds();
10777   }
10778   u_int instr_addr0_override=0;
10779
10780 #ifdef PCSX
10781   if (start == 0x80030000) {
10782     // nasty hack for fastbios thing
10783     instr_addr0_override=(u_int)out;
10784     emit_movimm(start,0);
10785     emit_readword((int)&pcaddr,1);
10786     emit_writeword(0,(int)&pcaddr);
10787     emit_cmp(0,1);
10788     emit_jne((int)new_dyna_leave);
10789   }
10790 #endif
10791   for(i=0;i<slen;i++)
10792   {
10793     //if(ds) printf("ds: ");
10794     if((void*)assem_debug==(void*)printf) disassemble_inst(i);
10795     if(ds) {
10796       ds=0; // Skip delay slot
10797       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10798       instr_addr[i]=0;
10799     } else {
10800       #ifndef DESTRUCTIVE_WRITEBACK
10801       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10802       {
10803         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
10804               unneeded_reg[i],unneeded_reg_upper[i]);
10805         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10806               unneeded_reg[i],unneeded_reg_upper[i]);
10807       }
10808       is32_pre=regs[i].is32;
10809       dirty_pre=regs[i].dirty;
10810       #endif
10811       // write back
10812       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10813       {
10814         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10815                       unneeded_reg[i],unneeded_reg_upper[i]);
10816         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10817       }
10818       // branch target entry point
10819       instr_addr[i]=(u_int)out;
10820       assem_debug("<->\n");
10821       // load regs
10822       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10823         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10824       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10825       address_generation(i,&regs[i],regs[i].regmap_entry);
10826       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10827       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10828       {
10829         // Load the delay slot registers if necessary
10830         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
10831           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10832         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
10833           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10834         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
10835           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10836       }
10837       else if(i+1<slen)
10838       {
10839         // Preload registers for following instruction
10840         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10841           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10842             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10843         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10844           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10845             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10846       }
10847       // TODO: if(is_ooo(i)) address_generation(i+1);
10848       if(itype[i]==CJUMP||itype[i]==FJUMP)
10849         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10850       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
10851         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10852       if(bt[i]) cop1_usable=0;
10853       // assemble
10854       switch(itype[i]) {
10855         case ALU:
10856           alu_assemble(i,&regs[i]);break;
10857         case IMM16:
10858           imm16_assemble(i,&regs[i]);break;
10859         case SHIFT:
10860           shift_assemble(i,&regs[i]);break;
10861         case SHIFTIMM:
10862           shiftimm_assemble(i,&regs[i]);break;
10863         case LOAD:
10864           load_assemble(i,&regs[i]);break;
10865         case LOADLR:
10866           loadlr_assemble(i,&regs[i]);break;
10867         case STORE:
10868           store_assemble(i,&regs[i]);break;
10869         case STORELR:
10870           storelr_assemble(i,&regs[i]);break;
10871         case COP0:
10872           cop0_assemble(i,&regs[i]);break;
10873         case COP1:
10874           cop1_assemble(i,&regs[i]);break;
10875         case C1LS:
10876           c1ls_assemble(i,&regs[i]);break;
10877         case COP2:
10878           cop2_assemble(i,&regs[i]);break;
10879         case C2LS:
10880           c2ls_assemble(i,&regs[i]);break;
10881         case C2OP:
10882           c2op_assemble(i,&regs[i]);break;
10883         case FCONV:
10884           fconv_assemble(i,&regs[i]);break;
10885         case FLOAT:
10886           float_assemble(i,&regs[i]);break;
10887         case FCOMP:
10888           fcomp_assemble(i,&regs[i]);break;
10889         case MULTDIV:
10890           multdiv_assemble(i,&regs[i]);break;
10891         case MOV:
10892           mov_assemble(i,&regs[i]);break;
10893         case SYSCALL:
10894           syscall_assemble(i,&regs[i]);break;
10895         case HLECALL:
10896           hlecall_assemble(i,&regs[i]);break;
10897         case INTCALL:
10898           intcall_assemble(i,&regs[i]);break;
10899         case UJUMP:
10900           ujump_assemble(i,&regs[i]);ds=1;break;
10901         case RJUMP:
10902           rjump_assemble(i,&regs[i]);ds=1;break;
10903         case CJUMP:
10904           cjump_assemble(i,&regs[i]);ds=1;break;
10905         case SJUMP:
10906           sjump_assemble(i,&regs[i]);ds=1;break;
10907         case FJUMP:
10908           fjump_assemble(i,&regs[i]);ds=1;break;
10909         case SPAN:
10910           pagespan_assemble(i,&regs[i]);break;
10911       }
10912       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10913         literal_pool(1024);
10914       else
10915         literal_pool_jumpover(256);
10916     }
10917   }
10918   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10919   // If the block did not end with an unconditional branch,
10920   // add a jump to the next instruction.
10921   if(i>1) {
10922     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10923       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10924       assert(i==slen);
10925       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10926         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10927         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10928           emit_loadreg(CCREG,HOST_CCREG);
10929         emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10930       }
10931       else if(!likely[i-2])
10932       {
10933         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10934         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10935       }
10936       else
10937       {
10938         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10939         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10940       }
10941       add_to_linker((int)out,start+i*4,0);
10942       emit_jmp(0);
10943     }
10944   }
10945   else
10946   {
10947     assert(i>0);
10948     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10949     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10950     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10951       emit_loadreg(CCREG,HOST_CCREG);
10952     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10953     add_to_linker((int)out,start+i*4,0);
10954     emit_jmp(0);
10955   }
10956
10957   // TODO: delay slot stubs?
10958   // Stubs
10959   for(i=0;i<stubcount;i++)
10960   {
10961     switch(stubs[i][0])
10962     {
10963       case LOADB_STUB:
10964       case LOADH_STUB:
10965       case LOADW_STUB:
10966       case LOADD_STUB:
10967       case LOADBU_STUB:
10968       case LOADHU_STUB:
10969         do_readstub(i);break;
10970       case STOREB_STUB:
10971       case STOREH_STUB:
10972       case STOREW_STUB:
10973       case STORED_STUB:
10974         do_writestub(i);break;
10975       case CC_STUB:
10976         do_ccstub(i);break;
10977       case INVCODE_STUB:
10978         do_invstub(i);break;
10979       case FP_STUB:
10980         do_cop1stub(i);break;
10981       case STORELR_STUB:
10982         do_unalignedwritestub(i);break;
10983     }
10984   }
10985
10986   if (instr_addr0_override)
10987     instr_addr[0] = instr_addr0_override;
10988
10989   /* Pass 9 - Linker */
10990   for(i=0;i<linkcount;i++)
10991   {
10992     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10993     literal_pool(64);
10994     if(!link_addr[i][2])
10995     {
10996       void *stub=out;
10997       void *addr=check_addr(link_addr[i][1]);
10998       emit_extjump(link_addr[i][0],link_addr[i][1]);
10999       if(addr) {
11000         set_jump_target(link_addr[i][0],(int)addr);
11001         add_link(link_addr[i][1],stub);
11002       }
11003       else set_jump_target(link_addr[i][0],(int)stub);
11004     }
11005     else
11006     {
11007       // Internal branch
11008       int target=(link_addr[i][1]-start)>>2;
11009       assert(target>=0&&target<slen);
11010       assert(instr_addr[target]);
11011       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11012       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
11013       //#else
11014       set_jump_target(link_addr[i][0],instr_addr[target]);
11015       //#endif
11016     }
11017   }
11018   // External Branch Targets (jump_in)
11019   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
11020   for(i=0;i<slen;i++)
11021   {
11022     if(bt[i]||i==0)
11023     {
11024       if(instr_addr[i]) // TODO - delay slots (=null)
11025       {
11026         u_int vaddr=start+i*4;
11027         u_int page=get_page(vaddr);
11028         u_int vpage=get_vpage(vaddr);
11029         literal_pool(256);
11030         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
11031 #ifndef FORCE32
11032         if(!requires_32bit[i])
11033 #else
11034         if(1)
11035 #endif
11036         {
11037           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11038           assem_debug("jump_in: %x\n",start+i*4);
11039           ll_add(jump_dirty+vpage,vaddr,(void *)out);
11040           int entry_point=do_dirty_stub(i);
11041           ll_add(jump_in+page,vaddr,(void *)entry_point);
11042           // If there was an existing entry in the hash table,
11043           // replace it with the new address.
11044           // Don't add new entries.  We'll insert the
11045           // ones that actually get used in check_addr().
11046           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
11047           if(ht_bin[0]==vaddr) {
11048             ht_bin[1]=entry_point;
11049           }
11050           if(ht_bin[2]==vaddr) {
11051             ht_bin[3]=entry_point;
11052           }
11053         }
11054         else
11055         {
11056           u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
11057           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11058           assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
11059           //int entry_point=(int)out;
11060           ////assem_debug("entry_point: %x\n",entry_point);
11061           //load_regs_entry(i);
11062           //if(entry_point==(int)out)
11063           //  entry_point=instr_addr[i];
11064           //else
11065           //  emit_jmp(instr_addr[i]);
11066           //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11067           ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
11068           int entry_point=do_dirty_stub(i);
11069           ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11070         }
11071       }
11072     }
11073   }
11074   // Write out the literal pool if necessary
11075   literal_pool(0);
11076   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11077   // Align code
11078   if(((u_int)out)&7) emit_addnop(13);
11079   #endif
11080   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
11081   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
11082   memcpy(copy,source,slen*4);
11083   copy+=slen*4;
11084   
11085   #ifdef __arm__
11086   __clear_cache((void *)beginning,out);
11087   #endif
11088   
11089   // If we're within 256K of the end of the buffer,
11090   // start over from the beginning. (Is 256K enough?)
11091   if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
11092   
11093   // Trap writes to any of the pages we compiled
11094   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
11095     invalid_code[i]=0;
11096 #ifndef DISABLE_TLB
11097     memory_map[i]|=0x40000000;
11098     if((signed int)start>=(signed int)0xC0000000) {
11099       assert(using_tlb);
11100       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
11101       invalid_code[j]=0;
11102       memory_map[j]|=0x40000000;
11103       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
11104     }
11105 #endif
11106   }
11107 #ifdef PCSX
11108   // PCSX maps all RAM mirror invalid_code tests to 0x80000000..0x80000000+RAM_SIZE
11109   if(get_page(start)<(RAM_SIZE>>12))
11110     for(i=start>>12;i<=(start+slen*4)>>12;i++)
11111       invalid_code[((u_int)0x80000000>>12)|i]=0;
11112 #endif
11113   
11114   /* Pass 10 - Free memory by expiring oldest blocks */
11115   
11116   int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
11117   while(expirep!=end)
11118   {
11119     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
11120     int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
11121     inv_debug("EXP: Phase %d\n",expirep);
11122     switch((expirep>>11)&3)
11123     {
11124       case 0:
11125         // Clear jump_in and jump_dirty
11126         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
11127         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
11128         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
11129         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
11130         break;
11131       case 1:
11132         // Clear pointers
11133         ll_kill_pointers(jump_out[expirep&2047],base,shift);
11134         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
11135         break;
11136       case 2:
11137         // Clear hash table
11138         for(i=0;i<32;i++) {
11139           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
11140           if((ht_bin[3]>>shift)==(base>>shift) ||
11141              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11142             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
11143             ht_bin[2]=ht_bin[3]=-1;
11144           }
11145           if((ht_bin[1]>>shift)==(base>>shift) ||
11146              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11147             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
11148             ht_bin[0]=ht_bin[2];
11149             ht_bin[1]=ht_bin[3];
11150             ht_bin[2]=ht_bin[3]=-1;
11151           }
11152         }
11153         break;
11154       case 3:
11155         // Clear jump_out
11156         #ifdef __arm__
11157         if((expirep&2047)==0) 
11158           do_clear_cache();
11159         #endif
11160         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
11161         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
11162         break;
11163     }
11164     expirep=(expirep+1)&65535;
11165   }
11166   return 0;
11167 }
11168
11169 // vim:shiftwidth=2:expandtab