1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2010 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24
25 #include "emu_if.h" //emulator interface
26
27 #include <sys/mman.h>
28
29 #ifdef __i386__
30 #include "assem_x86.h"
31 #endif
32 #ifdef __x86_64__
33 #include "assem_x64.h"
34 #endif
35 #ifdef __arm__
36 #include "assem_arm.h"
37 #endif
38
39 #define MAXBLOCK 4096
40 #define MAX_OUTPUT_BLOCK_SIZE 262144
41 #define CLOCK_DIVIDER 2
42
43 struct regstat
44 {
45   signed char regmap_entry[HOST_REGS];
46   signed char regmap[HOST_REGS];
47   uint64_t was32;
48   uint64_t is32;
49   uint64_t wasdirty;
50   uint64_t dirty;
51   uint64_t u;
52   uint64_t uu;
53   u_int wasconst;
54   u_int isconst;
55   uint64_t constmap[HOST_REGS];
56 };
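/* Informal field summary (inferred from the helpers further down, e.g.
   clear_all_regs(), dirty_reg(), set_const()):
     regmap[hr]        - guest register currently cached in host register hr, -1 = free
     dirty/wasdirty    - bit hr set: host register hr holds a value not yet written back
     is32/was32        - bit r set: guest register r is known to be a sign-extended 32-bit value
     isconst/constmap  - host registers whose cached value is a known constant
     u/uu              - guest registers whose lower/upper value is not needed afterwards */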
57
58 struct ll_entry
59 {
60   u_int vaddr;
61   u_int reg32;
62   void *addr;
63   struct ll_entry *next;
64 };
65
66   u_int start;
67   u_int *source;
68   u_int pagelimit;
69   char insn[MAXBLOCK][10];
70   u_char itype[MAXBLOCK];
71   u_char opcode[MAXBLOCK];
72   u_char opcode2[MAXBLOCK];
73   u_char bt[MAXBLOCK];
74   u_char rs1[MAXBLOCK];
75   u_char rs2[MAXBLOCK];
76   u_char rt1[MAXBLOCK];
77   u_char rt2[MAXBLOCK];
78   u_char us1[MAXBLOCK];
79   u_char us2[MAXBLOCK];
80   u_char dep1[MAXBLOCK];
81   u_char dep2[MAXBLOCK];
82   u_char lt1[MAXBLOCK];
83   int imm[MAXBLOCK];
84   u_int ba[MAXBLOCK];
85   char likely[MAXBLOCK];
86   char is_ds[MAXBLOCK];
87   char ooo[MAXBLOCK];
88   uint64_t unneeded_reg[MAXBLOCK];
89   uint64_t unneeded_reg_upper[MAXBLOCK];
90   uint64_t branch_unneeded_reg[MAXBLOCK];
91   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
92   uint64_t p32[MAXBLOCK];
93   uint64_t pr32[MAXBLOCK];
94   signed char regmap_pre[MAXBLOCK][HOST_REGS];
95   signed char regmap[MAXBLOCK][HOST_REGS];
96   signed char regmap_entry[MAXBLOCK][HOST_REGS];
97   uint64_t constmap[MAXBLOCK][HOST_REGS];
98   struct regstat regs[MAXBLOCK];
99   struct regstat branch_regs[MAXBLOCK];
100   signed char minimum_free_regs[MAXBLOCK];
101   u_int needed_reg[MAXBLOCK];
102   uint64_t requires_32bit[MAXBLOCK];
103   u_int wont_dirty[MAXBLOCK];
104   u_int will_dirty[MAXBLOCK];
105   int ccadj[MAXBLOCK];
106   int slen;
107   u_int instr_addr[MAXBLOCK];
108   u_int link_addr[MAXBLOCK][3];
109   int linkcount;
110   u_int stubs[MAXBLOCK*3][8];
111   int stubcount;
112   u_int literals[1024][2];
113   int literalcount;
114   int is_delayslot;
115   int cop1_usable;
116   u_char *out;
117   struct ll_entry *jump_in[4096];
118   struct ll_entry *jump_out[4096];
119   struct ll_entry *jump_dirty[4096];
120   u_int hash_table[65536][4]  __attribute__((aligned(16)));
121   char shadow[1048576]  __attribute__((aligned(16)));
122   void *copy;
123   int expirep;
124 #ifndef PCSX
125   u_int using_tlb;
126 #else
127   static const u_int using_tlb=0;
128 #endif
129   u_int stop_after_jal;
130   extern u_char restore_candidate[512];
131   extern int cycle_count;
132
133   /* registers that may be allocated */
134   /* 1-31 gpr */
135 #define HIREG 32 // hi
136 #define LOREG 33 // lo
137 #define FSREG 34 // FPU status (FCSR)
138 #define CSREG 35 // Coprocessor status
139 #define CCREG 36 // Cycle count
140 #define INVCP 37 // Pointer to invalid_code
141 #define MMREG 38 // Pointer to memory_map
142 #define ROREG 39 // ram offset (if rdram!=0x80000000)
143 #define TEMPREG 40
144 #define FTEMP 40 // FPU temporary register
145 #define PTEMP 41 // Prefetch temporary register
146 #define TLREG 42 // TLB mapping offset
147 #define RHASH 43 // Return address hash
148 #define RHTBL 44 // Return address hash table address
149 #define RTEMP 45 // JR/JALR address register
150 #define MAXREG 45
151 #define AGEN1 46 // Address generation temporary register
152 #define AGEN2 47 // Address generation temporary register
153 #define MGEN1 48 // Maptable address generation temporary register
154 #define MGEN2 49 // Maptable address generation temporary register
155 #define BTREG 50 // Branch target temporary register
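/* Encoding note: a register number with bit 6 set (r|64) refers to the upper
   32 bits of 64-bit guest register r; regmap entries are therefore masked with
   &63 to recover the base register (see dirty_reg() and the
   get_reg(current->regmap,rt1[i]|64) calls in the *_alloc routines below). */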
156
157   /* instruction types */
158 #define NOP 0     // No operation
159 #define LOAD 1    // Load
160 #define STORE 2   // Store
161 #define LOADLR 3  // Unaligned load
162 #define STORELR 4 // Unaligned store
163 #define MOV 5     // Move 
164 #define ALU 6     // Arithmetic/logic
165 #define MULTDIV 7 // Multiply/divide
166 #define SHIFT 8   // Shift by register
167 #define SHIFTIMM 9 // Shift by immediate
168 #define IMM16 10  // 16-bit immediate
169 #define RJUMP 11  // Unconditional jump to register
170 #define UJUMP 12  // Unconditional jump
171 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
172 #define SJUMP 14  // Conditional branch (regimm format)
173 #define COP0 15   // Coprocessor 0
174 #define COP1 16   // Coprocessor 1
175 #define C1LS 17   // Coprocessor 1 load/store
176 #define FJUMP 18  // Conditional branch (floating point)
177 #define FLOAT 19  // Floating point unit
178 #define FCONV 20  // Convert integer to float
179 #define FCOMP 21  // Floating point compare (sets FSREG)
180 #define SYSCALL 22 // SYSCALL
181 #define OTHER 23  // Other
182 #define SPAN 24   // Branch/delay slot spans 2 pages
183 #define NI 25     // Not implemented
184 #define HLECALL 26 // PCSX fake opcodes for HLE
185 #define COP2 27   // Coprocessor 2 move
186 #define C2LS 28   // Coprocessor 2 load/store
187 #define C2OP 29   // Coprocessor 2 operation
188 #define INTCALL 30 // Call interpreter to handle rare corner cases
189
190   /* stubs */
191 #define CC_STUB 1
192 #define FP_STUB 2
193 #define LOADB_STUB 3
194 #define LOADH_STUB 4
195 #define LOADW_STUB 5
196 #define LOADD_STUB 6
197 #define LOADBU_STUB 7
198 #define LOADHU_STUB 8
199 #define STOREB_STUB 9
200 #define STOREH_STUB 10
201 #define STOREW_STUB 11
202 #define STORED_STUB 12
203 #define STORELR_STUB 13
204 #define INVCODE_STUB 14
205
206   /* branch codes */
207 #define TAKEN 1
208 #define NOTTAKEN 2
209 #define NULLDS 3
210
211 // asm linkage
212 int new_recompile_block(int addr);
213 void *get_addr_ht(u_int vaddr);
214 void invalidate_block(u_int block);
215 void invalidate_addr(u_int addr);
216 void remove_hash(int vaddr);
217 void jump_vaddr();
218 void dyna_linker();
219 void dyna_linker_ds();
220 void verify_code();
221 void verify_code_vm();
222 void verify_code_ds();
223 void cc_interrupt();
224 void fp_exception();
225 void fp_exception_ds();
226 void jump_syscall();
227 void jump_syscall_hle();
228 void jump_eret();
229 void jump_hlecall();
230 void jump_intcall();
231 void new_dyna_leave();
232
233 // TLB
234 void TLBWI_new();
235 void TLBWR_new();
236 void read_nomem_new();
237 void read_nomemb_new();
238 void read_nomemh_new();
239 void read_nomemd_new();
240 void write_nomem_new();
241 void write_nomemb_new();
242 void write_nomemh_new();
243 void write_nomemd_new();
244 void write_rdram_new();
245 void write_rdramb_new();
246 void write_rdramh_new();
247 void write_rdramd_new();
248 extern u_int memory_map[1048576];
249
250 // Needed by assembler
251 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
252 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
253 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
254 void load_all_regs(signed char i_regmap[]);
255 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
256 void load_regs_entry(int t);
257 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
258
259 int tracedebug=0;
260
261 //#define DEBUG_CYCLE_COUNT 1
262
263 void nullf() {}
264 //#define assem_debug printf
265 //#define inv_debug printf
266 #define assem_debug nullf
267 #define inv_debug nullf
268
269 static void tlb_hacks()
270 {
271 #ifndef DISABLE_TLB
272   // Goldeneye hack
273   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
274   {
275     u_int addr;
276     int n;
277     switch (ROM_HEADER->Country_code&0xFF) 
278     {
279       case 0x45: // U
280         addr=0x34b30;
281         break;                   
282       case 0x4A: // J 
283         addr=0x34b70;    
284         break;    
285       case 0x50: // E 
286         addr=0x329f0;
287         break;                        
288       default: 
289         // Unknown country code
290         addr=0;
291         break;
292     }
293     u_int rom_addr=(u_int)rom;
294     #ifdef ROM_COPY
295     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
296     // in the lower 4G of memory to use this hack.  Copy it if necessary.
297     if((void *)rom>(void *)0xffffffff) {
298       munmap(ROM_COPY, 67108864);
299       if(mmap(ROM_COPY, 12582912,
300               PROT_READ | PROT_WRITE,
301               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
302               -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
303       memcpy(ROM_COPY,rom,12582912);
304       rom_addr=(u_int)ROM_COPY;
305     }
306     #endif
307     if(addr) {
308       for(n=0x7F000;n<0x80000;n++) {
309         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
310       }
311     }
312   }
313 #endif
314 }
315
316 static u_int get_page(u_int vaddr)
317 {
318 #ifndef PCSX
319   u_int page=(vaddr^0x80000000)>>12;
320 #else
321   u_int page=vaddr&~0xe0000000;
322   if (page < 0x1000000)
323     page &= ~0x0e00000; // RAM mirrors
324   page>>=12;
325 #endif
326 #ifndef DISABLE_TLB
327   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
328 #endif
329   if(page>2048) page=2048+(page&2047);
330   return page;
331 }
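/* Worked example (PCSX build): vaddr 0x80031234 -> 0x00031234 after masking the
   segment bits; it is below 0x1000000, so the RAM-mirror bits are cleared and
   >>12 gives page 0x31.  The mirror 0x80231234 folds onto the same page 0x31.
   Addresses outside RAM (e.g. the BIOS at 0xbfc00000) land above 2048 and are
   folded into the 2048-4095 range by the last test. */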
332
333 static u_int get_vpage(u_int vaddr)
334 {
335   u_int vpage=(vaddr^0x80000000)>>12;
336 #ifndef DISABLE_TLB
337   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
338 #endif
339   if(vpage>2048) vpage=2048+(vpage&2047);
340   return vpage;
341 }
342
343 // Get address from virtual address
344 // This is called from the recompiled JR/JALR instructions
345 void *get_addr(u_int vaddr)
346 {
347   u_int page=get_page(vaddr);
348   u_int vpage=get_vpage(vaddr);
349   struct ll_entry *head;
350   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
351   head=jump_in[page];
352   while(head!=NULL) {
353     if(head->vaddr==vaddr&&head->reg32==0) {
354   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
355       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
356       ht_bin[3]=ht_bin[1];
357       ht_bin[2]=ht_bin[0];
358       ht_bin[1]=(int)head->addr;
359       ht_bin[0]=vaddr;
360       return head->addr;
361     }
362     head=head->next;
363   }
364   head=jump_dirty[vpage];
365   while(head!=NULL) {
366     if(head->vaddr==vaddr&&head->reg32==0) {
367       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
368       // Don't restore blocks which are about to expire from the cache
369       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
370       if(verify_dirty(head->addr)) {
371         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
372         invalid_code[vaddr>>12]=0;
373         memory_map[vaddr>>12]|=0x40000000;
374         if(vpage<2048) {
375 #ifndef DISABLE_TLB
376           if(tlb_LUT_r[vaddr>>12]) {
377             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
378             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
379           }
380 #endif
381           restore_candidate[vpage>>3]|=1<<(vpage&7);
382         }
383         else restore_candidate[page>>3]|=1<<(page&7);
384         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
385         if(ht_bin[0]==vaddr) {
386           ht_bin[1]=(int)head->addr; // Replace existing entry
387         }
388         else
389         {
390           ht_bin[3]=ht_bin[1];
391           ht_bin[2]=ht_bin[0];
392           ht_bin[1]=(int)head->addr;
393           ht_bin[0]=vaddr;
394         }
395         return head->addr;
396       }
397     }
398     head=head->next;
399   }
400   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
401   int r=new_recompile_block(vaddr);
402   if(r==0) return get_addr(vaddr);
403   // Execute in unmapped page, generate pagefault exception
404   Status|=2;
405   Cause=(vaddr<<31)|0x8;
406   EPC=(vaddr&1)?vaddr-5:vaddr;
407   BadVAddr=(vaddr&~1);
408   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
409   EntryHi=BadVAddr&0xFFFFE000;
410   return get_addr_ht(0x80000000);
411 }
412 // Look up address in hash table first
413 void *get_addr_ht(u_int vaddr)
414 {
415   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
416   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
417   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
418   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
419   return get_addr(vaddr);
420 }
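/* Each hash_table bin caches the two most recent translations for the index
   ((vaddr>>16)^vaddr)&0xFFFF:
     bin[0]/bin[1] = {virtual address, native code address} of the newest entry
     bin[2]/bin[3] = the previous pair
   get_addr() inserts at slots 0/1 and pushes the old pair down to 2/3;
   remove_hash() clears an entry by writing -1 into the vaddr slots. */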
421
422 void *get_addr_32(u_int vaddr,u_int flags)
423 {
424 #ifdef FORCE32
425   return get_addr(vaddr);
426 #else
427   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
428   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
429   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
430   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
431   u_int page=get_page(vaddr);
432   u_int vpage=get_vpage(vaddr);
433   struct ll_entry *head;
434   head=jump_in[page];
435   while(head!=NULL) {
436     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
437       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
438       if(head->reg32==0) {
439         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
440         if(ht_bin[0]==-1) {
441           ht_bin[1]=(int)head->addr;
442           ht_bin[0]=vaddr;
443         }else if(ht_bin[2]==-1) {
444           ht_bin[3]=(int)head->addr;
445           ht_bin[2]=vaddr;
446         }
447         //ht_bin[3]=ht_bin[1];
448         //ht_bin[2]=ht_bin[0];
449         //ht_bin[1]=(int)head->addr;
450         //ht_bin[0]=vaddr;
451       }
452       return head->addr;
453     }
454     head=head->next;
455   }
456   head=jump_dirty[vpage];
457   while(head!=NULL) {
458     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
459       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
460       // Don't restore blocks which are about to expire from the cache
461       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
462       if(verify_dirty(head->addr)) {
463         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
464         invalid_code[vaddr>>12]=0;
465         memory_map[vaddr>>12]|=0x40000000;
466         if(vpage<2048) {
467 #ifndef DISABLE_TLB
468           if(tlb_LUT_r[vaddr>>12]) {
469             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
470             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
471           }
472 #endif
473           restore_candidate[vpage>>3]|=1<<(vpage&7);
474         }
475         else restore_candidate[page>>3]|=1<<(page&7);
476         if(head->reg32==0) {
477           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
478           if(ht_bin[0]==-1) {
479             ht_bin[1]=(int)head->addr;
480             ht_bin[0]=vaddr;
481           }else if(ht_bin[2]==-1) {
482             ht_bin[3]=(int)head->addr;
483             ht_bin[2]=vaddr;
484           }
485           //ht_bin[3]=ht_bin[1];
486           //ht_bin[2]=ht_bin[0];
487           //ht_bin[1]=(int)head->addr;
488           //ht_bin[0]=vaddr;
489         }
490         return head->addr;
491       }
492     }
493     head=head->next;
494   }
495   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
496   int r=new_recompile_block(vaddr);
497   if(r==0) return get_addr(vaddr);
498   // Execute in unmapped page, generate pagefault exception
499   Status|=2;
500   Cause=(vaddr<<31)|0x8;
501   EPC=(vaddr&1)?vaddr-5:vaddr;
502   BadVAddr=(vaddr&~1);
503   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
504   EntryHi=BadVAddr&0xFFFFE000;
505   return get_addr_ht(0x80000000);
506 #endif
507 }
508
509 void clear_all_regs(signed char regmap[])
510 {
511   int hr;
512   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
513 }
514
515 signed char get_reg(signed char regmap[],int r)
516 {
517   int hr;
518   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
519   return -1;
520 }
521
522 // Find a register that is available for two consecutive cycles
523 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
524 {
525   int hr;
526   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
527   return -1;
528 }
529
530 int count_free_regs(signed char regmap[])
531 {
532   int count=0;
533   int hr;
534   for(hr=0;hr<HOST_REGS;hr++)
535   {
536     if(hr!=EXCLUDE_REG) {
537       if(regmap[hr]<0) count++;
538     }
539   }
540   return count;
541 }
542
543 void dirty_reg(struct regstat *cur,signed char reg)
544 {
545   int hr;
546   if(!reg) return;
547   for (hr=0;hr<HOST_REGS;hr++) {
548     if((cur->regmap[hr]&63)==reg) {
549       cur->dirty|=1<<hr;
550     }
551   }
552 }
553
554 // If we dirty the lower half of a 64-bit register which is now being
555 // sign-extended, we need to dump the upper half.
556 // Note: Do this only after completion of the instruction, because
557 // some instructions may need to read the full 64-bit value even if
558 // overwriting it (eg SLTI, DSRA32).
559 static void flush_dirty_uppers(struct regstat *cur)
560 {
561   int hr,reg;
562   for (hr=0;hr<HOST_REGS;hr++) {
563     if((cur->dirty>>hr)&1) {
564       reg=cur->regmap[hr];
565       if(reg>=64) 
566         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
567     }
568   }
569 }
570
571 void set_const(struct regstat *cur,signed char reg,uint64_t value)
572 {
573   int hr;
574   if(!reg) return;
575   for (hr=0;hr<HOST_REGS;hr++) {
576     if(cur->regmap[hr]==reg) {
577       cur->isconst|=1<<hr;
578       cur->constmap[hr]=value;
579     }
580     else if((cur->regmap[hr]^64)==reg) {
581       cur->isconst|=1<<hr;
582       cur->constmap[hr]=value>>32;
583     }
584   }
585 }
586
587 void clear_const(struct regstat *cur,signed char reg)
588 {
589   int hr;
590   if(!reg) return;
591   for (hr=0;hr<HOST_REGS;hr++) {
592     if((cur->regmap[hr]&63)==reg) {
593       cur->isconst&=~(1<<hr);
594     }
595   }
596 }
597
598 int is_const(struct regstat *cur,signed char reg)
599 {
600   int hr;
601   if(!reg) return 1;
602   for (hr=0;hr<HOST_REGS;hr++) {
603     if((cur->regmap[hr]&63)==reg) {
604       return (cur->isconst>>hr)&1;
605     }
606   }
607   return 0;
608 }
609 uint64_t get_const(struct regstat *cur,signed char reg)
610 {
611   int hr;
612   if(!reg) return 0;
613   for (hr=0;hr<HOST_REGS;hr++) {
614     if(cur->regmap[hr]==reg) {
615       return cur->constmap[hr];
616     }
617   }
618   printf("Unknown constant in r%d\n",reg);
619   exit(1);
620 }
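/* Typical use of the constant-propagation helpers above - this is the pattern
   imm16_alloc() applies further down for ANDI/ORI/XORI, shown here purely as
   an illustration: */
#if 0
if(is_const(current,rs1[i])) {
  int v=get_const(current,rs1[i]);
  set_const(current,rt1[i],v|imm[i]);   // e.g. ORI: result known at compile time
}
else clear_const(current,rt1[i]);       // source unknown, so the result is too
#endif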
621
622 // Least soon needed registers
623 // Look at the next ten instructions and see which registers
624 // will be used.  Try not to reallocate these.
625 void lsn(u_char hsn[], int i, int *preferred_reg)
626 {
627   int j;
628   int b=-1;
629   for(j=0;j<9;j++)
630   {
631     if(i+j>=slen) {
632       j=slen-i-1;
633       break;
634     }
635     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
636     {
637       // Don't go past an unconditional jump
638       j++;
639       break;
640     }
641   }
642   for(;j>=0;j--)
643   {
644     if(rs1[i+j]) hsn[rs1[i+j]]=j;
645     if(rs2[i+j]) hsn[rs2[i+j]]=j;
646     if(rt1[i+j]) hsn[rt1[i+j]]=j;
647     if(rt2[i+j]) hsn[rt2[i+j]]=j;
648     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
649       // Stores can allocate zero
650       hsn[rs1[i+j]]=j;
651       hsn[rs2[i+j]]=j;
652     }
653     // On some architectures stores need invc_ptr
654     #if defined(HOST_IMM8)
655     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
656       hsn[INVCP]=j;
657     }
658     #endif
659     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
660     {
661       hsn[CCREG]=j;
662       b=j;
663     }
664   }
665   if(b>=0)
666   {
667     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
668     {
669       // Follow first branch
670       int t=(ba[i+b]-start)>>2;
671       j=7-b;if(t+j>=slen) j=slen-t-1;
672       for(;j>=0;j--)
673       {
674         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
675         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
676         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
677         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
678       }
679     }
680     // TODO: preferred register based on backward branch
681   }
682   // Delay slot should preferably not overwrite branch conditions or cycle count
683   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
684     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
685     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
686     hsn[CCREG]=1;
687     // ...or hash tables
688     hsn[RHASH]=1;
689     hsn[RHTBL]=1;
690   }
691   // Coprocessor load/store needs FTEMP, even if not declared
692   if(itype[i]==C1LS||itype[i]==C2LS) {
693     hsn[FTEMP]=0;
694   }
695   // Load L/R also uses FTEMP as a temporary register
696   if(itype[i]==LOADLR) {
697     hsn[FTEMP]=0;
698   }
699   // Also SWL/SWR/SDL/SDR
700   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
701     hsn[FTEMP]=0;
702   }
703   // Don't remove the TLB registers either
704   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS || itype[i]==C2LS) {
705     hsn[TLREG]=0;
706   }
707   // Don't remove the miniht registers
708   if(itype[i]==UJUMP||itype[i]==RJUMP)
709   {
710     hsn[RHASH]=0;
711     hsn[RHTBL]=0;
712   }
713 }
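/* hsn[] convention: hsn[r] is the distance, in instructions, to the next use of
   register r (smaller = needed sooner); callers such as needed_again() below
   preinitialize every entry to 10, meaning "not needed within the lookahead". */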
714
715 // We only want to allocate registers if we're going to use them again soon
716 int needed_again(int r, int i)
717 {
718   int j;
719   int b=-1;
720   int rn=10;
721   int hr;
722   u_char hsn[MAXREG+1];
723   int preferred_reg;
724   
725   memset(hsn,10,sizeof(hsn));
726   lsn(hsn,i,&preferred_reg);
727   
728   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
729   {
730     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
731       return 0; // Don't need any registers if exiting the block
732   }
733   for(j=0;j<9;j++)
734   {
735     if(i+j>=slen) {
736       j=slen-i-1;
737       break;
738     }
739     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
740     {
741       // Don't go past an unconditional jump
742       j++;
743       break;
744     }
745     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
746     {
747       break;
748     }
749   }
750   for(;j>=1;j--)
751   {
752     if(rs1[i+j]==r) rn=j;
753     if(rs2[i+j]==r) rn=j;
754     if((unneeded_reg[i+j]>>r)&1) rn=10;
755     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
756     {
757       b=j;
758     }
759   }
760   /*
761   if(b>=0)
762   {
763     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
764     {
765       // Follow first branch
766       int o=rn;
767       int t=(ba[i+b]-start)>>2;
768       j=7-b;if(t+j>=slen) j=slen-t-1;
769       for(;j>=0;j--)
770       {
771         if(!((unneeded_reg[t+j]>>r)&1)) {
772           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
773           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
774         }
775         else rn=o;
776       }
777     }
778   }*/
779   for(hr=0;hr<HOST_REGS;hr++) {
780     if(hr!=EXCLUDE_REG) {
781       if(rn<hsn[hr]) return 1;
782     }
783   }
784   return 0;
785 }
786
787 // Try to match register allocations at the end of a loop with those
788 // at the beginning
789 int loop_reg(int i, int r, int hr)
790 {
791   int j,k;
792   for(j=0;j<9;j++)
793   {
794     if(i+j>=slen) {
795       j=slen-i-1;
796       break;
797     }
798     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
799     {
800       // Don't go past an unconditional jump
801       j++;
802       break;
803     }
804   }
805   k=0;
806   if(i>0){
807     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
808       k--;
809   }
810   for(;k<j;k++)
811   {
812     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
813     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
814     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
815     {
816       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
817       {
818         int t=(ba[i+k]-start)>>2;
819         int reg=get_reg(regs[t].regmap_entry,r);
820         if(reg>=0) return reg;
821         //reg=get_reg(regs[t+1].regmap_entry,r);
822         //if(reg>=0) return reg;
823       }
824     }
825   }
826   return hr;
827 }
828
829
830 // Allocate every register, preserving source/target regs
831 void alloc_all(struct regstat *cur,int i)
832 {
833   int hr;
834   
835   for(hr=0;hr<HOST_REGS;hr++) {
836     if(hr!=EXCLUDE_REG) {
837       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
838          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
839       {
840         cur->regmap[hr]=-1;
841         cur->dirty&=~(1<<hr);
842       }
843       // Don't need zeros
844       if((cur->regmap[hr]&63)==0)
845       {
846         cur->regmap[hr]=-1;
847         cur->dirty&=~(1<<hr);
848       }
849     }
850   }
851 }
852
853
854 void div64(int64_t dividend,int64_t divisor)
855 {
856   lo=dividend/divisor;
857   hi=dividend%divisor;
858   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
859   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
860 }
861 void divu64(uint64_t dividend,uint64_t divisor)
862 {
863   lo=dividend/divisor;
864   hi=dividend%divisor;
865   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
866   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
867 }
868
869 void mult64(uint64_t m1,uint64_t m2)
870 {
871    unsigned long long int op1, op2, op3, op4;
872    unsigned long long int result1, result2, result3, result4;
873    unsigned long long int temp1, temp2, temp3, temp4;
874    int sign = 0;
875    
876    if ((int64_t)m1 < 0) // m1 is declared unsigned; cast to get the intended sign test
877      {
878     op2 = -m1;
879     sign = 1 - sign;
880      }
881    else op2 = m1;
882    if ((int64_t)m2 < 0)
883      {
884     op4 = -m2;
885     sign = 1 - sign;
886      }
887    else op4 = m2;
888    
889    op1 = op2 & 0xFFFFFFFF;
890    op2 = (op2 >> 32) & 0xFFFFFFFF;
891    op3 = op4 & 0xFFFFFFFF;
892    op4 = (op4 >> 32) & 0xFFFFFFFF;
893    
894    temp1 = op1 * op3;
895    temp2 = (temp1 >> 32) + op1 * op4;
896    temp3 = op2 * op3;
897    temp4 = (temp3 >> 32) + op2 * op4;
898    
899    result1 = temp1 & 0xFFFFFFFF;
900    result2 = temp2 + (temp3 & 0xFFFFFFFF);
901    result3 = (result2 >> 32) + temp4;
902    result4 = (result3 >> 32);
903    
904    lo = result1 | (result2 << 32);
905    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
906    if (sign)
907      {
908     hi = ~hi;
909     if (!lo) hi++;
910     else lo = ~lo + 1;
911      }
912 }
913
914 void multu64(uint64_t m1,uint64_t m2)
915 {
916    unsigned long long int op1, op2, op3, op4;
917    unsigned long long int result1, result2, result3, result4;
918    unsigned long long int temp1, temp2, temp3, temp4;
919    
920    op1 = m1 & 0xFFFFFFFF;
921    op2 = (m1 >> 32) & 0xFFFFFFFF;
922    op3 = m2 & 0xFFFFFFFF;
923    op4 = (m2 >> 32) & 0xFFFFFFFF;
924    
925    temp1 = op1 * op3;
926    temp2 = (temp1 >> 32) + op1 * op4;
927    temp3 = op2 * op3;
928    temp4 = (temp3 >> 32) + op2 * op4;
929    
930    result1 = temp1 & 0xFFFFFFFF;
931    result2 = temp2 + (temp3 & 0xFFFFFFFF);
932    result3 = (result2 >> 32) + temp4;
933    result4 = (result3 >> 32);
934    
935    lo = result1 | (result2 << 32);
936    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
937    
938   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
939   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
940 }
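/* The four 32x32 partial products above implement a schoolbook 64x64->128-bit
   multiply.  Cross-check sketch (not compiled in), assuming a compiler that
   provides the non-standard unsigned __int128 type: */
#if 0
static void multu64_check(uint64_t m1,uint64_t m2)
{
  unsigned __int128 p=(unsigned __int128)m1*m2;
  multu64(m1,m2);
  assert(lo==(uint64_t)p);        // low 64 bits of the product
  assert(hi==(uint64_t)(p>>64));  // high 64 bits of the product
}
#endif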
941
942 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
943 {
944   if(bits) {
945     original<<=64-bits;
946     original>>=64-bits;
947     loaded<<=bits;
948     original|=loaded;
949   }
950   else original=loaded;
951   return original;
952 }
953 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
954 {
955   if(bits^56) {
956     original>>=64-(bits^56);
957     original<<=64-(bits^56);
958     loaded>>=bits^56;
959     original|=loaded;
960   }
961   else original=loaded;
962   return original;
963 }
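/* Worked example: ldl_merge(original,loaded,bits) keeps the low 'bits' bits of
   the original register and fills the rest from the loaded value, e.g.
   ldl_merge(0x00000000AABBCCDDull, 0x1122334455667788ull, 32) yields
   0x55667788AABBCCDD.  ldr_merge() is the mirror image for the other end of
   the doubleword. */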
964
965 #ifdef __i386__
966 #include "assem_x86.c"
967 #endif
968 #ifdef __x86_64__
969 #include "assem_x64.c"
970 #endif
971 #ifdef __arm__
972 #include "assem_arm.c"
973 #endif
974
975 // Add virtual address mapping to linked list
976 void ll_add(struct ll_entry **head,int vaddr,void *addr)
977 {
978   struct ll_entry *new_entry;
979   new_entry=malloc(sizeof(struct ll_entry));
980   assert(new_entry!=NULL);
981   new_entry->vaddr=vaddr;
982   new_entry->reg32=0;
983   new_entry->addr=addr;
984   new_entry->next=*head;
985   *head=new_entry;
986 }
987
988 // Add virtual address mapping for 32-bit compiled block
989 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
990 {
991   ll_add(head,vaddr,addr);
992 #ifndef FORCE32
993   (*head)->reg32=reg32;
994 #endif
995 }
996
997 // Check if an address is already compiled
998 // but don't return addresses which are about to expire from the cache
999 void *check_addr(u_int vaddr)
1000 {
1001   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
1002   if(ht_bin[0]==vaddr) {
1003     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1004       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
1005   }
1006   if(ht_bin[2]==vaddr) {
1007     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1008       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
1009   }
1010   u_int page=get_page(vaddr);
1011   struct ll_entry *head;
1012   head=jump_in[page];
1013   while(head!=NULL) {
1014     if(head->vaddr==vaddr&&head->reg32==0) {
1015       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1016         // Update existing entry with current address
1017         if(ht_bin[0]==vaddr) {
1018           ht_bin[1]=(int)head->addr;
1019           return head->addr;
1020         }
1021         if(ht_bin[2]==vaddr) {
1022           ht_bin[3]=(int)head->addr;
1023           return head->addr;
1024         }
1025         // Insert into hash table with low priority.
1026         // Don't evict existing entries, as they are probably
1027         // addresses that are being accessed frequently.
1028         if(ht_bin[0]==-1) {
1029           ht_bin[1]=(int)head->addr;
1030           ht_bin[0]=vaddr;
1031         }else if(ht_bin[2]==-1) {
1032           ht_bin[3]=(int)head->addr;
1033           ht_bin[2]=vaddr;
1034         }
1035         return head->addr;
1036       }
1037     }
1038     head=head->next;
1039   }
1040   return 0;
1041 }
1042
1043 void remove_hash(int vaddr)
1044 {
1045   //printf("remove hash: %x\n",vaddr);
1046   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1047   if(ht_bin[2]==vaddr) {
1048     ht_bin[2]=ht_bin[3]=-1;
1049   }
1050   if(ht_bin[0]==vaddr) {
1051     ht_bin[0]=ht_bin[2];
1052     ht_bin[1]=ht_bin[3];
1053     ht_bin[2]=ht_bin[3]=-1;
1054   }
1055 }
1056
1057 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1058 {
1059   struct ll_entry *next;
1060   while(*head) {
1061     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1062        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1063     {
1064       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1065       remove_hash((*head)->vaddr);
1066       next=(*head)->next;
1067       free(*head);
1068       *head=next;
1069     }
1070     else
1071     {
1072       head=&((*head)->next);
1073     }
1074   }
1075 }
1076
1077 // Remove all entries from linked list
1078 void ll_clear(struct ll_entry **head)
1079 {
1080   struct ll_entry *cur;
1081   struct ll_entry *next;
1082   if((cur=*head)!=NULL) {
1083     *head=0;
1084     while(cur) {
1085       next=cur->next;
1086       free(cur);
1087       cur=next;
1088     }
1089   }
1090 }
1091
1092 // Dereference the pointers and kill them if they point into the given address range
1093 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1094 {
1095   while(head) {
1096     int ptr=get_pointer(head->addr);
1097     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1098     if(((ptr>>shift)==(addr>>shift)) ||
1099        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1100     {
1101       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1102       u_int host_addr=(u_int)kill_pointer(head->addr);
1103       #ifdef __arm__
1104         needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1105       #endif
1106     }
1107     head=head->next;
1108   }
1109 }
1110
1111 // This is called when we write to a compiled block (see do_invstub)
1112 void invalidate_page(u_int page)
1113 {
1114   struct ll_entry *head;
1115   struct ll_entry *next;
1116   head=jump_in[page];
1117   jump_in[page]=0;
1118   while(head!=NULL) {
1119     inv_debug("INVALIDATE: %x\n",head->vaddr);
1120     remove_hash(head->vaddr);
1121     next=head->next;
1122     free(head);
1123     head=next;
1124   }
1125   head=jump_out[page];
1126   jump_out[page]=0;
1127   while(head!=NULL) {
1128     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1129     u_int host_addr=(u_int)kill_pointer(head->addr);
1130     #ifdef __arm__
1131       needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1132     #endif
1133     next=head->next;
1134     free(head);
1135     head=next;
1136   }
1137 }
1138 void invalidate_block(u_int block)
1139 {
1140   u_int page=get_page(block<<12);
1141   u_int vpage=get_vpage(block<<12);
1142   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1143   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1144   u_int first,last;
1145   first=last=page;
1146   struct ll_entry *head;
1147   head=jump_dirty[vpage];
1148   //printf("page=%d vpage=%d\n",page,vpage);
1149   while(head!=NULL) {
1150     u_int start,end;
1151     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1152       get_bounds((int)head->addr,&start,&end);
1153       //printf("start: %x end: %x\n",start,end);
1154       if(page<2048&&start>=0x80000000&&end<0x80000000+RAM_SIZE) {
1155         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1156           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1157           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1158         }
1159       }
1160 #ifndef DISABLE_TLB
1161       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1162         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1163           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1164           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1165         }
1166       }
1167 #endif
1168     }
1169     head=head->next;
1170   }
1171   //printf("first=%d last=%d\n",first,last);
1172   invalidate_page(page);
1173   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1174   assert(last<page+5);
1175   // Invalidate the adjacent pages if a block crosses a 4K boundary
1176   while(first<page) {
1177     invalidate_page(first);
1178     first++;
1179   }
1180   for(first=page+1;first<last;first++) {
1181     invalidate_page(first);
1182   }
1183   #ifdef __arm__
1184     do_clear_cache();
1185   #endif
1186   
1187   // Don't trap writes
1188   invalid_code[block]=1;
1189 #ifndef DISABLE_TLB
1190   // If there is a valid TLB entry for this page, remove write protect
1191   if(tlb_LUT_w[block]) {
1192     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1193     // CHECK: Is this right?
1194     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1195     u_int real_block=tlb_LUT_w[block]>>12;
1196     invalid_code[real_block]=1;
1197     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1198   }
1199   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1200 #endif
1201
1202   #ifdef USE_MINI_HT
1203   memset(mini_ht,-1,sizeof(mini_ht));
1204   #endif
1205 }
1206 void invalidate_addr(u_int addr)
1207 {
1208   invalidate_block(addr>>12);
1209 }
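/* Invalidation flow: invalidate_addr() maps the written address to a 4K block,
   invalidate_block() then frees that page's jump_in entries (invalidate_page),
   patches the links recorded in jump_out (kill_pointer) and sets
   invalid_code[], so the dirty-block checks will force reverification or
   recompilation before the old code runs again. */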
1210 // This is called when loading a save state.
1211 // Anything could have changed, so invalidate everything.
1212 void invalidate_all_pages()
1213 {
1214   u_int page,n;
1215   for(page=0;page<4096;page++)
1216     invalidate_page(page);
1217   for(page=0;page<1048576;page++)
1218     if(!invalid_code[page]) {
1219       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1220       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1221     }
1222   #ifdef __arm__
1223   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1224   #endif
1225   #ifdef USE_MINI_HT
1226   memset(mini_ht,-1,sizeof(mini_ht));
1227   #endif
1228   #ifndef DISABLE_TLB
1229   // TLB
1230   for(page=0;page<0x100000;page++) {
1231     if(tlb_LUT_r[page]) {
1232       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1233       if(!tlb_LUT_w[page]||!invalid_code[page])
1234         memory_map[page]|=0x40000000; // Write protect
1235     }
1236     else memory_map[page]=-1;
1237     if(page==0x80000) page=0xC0000;
1238   }
1239   tlb_hacks();
1240   #endif
1241 }
1242
1243 // Add an entry to jump_out after making a link
1244 void add_link(u_int vaddr,void *src)
1245 {
1246   u_int page=get_page(vaddr);
1247   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1248   ll_add(jump_out+page,vaddr,src);
1249   //int ptr=get_pointer(src);
1250   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1251 }
1252
1253 // If a code block was found to be unmodified (bit was set in
1254 // restore_candidate) and it remains unmodified (bit is clear
1255 // in invalid_code) then move the entries for that 4K page from
1256 // the dirty list to the clean list.
1257 void clean_blocks(u_int page)
1258 {
1259   struct ll_entry *head;
1260   inv_debug("INV: clean_blocks page=%d\n",page);
1261   head=jump_dirty[page];
1262   while(head!=NULL) {
1263     if(!invalid_code[head->vaddr>>12]) {
1264       // Don't restore blocks which are about to expire from the cache
1265       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1266         u_int start,end;
1267         if(verify_dirty((int)head->addr)) {
1268           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1269           u_int i;
1270           u_int inv=0;
1271           get_bounds((int)head->addr,&start,&end);
1272           if(start-(u_int)rdram<RAM_SIZE) {
1273             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1274               inv|=invalid_code[i];
1275             }
1276           }
1277           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1278             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1279             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1280             if(addr<start||addr>=end) inv=1;
1281           }
1282           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1283             inv=1;
1284           }
1285           if(!inv) {
1286             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1287             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1288               u_int ppage=page;
1289 #ifndef DISABLE_TLB
1290               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1291 #endif
1292               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1293               //printf("page=%x, addr=%x\n",page,head->vaddr);
1294               //assert(head->vaddr>>12==(page|0x80000));
1295               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1296               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1297               if(!head->reg32) {
1298                 if(ht_bin[0]==head->vaddr) {
1299                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1300                 }
1301                 if(ht_bin[2]==head->vaddr) {
1302                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1303                 }
1304               }
1305             }
1306           }
1307         }
1308       }
1309     }
1310     head=head->next;
1311   }
1312 }
1313
1314
1315 void mov_alloc(struct regstat *current,int i)
1316 {
1317   // Note: Don't need to actually alloc the source registers
1318   if((~current->is32>>rs1[i])&1) {
1319     //alloc_reg64(current,i,rs1[i]);
1320     alloc_reg64(current,i,rt1[i]);
1321     current->is32&=~(1LL<<rt1[i]);
1322   } else {
1323     //alloc_reg(current,i,rs1[i]);
1324     alloc_reg(current,i,rt1[i]);
1325     current->is32|=(1LL<<rt1[i]);
1326   }
1327   clear_const(current,rs1[i]);
1328   clear_const(current,rt1[i]);
1329   dirty_reg(current,rt1[i]);
1330 }
1331
1332 void shiftimm_alloc(struct regstat *current,int i)
1333 {
1334   clear_const(current,rs1[i]);
1335   clear_const(current,rt1[i]);
1336   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1337   {
1338     if(rt1[i]) {
1339       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1340       else lt1[i]=rs1[i];
1341       alloc_reg(current,i,rt1[i]);
1342       current->is32|=1LL<<rt1[i];
1343       dirty_reg(current,rt1[i]);
1344     }
1345   }
1346   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1347   {
1348     if(rt1[i]) {
1349       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1350       alloc_reg64(current,i,rt1[i]);
1351       current->is32&=~(1LL<<rt1[i]);
1352       dirty_reg(current,rt1[i]);
1353     }
1354   }
1355   if(opcode2[i]==0x3c) // DSLL32
1356   {
1357     if(rt1[i]) {
1358       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1359       alloc_reg64(current,i,rt1[i]);
1360       current->is32&=~(1LL<<rt1[i]);
1361       dirty_reg(current,rt1[i]);
1362     }
1363   }
1364   if(opcode2[i]==0x3e) // DSRL32
1365   {
1366     if(rt1[i]) {
1367       alloc_reg64(current,i,rs1[i]);
1368       if(imm[i]==32) {
1369         alloc_reg64(current,i,rt1[i]);
1370         current->is32&=~(1LL<<rt1[i]);
1371       } else {
1372         alloc_reg(current,i,rt1[i]);
1373         current->is32|=1LL<<rt1[i];
1374       }
1375       dirty_reg(current,rt1[i]);
1376     }
1377   }
1378   if(opcode2[i]==0x3f) // DSRA32
1379   {
1380     if(rt1[i]) {
1381       alloc_reg64(current,i,rs1[i]);
1382       alloc_reg(current,i,rt1[i]);
1383       current->is32|=1LL<<rt1[i];
1384       dirty_reg(current,rt1[i]);
1385     }
1386   }
1387 }
1388
1389 void shift_alloc(struct regstat *current,int i)
1390 {
1391   if(rt1[i]) {
1392     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1393     {
1394       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1395       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1396       alloc_reg(current,i,rt1[i]);
1397       if(rt1[i]==rs2[i]) {
1398         alloc_reg_temp(current,i,-1);
1399         minimum_free_regs[i]=1;
1400       }
1401       current->is32|=1LL<<rt1[i];
1402     } else { // DSLLV/DSRLV/DSRAV
1403       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1404       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1405       alloc_reg64(current,i,rt1[i]);
1406       current->is32&=~(1LL<<rt1[i]);
1407       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1408       {
1409         alloc_reg_temp(current,i,-1);
1410         minimum_free_regs[i]=1;
1411       }
1412     }
1413     clear_const(current,rs1[i]);
1414     clear_const(current,rs2[i]);
1415     clear_const(current,rt1[i]);
1416     dirty_reg(current,rt1[i]);
1417   }
1418 }
1419
1420 void alu_alloc(struct regstat *current,int i)
1421 {
1422   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1423     if(rt1[i]) {
1424       if(rs1[i]&&rs2[i]) {
1425         alloc_reg(current,i,rs1[i]);
1426         alloc_reg(current,i,rs2[i]);
1427       }
1428       else {
1429         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1430         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1431       }
1432       alloc_reg(current,i,rt1[i]);
1433     }
1434     current->is32|=1LL<<rt1[i];
1435   }
1436   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1437     if(rt1[i]) {
1438       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1439       {
1440         alloc_reg64(current,i,rs1[i]);
1441         alloc_reg64(current,i,rs2[i]);
1442         alloc_reg(current,i,rt1[i]);
1443       } else {
1444         alloc_reg(current,i,rs1[i]);
1445         alloc_reg(current,i,rs2[i]);
1446         alloc_reg(current,i,rt1[i]);
1447       }
1448     }
1449     current->is32|=1LL<<rt1[i];
1450   }
1451   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1452     if(rt1[i]) {
1453       if(rs1[i]&&rs2[i]) {
1454         alloc_reg(current,i,rs1[i]);
1455         alloc_reg(current,i,rs2[i]);
1456       }
1457       else
1458       {
1459         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1460         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1461       }
1462       alloc_reg(current,i,rt1[i]);
1463       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1464       {
1465         if(!((current->uu>>rt1[i])&1)) {
1466           alloc_reg64(current,i,rt1[i]);
1467         }
1468         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1469           if(rs1[i]&&rs2[i]) {
1470             alloc_reg64(current,i,rs1[i]);
1471             alloc_reg64(current,i,rs2[i]);
1472           }
1473           else
1474           {
1475             // Is it really worth it to keep 64-bit values in registers?
1476             #ifdef NATIVE_64BIT
1477             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1478             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1479             #endif
1480           }
1481         }
1482         current->is32&=~(1LL<<rt1[i]);
1483       } else {
1484         current->is32|=1LL<<rt1[i];
1485       }
1486     }
1487   }
1488   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1489     if(rt1[i]) {
1490       if(rs1[i]&&rs2[i]) {
1491         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1492           alloc_reg64(current,i,rs1[i]);
1493           alloc_reg64(current,i,rs2[i]);
1494           alloc_reg64(current,i,rt1[i]);
1495         } else {
1496           alloc_reg(current,i,rs1[i]);
1497           alloc_reg(current,i,rs2[i]);
1498           alloc_reg(current,i,rt1[i]);
1499         }
1500       }
1501       else {
1502         alloc_reg(current,i,rt1[i]);
1503         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1504           // DADD used as move, or zeroing
1505           // If we have a 64-bit source, then make the target 64 bits too
1506           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1507             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1508             alloc_reg64(current,i,rt1[i]);
1509           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1510             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1511             alloc_reg64(current,i,rt1[i]);
1512           }
1513           if(opcode2[i]>=0x2e&&rs2[i]) {
1514             // DSUB used as negation - 64-bit result
1515             // If we have a 32-bit register, extend it to 64 bits
1516             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1517             alloc_reg64(current,i,rt1[i]);
1518           }
1519         }
1520       }
1521       if(rs1[i]&&rs2[i]) {
1522         current->is32&=~(1LL<<rt1[i]);
1523       } else if(rs1[i]) {
1524         current->is32&=~(1LL<<rt1[i]);
1525         if((current->is32>>rs1[i])&1)
1526           current->is32|=1LL<<rt1[i];
1527       } else if(rs2[i]) {
1528         current->is32&=~(1LL<<rt1[i]);
1529         if((current->is32>>rs2[i])&1)
1530           current->is32|=1LL<<rt1[i];
1531       } else {
1532         current->is32|=1LL<<rt1[i];
1533       }
1534     }
1535   }
1536   clear_const(current,rs1[i]);
1537   clear_const(current,rs2[i]);
1538   clear_const(current,rt1[i]);
1539   dirty_reg(current,rt1[i]);
1540 }
1541
1542 void imm16_alloc(struct regstat *current,int i)
1543 {
1544   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1545   else lt1[i]=rs1[i];
1546   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1547   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1548     current->is32&=~(1LL<<rt1[i]);
1549     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1550       // TODO: Could preserve the 32-bit flag if the immediate is zero
1551       alloc_reg64(current,i,rt1[i]);
1552       alloc_reg64(current,i,rs1[i]);
1553     }
1554     clear_const(current,rs1[i]);
1555     clear_const(current,rt1[i]);
1556   }
1557   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1558     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1559     current->is32|=1LL<<rt1[i];
1560     clear_const(current,rs1[i]);
1561     clear_const(current,rt1[i]);
1562   }
1563   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1564     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1565       if(rs1[i]!=rt1[i]) {
1566         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1567         alloc_reg64(current,i,rt1[i]);
1568         current->is32&=~(1LL<<rt1[i]);
1569       }
1570     }
1571     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1572     if(is_const(current,rs1[i])) {
1573       int v=get_const(current,rs1[i]);
1574       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1575       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1576       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1577     }
1578     else clear_const(current,rt1[i]);
1579   }
1580   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1581     if(is_const(current,rs1[i])) {
1582       int v=get_const(current,rs1[i]);
1583       set_const(current,rt1[i],v+imm[i]);
1584     }
1585     else clear_const(current,rt1[i]);
1586     current->is32|=1LL<<rt1[i];
1587   }
1588   else {
1589     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1590     current->is32|=1LL<<rt1[i];
1591   }
1592   dirty_reg(current,rt1[i]);
1593 }
1594
1595 void load_alloc(struct regstat *current,int i)
1596 {
1597   clear_const(current,rt1[i]);
1598   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1599   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1600   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1601   if(rt1[i]) {
1602     alloc_reg(current,i,rt1[i]);
1603     if(get_reg(current->regmap,rt1[i])<0) {
1604       // dummy load, but we still need a register to calculate the address
1605       alloc_reg_temp(current,i,-1);
1606       minimum_free_regs[i]=1;
1607     }
1608     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1609     {
1610       current->is32&=~(1LL<<rt1[i]);
1611       alloc_reg64(current,i,rt1[i]);
1612     }
1613     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1614     {
1615       current->is32&=~(1LL<<rt1[i]);
1616       alloc_reg64(current,i,rt1[i]);
1617       alloc_all(current,i);
1618       alloc_reg64(current,i,FTEMP);
1619       minimum_free_regs[i]=HOST_REGS;
1620     }
1621     else current->is32|=1LL<<rt1[i];
1622     dirty_reg(current,rt1[i]);
1623     // If using TLB, need a register for pointer to the mapping table
1624     if(using_tlb) alloc_reg(current,i,TLREG);
1625     // LWL/LWR need a temporary register for the old value
1626     if(opcode[i]==0x22||opcode[i]==0x26)
1627     {
1628       alloc_reg(current,i,FTEMP);
1629       alloc_reg_temp(current,i,-1);
1630       minimum_free_regs[i]=1;
1631     }
1632   }
1633   else
1634   {
1635     // Load to r0 (dummy load)
1636     // but we still need a register to calculate the address
1637     if(opcode[i]==0x22||opcode[i]==0x26)
1638     {
1639       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1640     }
1641     alloc_reg_temp(current,i,-1);
1642     minimum_free_regs[i]=1;
1643     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1644     {
1645       alloc_all(current,i);
1646       alloc_reg64(current,i,FTEMP);
1647       minimum_free_regs[i]=HOST_REGS;
1648     }
1649   }
1650 }
1651
1652 void store_alloc(struct regstat *current,int i)
1653 {
1654   clear_const(current,rs2[i]);
1655   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1656   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1657   alloc_reg(current,i,rs2[i]);
1658   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1659     alloc_reg64(current,i,rs2[i]);
1660     if(rs2[i]) alloc_reg(current,i,FTEMP);
1661   }
1662   // If using TLB, need a register for pointer to the mapping table
1663   if(using_tlb) alloc_reg(current,i,TLREG);
1664   #if defined(HOST_IMM8)
1665   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1666   else alloc_reg(current,i,INVCP);
1667   #endif
1668   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1669     alloc_reg(current,i,FTEMP);
1670   }
1671   // We need a temporary register for address generation
1672   alloc_reg_temp(current,i,-1);
1673   minimum_free_regs[i]=1;
1674 }
1675
1676 void c1ls_alloc(struct regstat *current,int i)
1677 {
1678   //clear_const(current,rs1[i]); // FIXME
1679   clear_const(current,rt1[i]);
1680   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1681   alloc_reg(current,i,CSREG); // Status
1682   alloc_reg(current,i,FTEMP);
1683   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1684     alloc_reg64(current,i,FTEMP);
1685   }
1686   // If using TLB, need a register for pointer to the mapping table
1687   if(using_tlb) alloc_reg(current,i,TLREG);
1688   #if defined(HOST_IMM8)
1689   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1690   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1691     alloc_reg(current,i,INVCP);
1692   #endif
1693   // We need a temporary register for address generation
1694   alloc_reg_temp(current,i,-1);
1695 }
1696
1697 void c2ls_alloc(struct regstat *current,int i)
1698 {
1699   clear_const(current,rt1[i]);
1700   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1701   alloc_reg(current,i,FTEMP);
1702   // If using TLB, need a register for pointer to the mapping table
1703   if(using_tlb) alloc_reg(current,i,TLREG);
1704   #if defined(HOST_IMM8)
1705   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1706   else if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1707     alloc_reg(current,i,INVCP);
1708   #endif
1709   // We need a temporary register for address generation
1710   alloc_reg_temp(current,i,-1);
1711   minimum_free_regs[i]=1;
1712 }
1713
1714 #ifndef multdiv_alloc
1715 void multdiv_alloc(struct regstat *current,int i)
1716 {
1717   //  case 0x18: MULT
1718   //  case 0x19: MULTU
1719   //  case 0x1A: DIV
1720   //  case 0x1B: DIVU
1721   //  case 0x1C: DMULT
1722   //  case 0x1D: DMULTU
1723   //  case 0x1E: DDIV
1724   //  case 0x1F: DDIVU
1725   clear_const(current,rs1[i]);
1726   clear_const(current,rs2[i]);
1727   if(rs1[i]&&rs2[i])
1728   {
1729     if((opcode2[i]&4)==0) // 32-bit
1730     {
1731       current->u&=~(1LL<<HIREG);
1732       current->u&=~(1LL<<LOREG);
1733       alloc_reg(current,i,HIREG);
1734       alloc_reg(current,i,LOREG);
1735       alloc_reg(current,i,rs1[i]);
1736       alloc_reg(current,i,rs2[i]);
1737       current->is32|=1LL<<HIREG;
1738       current->is32|=1LL<<LOREG;
1739       dirty_reg(current,HIREG);
1740       dirty_reg(current,LOREG);
1741     }
1742     else // 64-bit
1743     {
1744       current->u&=~(1LL<<HIREG);
1745       current->u&=~(1LL<<LOREG);
1746       current->uu&=~(1LL<<HIREG);
1747       current->uu&=~(1LL<<LOREG);
1748       alloc_reg64(current,i,HIREG);
1749       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1750       alloc_reg64(current,i,rs1[i]);
1751       alloc_reg64(current,i,rs2[i]);
1752       alloc_all(current,i);
1753       current->is32&=~(1LL<<HIREG);
1754       current->is32&=~(1LL<<LOREG);
1755       dirty_reg(current,HIREG);
1756       dirty_reg(current,LOREG);
1757       minimum_free_regs[i]=HOST_REGS;
1758     }
1759   }
1760   else
1761   {
1762     // Multiply by zero is zero.
1763     // MIPS does not have a divide by zero exception.
1764     // The result is undefined, we return zero.
1765     alloc_reg(current,i,HIREG);
1766     alloc_reg(current,i,LOREG);
1767     current->is32|=1LL<<HIREG;
1768     current->is32|=1LL<<LOREG;
1769     dirty_reg(current,HIREG);
1770     dirty_reg(current,LOREG);
1771   }
1772 }
1773 #endif
1774
1775 void cop0_alloc(struct regstat *current,int i)
1776 {
1777   if(opcode2[i]==0) // MFC0
1778   {
1779     if(rt1[i]) {
1780       clear_const(current,rt1[i]);
1781       alloc_all(current,i);
1782       alloc_reg(current,i,rt1[i]);
1783       current->is32|=1LL<<rt1[i];
1784       dirty_reg(current,rt1[i]);
1785     }
1786   }
1787   else if(opcode2[i]==4) // MTC0
1788   {
1789     if(rs1[i]){
1790       clear_const(current,rs1[i]);
1791       alloc_reg(current,i,rs1[i]);
1792       alloc_all(current,i);
1793     }
1794     else {
1795       alloc_all(current,i); // FIXME: Keep r0
1796       current->u&=~1LL;
1797       alloc_reg(current,i,0);
1798     }
1799   }
1800   else
1801   {
1802     // TLBR/TLBWI/TLBWR/TLBP/ERET
1803     assert(opcode2[i]==0x10);
1804     alloc_all(current,i);
1805   }
1806   minimum_free_regs[i]=HOST_REGS;
1807 }
1808
1809 void cop1_alloc(struct regstat *current,int i)
1810 {
1811   alloc_reg(current,i,CSREG); // Load status
1812   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1813   {
1814     if(rt1[i]){
1815       clear_const(current,rt1[i]);
1816       if(opcode2[i]==1) {
1817         alloc_reg64(current,i,rt1[i]); // DMFC1
1818         current->is32&=~(1LL<<rt1[i]);
1819       }else{
1820         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1821         current->is32|=1LL<<rt1[i];
1822       }
1823       dirty_reg(current,rt1[i]);
1824     }
1825     alloc_reg_temp(current,i,-1);
1826   }
1827   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1828   {
1829     if(rs1[i]){
1830       clear_const(current,rs1[i]);
1831       if(opcode2[i]==5)
1832         alloc_reg64(current,i,rs1[i]); // DMTC1
1833       else
1834         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1835       alloc_reg_temp(current,i,-1);
1836     }
1837     else {
1838       current->u&=~1LL;
1839       alloc_reg(current,i,0);
1840       alloc_reg_temp(current,i,-1);
1841     }
1842   }
1843   minimum_free_regs[i]=1;
1844 }
1845 void fconv_alloc(struct regstat *current,int i)
1846 {
1847   alloc_reg(current,i,CSREG); // Load status
1848   alloc_reg_temp(current,i,-1);
1849   minimum_free_regs[i]=1;
1850 }
1851 void float_alloc(struct regstat *current,int i)
1852 {
1853   alloc_reg(current,i,CSREG); // Load status
1854   alloc_reg_temp(current,i,-1);
1855   minimum_free_regs[i]=1;
1856 }
1857 void c2op_alloc(struct regstat *current,int i)
1858 {
1859   alloc_reg_temp(current,i,-1);
1860 }
1861 void fcomp_alloc(struct regstat *current,int i)
1862 {
1863   alloc_reg(current,i,CSREG); // Load status
1864   alloc_reg(current,i,FSREG); // Load flags
1865   dirty_reg(current,FSREG); // Flag will be modified
1866   alloc_reg_temp(current,i,-1);
1867   minimum_free_regs[i]=1;
1868 }
1869
1870 void syscall_alloc(struct regstat *current,int i)
1871 {
1872   alloc_cc(current,i);
1873   dirty_reg(current,CCREG);
1874   alloc_all(current,i);
1875   minimum_free_regs[i]=HOST_REGS;
1876   current->isconst=0;
1877 }
1878
1879 void delayslot_alloc(struct regstat *current,int i)
1880 {
1881   switch(itype[i]) {
1882     case UJUMP:
1883     case CJUMP:
1884     case SJUMP:
1885     case RJUMP:
1886     case FJUMP:
1887     case SYSCALL:
1888     case HLECALL:
1889     case SPAN:
1890       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1891       printf("Disabled speculative precompilation\n");
1892       stop_after_jal=1;
1893       break;
1894     case IMM16:
1895       imm16_alloc(current,i);
1896       break;
1897     case LOAD:
1898     case LOADLR:
1899       load_alloc(current,i);
1900       break;
1901     case STORE:
1902     case STORELR:
1903       store_alloc(current,i);
1904       break;
1905     case ALU:
1906       alu_alloc(current,i);
1907       break;
1908     case SHIFT:
1909       shift_alloc(current,i);
1910       break;
1911     case MULTDIV:
1912       multdiv_alloc(current,i);
1913       break;
1914     case SHIFTIMM:
1915       shiftimm_alloc(current,i);
1916       break;
1917     case MOV:
1918       mov_alloc(current,i);
1919       break;
1920     case COP0:
1921       cop0_alloc(current,i);
1922       break;
1923     case COP1:
1924     case COP2:
1925       cop1_alloc(current,i);
1926       break;
1927     case C1LS:
1928       c1ls_alloc(current,i);
1929       break;
1930     case C2LS:
1931       c2ls_alloc(current,i);
1932       break;
1933     case FCONV:
1934       fconv_alloc(current,i);
1935       break;
1936     case FLOAT:
1937       float_alloc(current,i);
1938       break;
1939     case FCOMP:
1940       fcomp_alloc(current,i);
1941       break;
1942     case C2OP:
1943       c2op_alloc(current,i);
1944       break;
1945   }
1946 }
1947
1948 // Special case where a branch and delay slot span two pages in virtual memory
1949 static void pagespan_alloc(struct regstat *current,int i)
1950 {
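  // The delay slot lies on the next page and may not be compiled yet, so be
  // conservative: claim every host register (alloc_all), drop any constant
  // tracking, and require all registers free at this point.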
1951   current->isconst=0;
1952   current->wasconst=0;
1953   regs[i].wasconst=0;
1954   minimum_free_regs[i]=HOST_REGS;
1955   alloc_all(current,i);
1956   alloc_cc(current,i);
1957   dirty_reg(current,CCREG);
1958   if(opcode[i]==3) // JAL
1959   {
1960     alloc_reg(current,i,31);
1961     dirty_reg(current,31);
1962   }
1963   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1964   {
1965     alloc_reg(current,i,rs1[i]);
1966     if (rt1[i]!=0) {
1967       alloc_reg(current,i,rt1[i]);
1968       dirty_reg(current,rt1[i]);
1969     }
1970   }
1971   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1972   {
1973     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1974     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1975     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1976     {
1977       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1978       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1979     }
1980   }
1981   else
1982   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1983   {
1984     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1985     if(!((current->is32>>rs1[i])&1))
1986     {
1987       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1988     }
1989   }
1990   else
1991   if(opcode[i]==0x11) // BC1
1992   {
1993     alloc_reg(current,i,FSREG);
1994     alloc_reg(current,i,CSREG);
1995   }
1996   //else ...
1997 }
1998
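// Record an out-of-line stub to be generated after the main block: 'type'
// selects the stub kind (LOADB_STUB, STOREW_STUB, INVCODE_STUB, ...), 'addr'
// is the branch in the output code that will later be patched to jump to the
// stub, 'retaddr' is where the stub returns to, and a..e carry stub-specific
// arguments (typically i, the address register, i_regs, ccadj[i], reglist).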
1999 void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
2000 {
2001   stubs[stubcount][0]=type;
2002   stubs[stubcount][1]=addr;
2003   stubs[stubcount][2]=retaddr;
2004   stubs[stubcount][3]=a;
2005   stubs[stubcount][4]=b;
2006   stubs[stubcount][5]=c;
2007   stubs[stubcount][6]=d;
2008   stubs[stubcount][7]=e;
2009   stubcount++;
2010 }
2011
2012 // Write out a single register
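// A regmap entry with bit 6 set (r|64) stands for the upper 32 bits of MIPS
// register r, hence the (regmap[hr]&63)==r match and the r|64 name used when
// storing the high word.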
2013 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
2014 {
2015   int hr;
2016   for(hr=0;hr<HOST_REGS;hr++) {
2017     if(hr!=EXCLUDE_REG) {
2018       if((regmap[hr]&63)==r) {
2019         if((dirty>>hr)&1) {
2020           if(regmap[hr]<64) {
2021             emit_storereg(r,hr);
2022 #ifndef FORCE32
2023             if((is32>>regmap[hr])&1) {
2024               emit_sarimm(hr,31,hr);
2025               emit_storereg(r|64,hr);
2026             }
2027 #endif
2028           }else{
2029             emit_storereg(r|64,hr);
2030           }
2031         }
2032       }
2033     }
2034   }
2035 }
2036
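// Debug/trace helpers: mchecksum() folds main memory (rdram) into a 32-bit
// value with a shift/XOR mix, rchecksum() XORs the register file, and rlist()
// dumps the GPRs (plus the FPRs unless DISABLE_COP1).  They are used by the
// tracing code in memdebug() below.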
2037 int mchecksum()
2038 {
2039   //if(!tracedebug) return 0;
2040   int i;
2041   int sum=0;
2042   for(i=0;i<2097152;i++) {
2043     unsigned int temp=sum;
2044     sum<<=1;
2045     sum|=(~temp)>>31;
2046     sum^=((u_int *)rdram)[i];
2047   }
2048   return sum;
2049 }
2050 int rchecksum()
2051 {
2052   int i;
2053   int sum=0;
2054   for(i=0;i<64;i++)
2055     sum^=((u_int *)reg)[i];
2056   return sum;
2057 }
2058 void rlist()
2059 {
2060   int i;
2061   printf("TRACE: ");
2062   for(i=0;i<32;i++)
2063     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
2064   printf("\n");
2065 #ifndef DISABLE_COP1
2066   printf("TRACE: ");
2067   for(i=0;i<32;i++)
2068     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
2069   printf("\n");
2070 #endif
2071 }
2072
2073 void enabletrace()
2074 {
2075   tracedebug=1;
2076 }
2077
2078 void memdebug(int i)
2079 {
2080   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
2081   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
2082   //rlist();
2083   //if(tracedebug) {
2084   //if(Count>=-2084597794) {
2085   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
2086   //if(0) {
2087     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
2088     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2089     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2090     rlist();
2091     #ifdef __i386__
2092     printf("TRACE: %x\n",(&i)[-1]);
2093     #endif
2094     #ifdef __arm__
2095     int j;
2096     printf("TRACE: %x \n",(&j)[10]);
2097     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2098     #endif
2099     //fflush(stdout);
2100   }
2101   //printf("TRACE: %x\n",(&i)[-1]);
2102 }
2103
2104 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2105 {
2106   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2107 }
2108
2109 void alu_assemble(int i,struct regstat *i_regs)
2110 {
2111   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2112     if(rt1[i]) {
2113       signed char s1,s2,t;
2114       t=get_reg(i_regs->regmap,rt1[i]);
2115       if(t>=0) {
2116         s1=get_reg(i_regs->regmap,rs1[i]);
2117         s2=get_reg(i_regs->regmap,rs2[i]);
2118         if(rs1[i]&&rs2[i]) {
2119           assert(s1>=0);
2120           assert(s2>=0);
2121           if(opcode2[i]&2) emit_sub(s1,s2,t);
2122           else emit_add(s1,s2,t);
2123         }
2124         else if(rs1[i]) {
2125           if(s1>=0) emit_mov(s1,t);
2126           else emit_loadreg(rs1[i],t);
2127         }
2128         else if(rs2[i]) {
2129           if(s2>=0) {
2130             if(opcode2[i]&2) emit_neg(s2,t);
2131             else emit_mov(s2,t);
2132           }
2133           else {
2134             emit_loadreg(rs2[i],t);
2135             if(opcode2[i]&2) emit_neg(t,t);
2136           }
2137         }
2138         else emit_zeroreg(t);
2139       }
2140     }
2141   }
2142   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2143     if(rt1[i]) {
2144       signed char s1l,s2l,s1h,s2h,tl,th;
2145       tl=get_reg(i_regs->regmap,rt1[i]);
2146       th=get_reg(i_regs->regmap,rt1[i]|64);
2147       if(tl>=0) {
2148         s1l=get_reg(i_regs->regmap,rs1[i]);
2149         s2l=get_reg(i_regs->regmap,rs2[i]);
2150         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2151         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2152         if(rs1[i]&&rs2[i]) {
2153           assert(s1l>=0);
2154           assert(s2l>=0);
2155           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2156           else emit_adds(s1l,s2l,tl);
2157           if(th>=0) {
2158             #ifdef INVERTED_CARRY
2159             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2160             #else
2161             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2162             #endif
2163             else emit_add(s1h,s2h,th);
2164           }
2165         }
2166         else if(rs1[i]) {
2167           if(s1l>=0) emit_mov(s1l,tl);
2168           else emit_loadreg(rs1[i],tl);
2169           if(th>=0) {
2170             if(s1h>=0) emit_mov(s1h,th);
2171             else emit_loadreg(rs1[i]|64,th);
2172           }
2173         }
2174         else if(rs2[i]) {
2175           if(s2l>=0) {
2176             if(opcode2[i]&2) emit_negs(s2l,tl);
2177             else emit_mov(s2l,tl);
2178           }
2179           else {
2180             emit_loadreg(rs2[i],tl);
2181             if(opcode2[i]&2) emit_negs(tl,tl);
2182           }
2183           if(th>=0) {
2184             #ifdef INVERTED_CARRY
2185             if(s2h>=0) emit_mov(s2h,th);
2186             else emit_loadreg(rs2[i]|64,th);
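            // DSUB(U) from r0 (negate): the NEG on the low word left x86's
            // carry set iff the low word was nonzero, so ~(th-1+CF) yields
            // ~th when lo!=0 and -th when lo==0, which is the high word of
            // the 64-bit two's-complement negation.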
2187             if(opcode2[i]&2) {
2188               emit_adcimm(-1,th); // x86 has inverted carry flag
2189               emit_not(th,th);
2190             }
2191             #else
2192             if(opcode2[i]&2) {
2193               if(s2h>=0) emit_rscimm(s2h,0,th);
2194               else {
2195                 emit_loadreg(rs2[i]|64,th);
2196                 emit_rscimm(th,0,th);
2197               }
2198             }else{
2199               if(s2h>=0) emit_mov(s2h,th);
2200               else emit_loadreg(rs2[i]|64,th);
2201             }
2202             #endif
2203           }
2204         }
2205         else {
2206           emit_zeroreg(tl);
2207           if(th>=0) emit_zeroreg(th);
2208         }
2209       }
2210     }
2211   }
2212   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2213     if(rt1[i]) {
2214       signed char s1l,s1h,s2l,s2h,t;
2215       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2216       {
2217         t=get_reg(i_regs->regmap,rt1[i]);
2218         //assert(t>=0);
2219         if(t>=0) {
2220           s1l=get_reg(i_regs->regmap,rs1[i]);
2221           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2222           s2l=get_reg(i_regs->regmap,rs2[i]);
2223           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2224           if(rs2[i]==0) // rx<r0
2225           {
2226             assert(s1h>=0);
2227             if(opcode2[i]==0x2a) // SLT
2228               emit_shrimm(s1h,31,t);
2229             else // SLTU (unsigned cannot be less than zero)
2230               emit_zeroreg(t);
2231           }
2232           else if(rs1[i]==0) // r0<rx
2233           {
2234             assert(s2h>=0);
2235             if(opcode2[i]==0x2a) // SLT
2236               emit_set_gz64_32(s2h,s2l,t);
2237             else // SLTU (set if not zero)
2238               emit_set_nz64_32(s2h,s2l,t);
2239           }
2240           else {
2241             assert(s1l>=0);assert(s1h>=0);
2242             assert(s2l>=0);assert(s2h>=0);
2243             if(opcode2[i]==0x2a) // SLT
2244               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2245             else // SLTU
2246               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2247           }
2248         }
2249       } else {
2250         t=get_reg(i_regs->regmap,rt1[i]);
2251         //assert(t>=0);
2252         if(t>=0) {
2253           s1l=get_reg(i_regs->regmap,rs1[i]);
2254           s2l=get_reg(i_regs->regmap,rs2[i]);
2255           if(rs2[i]==0) // rx<r0
2256           {
2257             assert(s1l>=0);
2258             if(opcode2[i]==0x2a) // SLT
2259               emit_shrimm(s1l,31,t);
2260             else // SLTU (unsigned cannot be less than zero)
2261               emit_zeroreg(t);
2262           }
2263           else if(rs1[i]==0) // r0<rx
2264           {
2265             assert(s2l>=0);
2266             if(opcode2[i]==0x2a) // SLT
2267               emit_set_gz32(s2l,t);
2268             else // SLTU (set if not zero)
2269               emit_set_nz32(s2l,t);
2270           }
2271           else{
2272             assert(s1l>=0);assert(s2l>=0);
2273             if(opcode2[i]==0x2a) // SLT
2274               emit_set_if_less32(s1l,s2l,t);
2275             else // SLTU
2276               emit_set_if_carry32(s1l,s2l,t);
2277           }
2278         }
2279       }
2280     }
2281   }
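  // (Net effect of the SLT/SLTU cases above: t = (rs1<rs2), computed as a
  //  signed compare for SLT and an unsigned compare for SLTU, using the
  //  high/low register pairs when either source is still 64-bit.)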
2282   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2283     if(rt1[i]) {
2284       signed char s1l,s1h,s2l,s2h,th,tl;
2285       tl=get_reg(i_regs->regmap,rt1[i]);
2286       th=get_reg(i_regs->regmap,rt1[i]|64);
2287       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2288       {
2289         assert(tl>=0);
2290         if(tl>=0) {
2291           s1l=get_reg(i_regs->regmap,rs1[i]);
2292           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2293           s2l=get_reg(i_regs->regmap,rs2[i]);
2294           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2295           if(rs1[i]&&rs2[i]) {
2296             assert(s1l>=0);assert(s1h>=0);
2297             assert(s2l>=0);assert(s2h>=0);
2298             if(opcode2[i]==0x24) { // AND
2299               emit_and(s1l,s2l,tl);
2300               emit_and(s1h,s2h,th);
2301             } else
2302             if(opcode2[i]==0x25) { // OR
2303               emit_or(s1l,s2l,tl);
2304               emit_or(s1h,s2h,th);
2305             } else
2306             if(opcode2[i]==0x26) { // XOR
2307               emit_xor(s1l,s2l,tl);
2308               emit_xor(s1h,s2h,th);
2309             } else
2310             if(opcode2[i]==0x27) { // NOR
2311               emit_or(s1l,s2l,tl);
2312               emit_or(s1h,s2h,th);
2313               emit_not(tl,tl);
2314               emit_not(th,th);
2315             }
2316           }
2317           else
2318           {
2319             if(opcode2[i]==0x24) { // AND
2320               emit_zeroreg(tl);
2321               emit_zeroreg(th);
2322             } else
2323             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2324               if(rs1[i]){
2325                 if(s1l>=0) emit_mov(s1l,tl);
2326                 else emit_loadreg(rs1[i],tl);
2327                 if(s1h>=0) emit_mov(s1h,th);
2328                 else emit_loadreg(rs1[i]|64,th);
2329               }
2330               else
2331               if(rs2[i]){
2332                 if(s2l>=0) emit_mov(s2l,tl);
2333                 else emit_loadreg(rs2[i],tl);
2334                 if(s2h>=0) emit_mov(s2h,th);
2335                 else emit_loadreg(rs2[i]|64,th);
2336               }
2337               else{
2338                 emit_zeroreg(tl);
2339                 emit_zeroreg(th);
2340               }
2341             } else
2342             if(opcode2[i]==0x27) { // NOR
2343               if(rs1[i]){
2344                 if(s1l>=0) emit_not(s1l,tl);
2345                 else{
2346                   emit_loadreg(rs1[i],tl);
2347                   emit_not(tl,tl);
2348                 }
2349                 if(s1h>=0) emit_not(s1h,th);
2350                 else{
2351                   emit_loadreg(rs1[i]|64,th);
2352                   emit_not(th,th);
2353                 }
2354               }
2355               else
2356               if(rs2[i]){
2357                 if(s2l>=0) emit_not(s2l,tl);
2358                 else{
2359                   emit_loadreg(rs2[i],tl);
2360                   emit_not(tl,tl);
2361                 }
2362                 if(s2h>=0) emit_not(s2h,th);
2363                 else{
2364                   emit_loadreg(rs2[i]|64,th);
2365                   emit_not(th,th);
2366                 }
2367               }
2368               else {
2369                 emit_movimm(-1,tl);
2370                 emit_movimm(-1,th);
2371               }
2372             }
2373           }
2374         }
2375       }
2376       else
2377       {
2378         // 32 bit
2379         if(tl>=0) {
2380           s1l=get_reg(i_regs->regmap,rs1[i]);
2381           s2l=get_reg(i_regs->regmap,rs2[i]);
2382           if(rs1[i]&&rs2[i]) {
2383             assert(s1l>=0);
2384             assert(s2l>=0);
2385             if(opcode2[i]==0x24) { // AND
2386               emit_and(s1l,s2l,tl);
2387             } else
2388             if(opcode2[i]==0x25) { // OR
2389               emit_or(s1l,s2l,tl);
2390             } else
2391             if(opcode2[i]==0x26) { // XOR
2392               emit_xor(s1l,s2l,tl);
2393             } else
2394             if(opcode2[i]==0x27) { // NOR
2395               emit_or(s1l,s2l,tl);
2396               emit_not(tl,tl);
2397             }
2398           }
2399           else
2400           {
2401             if(opcode2[i]==0x24) { // AND
2402               emit_zeroreg(tl);
2403             } else
2404             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2405               if(rs1[i]){
2406                 if(s1l>=0) emit_mov(s1l,tl);
2407                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2408               }
2409               else
2410               if(rs2[i]){
2411                 if(s2l>=0) emit_mov(s2l,tl);
2412                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2413               }
2414               else emit_zeroreg(tl);
2415             } else
2416             if(opcode2[i]==0x27) { // NOR
2417               if(rs1[i]){
2418                 if(s1l>=0) emit_not(s1l,tl);
2419                 else {
2420                   emit_loadreg(rs1[i],tl);
2421                   emit_not(tl,tl);
2422                 }
2423               }
2424               else
2425               if(rs2[i]){
2426                 if(s2l>=0) emit_not(s2l,tl);
2427                 else {
2428                   emit_loadreg(rs2[i],tl);
2429                   emit_not(tl,tl);
2430                 }
2431               }
2432               else emit_movimm(-1,tl);
2433             }
2434           }
2435         }
2436       }
2437     }
2438   }
2439 }
2440
2441 void imm16_assemble(int i,struct regstat *i_regs)
2442 {
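  // Constant propagation: isconst/wasconst flag host registers whose value is
  // known at recompile time (the value itself lives in constmap), so an
  // immediate op with a constant source folds into a single emit_movimm, and
  // a target that is already flagged constant needs no code at all.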
2443   if (opcode[i]==0x0f) { // LUI
2444     if(rt1[i]) {
2445       signed char t;
2446       t=get_reg(i_regs->regmap,rt1[i]);
2447       //assert(t>=0);
2448       if(t>=0) {
2449         if(!((i_regs->isconst>>t)&1))
2450           emit_movimm(imm[i]<<16,t);
2451       }
2452     }
2453   }
2454   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2455     if(rt1[i]) {
2456       signed char s,t;
2457       t=get_reg(i_regs->regmap,rt1[i]);
2458       s=get_reg(i_regs->regmap,rs1[i]);
2459       if(rs1[i]) {
2460         //assert(t>=0);
2461         //assert(s>=0);
2462         if(t>=0) {
2463           if(!((i_regs->isconst>>t)&1)) {
2464             if(s<0) {
2465               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2466               emit_addimm(t,imm[i],t);
2467             }else{
2468               if(!((i_regs->wasconst>>s)&1))
2469                 emit_addimm(s,imm[i],t);
2470               else
2471                 emit_movimm(constmap[i][s]+imm[i],t);
2472             }
2473           }
2474         }
2475       } else {
2476         if(t>=0) {
2477           if(!((i_regs->isconst>>t)&1))
2478             emit_movimm(imm[i],t);
2479         }
2480       }
2481     }
2482   }
2483   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2484     if(rt1[i]) {
2485       signed char sh,sl,th,tl;
2486       th=get_reg(i_regs->regmap,rt1[i]|64);
2487       tl=get_reg(i_regs->regmap,rt1[i]);
2488       sh=get_reg(i_regs->regmap,rs1[i]|64);
2489       sl=get_reg(i_regs->regmap,rs1[i]);
2490       if(tl>=0) {
2491         if(rs1[i]) {
2492           assert(sh>=0);
2493           assert(sl>=0);
2494           if(th>=0) {
2495             emit_addimm64_32(sh,sl,imm[i],th,tl);
2496           }
2497           else {
2498             emit_addimm(sl,imm[i],tl);
2499           }
2500         } else {
2501           emit_movimm(imm[i],tl);
2502           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2503         }
2504       }
2505     }
2506   }
2507   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2508     if(rt1[i]) {
2509       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2510       signed char sh,sl,t;
2511       t=get_reg(i_regs->regmap,rt1[i]);
2512       sh=get_reg(i_regs->regmap,rs1[i]|64);
2513       sl=get_reg(i_regs->regmap,rs1[i]);
2514       //assert(t>=0);
2515       if(t>=0) {
2516         if(rs1[i]>0) {
2517           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2518           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2519             if(opcode[i]==0x0a) { // SLTI
2520               if(sl<0) {
2521                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2522                 emit_slti32(t,imm[i],t);
2523               }else{
2524                 emit_slti32(sl,imm[i],t);
2525               }
2526             }
2527             else { // SLTIU
2528               if(sl<0) {
2529                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2530                 emit_sltiu32(t,imm[i],t);
2531               }else{
2532                 emit_sltiu32(sl,imm[i],t);
2533               }
2534             }
2535           }else{ // 64-bit
2536             assert(sl>=0);
2537             if(opcode[i]==0x0a) // SLTI
2538               emit_slti64_32(sh,sl,imm[i],t);
2539             else // SLTIU
2540               emit_sltiu64_32(sh,sl,imm[i],t);
2541           }
2542         }else{
2543           // SLTI(U) with r0 is just stupid,
2544           // nonetheless examples can be found
2545           if(opcode[i]==0x0a) // SLTI
2546             if(0<imm[i]) emit_movimm(1,t);
2547             else emit_zeroreg(t);
2548           else // SLTIU
2549           {
2550             if(imm[i]) emit_movimm(1,t);
2551             else emit_zeroreg(t);
2552           }
2553         }
2554       }
2555     }
2556   }
2557   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2558     if(rt1[i]) {
2559       signed char sh,sl,th,tl;
2560       th=get_reg(i_regs->regmap,rt1[i]|64);
2561       tl=get_reg(i_regs->regmap,rt1[i]);
2562       sh=get_reg(i_regs->regmap,rs1[i]|64);
2563       sl=get_reg(i_regs->regmap,rs1[i]);
2564       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2565         if(opcode[i]==0x0c) //ANDI
2566         {
2567           if(rs1[i]) {
2568             if(sl<0) {
2569               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2570               emit_andimm(tl,imm[i],tl);
2571             }else{
2572               if(!((i_regs->wasconst>>sl)&1))
2573                 emit_andimm(sl,imm[i],tl);
2574               else
2575                 emit_movimm(constmap[i][sl]&imm[i],tl);
2576             }
2577           }
2578           else
2579             emit_zeroreg(tl);
2580           if(th>=0) emit_zeroreg(th);
2581         }
2582         else
2583         {
2584           if(rs1[i]) {
2585             if(sl<0) {
2586               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2587             }
2588             if(th>=0) {
2589               if(sh<0) {
2590                 emit_loadreg(rs1[i]|64,th);
2591               }else{
2592                 emit_mov(sh,th);
2593               }
2594             }
2595             if(opcode[i]==0x0d) //ORI
2596             if(sl<0) {
2597               emit_orimm(tl,imm[i],tl);
2598             }else{
2599               if(!((i_regs->wasconst>>sl)&1))
2600                 emit_orimm(sl,imm[i],tl);
2601               else
2602                 emit_movimm(constmap[i][sl]|imm[i],tl);
2603             }
2604             if(opcode[i]==0x0e) //XORI
2605             if(sl<0) {
2606               emit_xorimm(tl,imm[i],tl);
2607             }else{
2608               if(!((i_regs->wasconst>>sl)&1))
2609                 emit_xorimm(sl,imm[i],tl);
2610               else
2611                 emit_movimm(constmap[i][sl]^imm[i],tl);
2612             }
2613           }
2614           else {
2615             emit_movimm(imm[i],tl);
2616             if(th>=0) emit_zeroreg(th);
2617           }
2618         }
2619       }
2620     }
2621   }
2622 }
2623
2624 void shiftimm_assemble(int i,struct regstat *i_regs)
2625 {
2626   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2627   {
2628     if(rt1[i]) {
2629       signed char s,t;
2630       t=get_reg(i_regs->regmap,rt1[i]);
2631       s=get_reg(i_regs->regmap,rs1[i]);
2632       //assert(t>=0);
2633       if(t>=0){
2634         if(rs1[i]==0)
2635         {
2636           emit_zeroreg(t);
2637         }
2638         else
2639         {
2640           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2641           if(imm[i]) {
2642             if(opcode2[i]==0) // SLL
2643             {
2644               emit_shlimm(s<0?t:s,imm[i],t);
2645             }
2646             if(opcode2[i]==2) // SRL
2647             {
2648               emit_shrimm(s<0?t:s,imm[i],t);
2649             }
2650             if(opcode2[i]==3) // SRA
2651             {
2652               emit_sarimm(s<0?t:s,imm[i],t);
2653             }
2654           }else{
2655             // Shift by zero
2656             if(s>=0 && s!=t) emit_mov(s,t);
2657           }
2658         }
2659       }
2660       //emit_storereg(rt1[i],t); //DEBUG
2661     }
2662   }
2663   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2664   {
2665     if(rt1[i]) {
2666       signed char sh,sl,th,tl;
2667       th=get_reg(i_regs->regmap,rt1[i]|64);
2668       tl=get_reg(i_regs->regmap,rt1[i]);
2669       sh=get_reg(i_regs->regmap,rs1[i]|64);
2670       sl=get_reg(i_regs->regmap,rs1[i]);
2671       if(tl>=0) {
2672         if(rs1[i]==0)
2673         {
2674           emit_zeroreg(tl);
2675           if(th>=0) emit_zeroreg(th);
2676         }
2677         else
2678         {
2679           assert(sl>=0);
2680           assert(sh>=0);
2681           if(imm[i]) {
2682             if(opcode2[i]==0x38) // DSLL
2683             {
2684               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2685               emit_shlimm(sl,imm[i],tl);
2686             }
2687             if(opcode2[i]==0x3a) // DSRL
2688             {
2689               emit_shrdimm(sl,sh,imm[i],tl);
2690               if(th>=0) emit_shrimm(sh,imm[i],th);
2691             }
2692             if(opcode2[i]==0x3b) // DSRA
2693             {
2694               emit_shrdimm(sl,sh,imm[i],tl);
2695               if(th>=0) emit_sarimm(sh,imm[i],th);
2696             }
2697           }else{
2698             // Shift by zero
2699             if(sl!=tl) emit_mov(sl,tl);
2700             if(th>=0&&sh!=th) emit_mov(sh,th);
2701           }
2702         }
2703       }
2704     }
2705   }
2706   if(opcode2[i]==0x3c) // DSLL32
2707   {
2708     if(rt1[i]) {
2709       signed char sl,tl,th;
2710       tl=get_reg(i_regs->regmap,rt1[i]);
2711       th=get_reg(i_regs->regmap,rt1[i]|64);
2712       sl=get_reg(i_regs->regmap,rs1[i]);
2713       if(th>=0||tl>=0){
2714         assert(tl>=0);
2715         assert(th>=0);
2716         assert(sl>=0);
2717         emit_mov(sl,th);
2718         emit_zeroreg(tl);
2719         if(imm[i]>32)
2720         {
2721           emit_shlimm(th,imm[i]&31,th);
2722         }
2723       }
2724     }
2725   }
2726   if(opcode2[i]==0x3e) // DSRL32
2727   {
2728     if(rt1[i]) {
2729       signed char sh,tl,th;
2730       tl=get_reg(i_regs->regmap,rt1[i]);
2731       th=get_reg(i_regs->regmap,rt1[i]|64);
2732       sh=get_reg(i_regs->regmap,rs1[i]|64);
2733       if(tl>=0){
2734         assert(sh>=0);
2735         emit_mov(sh,tl);
2736         if(th>=0) emit_zeroreg(th);
2737         if(imm[i]>32)
2738         {
2739           emit_shrimm(tl,imm[i]&31,tl);
2740         }
2741       }
2742     }
2743   }
2744   if(opcode2[i]==0x3f) // DSRA32
2745   {
2746     if(rt1[i]) {
2747       signed char sh,tl;
2748       tl=get_reg(i_regs->regmap,rt1[i]);
2749       sh=get_reg(i_regs->regmap,rs1[i]|64);
2750       if(tl>=0){
2751         assert(sh>=0);
2752         emit_mov(sh,tl);
2753         if(imm[i]>32)
2754         {
2755           emit_sarimm(tl,imm[i]&31,tl);
2756         }
2757       }
2758     }
2759   }
2760 }
2761
2762 #ifndef shift_assemble
2763 void shift_assemble(int i,struct regstat *i_regs)
2764 {
2765   printf("Need shift_assemble for this architecture.\n");
2766   exit(1);
2767 }
2768 #endif
2769
2770 void load_assemble(int i,struct regstat *i_regs)
2771 {
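  // Emit code for LB/LBU/LH/LHU/LW/LWU/LD: form the effective address,
  // range-check it against RAM (or translate it through the TLB map), read
  // directly when the target is known to be RAM, and otherwise branch to a
  // LOAD*_STUB that calls back into the C memory handlers.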
2772   int s,th,tl,addr,map=-1;
2773   int offset;
2774   int jaddr=0;
2775   int memtarget=0,c=0;
2776   u_int hr,reglist=0;
2777   th=get_reg(i_regs->regmap,rt1[i]|64);
2778   tl=get_reg(i_regs->regmap,rt1[i]);
2779   s=get_reg(i_regs->regmap,rs1[i]);
2780   offset=imm[i];
2781   for(hr=0;hr<HOST_REGS;hr++) {
2782     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2783   }
2784   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2785   if(s>=0) {
2786     c=(i_regs->wasconst>>s)&1;
2787     if (c) {
2788       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2789       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2790     }
2791   }
2792   //printf("load_assemble: c=%d\n",c);
2793   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2794   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2795 #ifdef PCSX
2796   if(tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80)
2797     ||rt1[i]==0) {
2798       // could be FIFO, must perform the read
2799       // ||dummy read
2800       assem_debug("(forced read)\n");
2801       tl=get_reg(i_regs->regmap,-1);
2802       assert(tl>=0);
2803   }
2804 #endif
2805   if(offset||s<0||c) addr=tl;
2806   else addr=s;
2807   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2808  if(tl>=0) {
2809   //printf("load_assemble: c=%d\n",c);
2810   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2811   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2812   reglist&=~(1<<tl);
2813   if(th>=0) reglist&=~(1<<th);
2814   if(!using_tlb) {
2815     if(!c) {
2816       #ifdef RAM_OFFSET
2817       map=get_reg(i_regs->regmap,ROREG);
2818       if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2819       #endif
2820 //#define R29_HACK 1
2821       #ifdef R29_HACK
2822       // Strmnnrmn's speed hack
2823       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2824       #endif
2825       {
2826         emit_cmpimm(addr,RAM_SIZE);
2827         jaddr=(int)out;
2828         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2829         // Hint to branch predictor that the branch is unlikely to be taken
2830         if(rs1[i]>=28)
2831           emit_jno_unlikely(0);
2832         else
2833         #endif
2834         emit_jno(0);
2835       }
2836     }
2837   }else{ // using tlb
2838     int x=0;
2839     if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2840     if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2841     map=get_reg(i_regs->regmap,TLREG);
2842     assert(map>=0);
2843     map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2844     do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2845   }
2846   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2847   if (opcode[i]==0x20) { // LB
2848     if(!c||memtarget) {
2849       if(!dummy) {
2850         #ifdef HOST_IMM_ADDR32
2851         if(c)
2852           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2853         else
2854         #endif
2855         {
2856           //emit_xorimm(addr,3,tl);
2857           //gen_tlb_addr_r(tl,map);
2858           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2859           int x=0,a=tl;
2860 #ifdef BIG_ENDIAN_MIPS
2861           if(!c) emit_xorimm(addr,3,tl);
2862           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2863 #else
2864           if(!c) a=addr;
2865 #endif
2866           emit_movsbl_indexed_tlb(x,a,map,tl);
2867         }
2868       }
2869       if(jaddr)
2870         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2871     }
2872     else
2873       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2874   }
2875   if (opcode[i]==0x21) { // LH
2876     if(!c||memtarget) {
2877       if(!dummy) {
2878         #ifdef HOST_IMM_ADDR32
2879         if(c)
2880           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2881         else
2882         #endif
2883         {
2884           int x=0,a=tl;
2885 #ifdef BIG_ENDIAN_MIPS
2886           if(!c) emit_xorimm(addr,2,tl);
2887           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2888 #else
2889           if(!c) a=addr;
2890 #endif
2891           //#ifdef
2892           //emit_movswl_indexed_tlb(x,tl,map,tl);
2893           //else
2894           if(map>=0) {
2895             gen_tlb_addr_r(a,map);
2896             emit_movswl_indexed(x,a,tl);
2897           }else{
2898             #ifdef RAM_OFFSET
2899             emit_movswl_indexed(x,a,tl);
2900             #else
2901             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2902             #endif
2903           }
2904         }
2905       }
2906       if(jaddr)
2907         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2908     }
2909     else
2910       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2911   }
2912   if (opcode[i]==0x23) { // LW
2913     if(!c||memtarget) {
2914       if(!dummy) {
2915         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2916         #ifdef HOST_IMM_ADDR32
2917         if(c)
2918           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2919         else
2920         #endif
2921         emit_readword_indexed_tlb(0,addr,map,tl);
2922       }
2923       if(jaddr)
2924         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2925     }
2926     else
2927       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2928   }
2929   if (opcode[i]==0x24) { // LBU
2930     if(!c||memtarget) {
2931       if(!dummy) {
2932         #ifdef HOST_IMM_ADDR32
2933         if(c)
2934           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2935         else
2936         #endif
2937         {
2938           //emit_xorimm(addr,3,tl);
2939           //gen_tlb_addr_r(tl,map);
2940           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2941           int x=0,a=tl;
2942 #ifdef BIG_ENDIAN_MIPS
2943           if(!c) emit_xorimm(addr,3,tl);
2944           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2945 #else
2946           if(!c) a=addr;
2947 #endif
2948           emit_movzbl_indexed_tlb(x,a,map,tl);
2949         }
2950       }
2951       if(jaddr)
2952         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2953     }
2954     else
2955       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2956   }
2957   if (opcode[i]==0x25) { // LHU
2958     if(!c||memtarget) {
2959       if(!dummy) {
2960         #ifdef HOST_IMM_ADDR32
2961         if(c)
2962           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2963         else
2964         #endif
2965         {
2966           int x=0,a=tl;
2967 #ifdef BIG_ENDIAN_MIPS
2968           if(!c) emit_xorimm(addr,2,tl);
2969           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2970 #else
2971           if(!c) a=addr;
2972 #endif
2973           //#ifdef
2974           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2975           //#else
2976           if(map>=0) {
2977             gen_tlb_addr_r(a,map);
2978             emit_movzwl_indexed(x,a,tl);
2979           }else{
2980             #ifdef RAM_OFFSET
2981             emit_movzwl_indexed(x,a,tl);
2982             #else
2983             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
2984             #endif
2985           }
2986         }
2987       }
2988       if(jaddr)
2989         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2990     }
2991     else
2992       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2993   }
2994   if (opcode[i]==0x27) { // LWU
2995     assert(th>=0);
2996     if(!c||memtarget) {
2997       if(!dummy) {
2998         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2999         #ifdef HOST_IMM_ADDR32
3000         if(c)
3001           emit_readword_tlb(constmap[i][s]+offset,map,tl);
3002         else
3003         #endif
3004         emit_readword_indexed_tlb(0,addr,map,tl);
3005       }
3006       if(jaddr)
3007         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3008     }
3009     else {
3010       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3011     }
3012     emit_zeroreg(th);
3013   }
3014   if (opcode[i]==0x37) { // LD
3015     if(!c||memtarget) {
3016       if(!dummy) {
3017         //gen_tlb_addr_r(tl,map);
3018         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
3019         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
3020         #ifdef HOST_IMM_ADDR32
3021         if(c)
3022           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3023         else
3024         #endif
3025         emit_readdword_indexed_tlb(0,addr,map,th,tl);
3026       }
3027       if(jaddr)
3028         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3029     }
3030     else
3031       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3032   }
3033  }
3034   //emit_storereg(rt1[i],tl); // DEBUG
3035   //if(opcode[i]==0x23)
3036   //if(opcode[i]==0x24)
3037   //if(opcode[i]==0x23||opcode[i]==0x24)
3038   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
3039   {
3040     //emit_pusha();
3041     save_regs(0x100f);
3042         emit_readword((int)&last_count,ECX);
3043         #ifdef __i386__
3044         if(get_reg(i_regs->regmap,CCREG)<0)
3045           emit_loadreg(CCREG,HOST_CCREG);
3046         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3047         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3048         emit_writeword(HOST_CCREG,(int)&Count);
3049         #endif
3050         #ifdef __arm__
3051         if(get_reg(i_regs->regmap,CCREG)<0)
3052           emit_loadreg(CCREG,0);
3053         else
3054           emit_mov(HOST_CCREG,0);
3055         emit_add(0,ECX,0);
3056         emit_addimm(0,2*ccadj[i],0);
3057         emit_writeword(0,(int)&Count);
3058         #endif
3059     emit_call((int)memdebug);
3060     //emit_popa();
3061     restore_regs(0x100f);
3062   }/**/
3063 }
3064
3065 #ifndef loadlr_assemble
3066 void loadlr_assemble(int i,struct regstat *i_regs)
3067 {
3068   printf("Need loadlr_assemble for this architecture.\n");
3069   exit(1);
3070 }
3071 #endif
3072
3073 void store_assemble(int i,struct regstat *i_regs)
3074 {
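  // Emit code for SB/SH/SW/SD: range-check or TLB-translate the address,
  // write directly when the target is RAM, consult invalid_code so writes
  // that land on already-compiled code trigger invalidation, and fall back
  // to a STORE*_STUB for everything else (I/O, out-of-range addresses).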
3075   int s,th,tl,map=-1;
3076   int addr,temp;
3077   int offset;
3078   int jaddr=0,jaddr2,type;
3079   int memtarget=0,c=0;
3080   int agr=AGEN1+(i&1);
3081   u_int hr,reglist=0;
3082   th=get_reg(i_regs->regmap,rs2[i]|64);
3083   tl=get_reg(i_regs->regmap,rs2[i]);
3084   s=get_reg(i_regs->regmap,rs1[i]);
3085   temp=get_reg(i_regs->regmap,agr);
3086   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3087   offset=imm[i];
3088   if(s>=0) {
3089     c=(i_regs->wasconst>>s)&1;
3090     if(c) {
3091       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3092       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3093     }
3094   }
3095   assert(tl>=0);
3096   assert(temp>=0);
3097   for(hr=0;hr<HOST_REGS;hr++) {
3098     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3099   }
3100   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3101   if(offset||s<0||c) addr=temp;
3102   else addr=s;
3103   if(!using_tlb) {
3104     if(!c) {
3105       #ifdef R29_HACK
3106       // Strmnnrmn's speed hack
3107       memtarget=1;
3108       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3109       #endif
3110       emit_cmpimm(addr,RAM_SIZE);
3111       #ifdef DESTRUCTIVE_SHIFT
3112       if(s==addr) emit_mov(s,temp);
3113       #endif
3114       #ifdef R29_HACK
3115       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3116       #endif
3117       {
3118         jaddr=(int)out;
3119         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
3120         // Hint to branch predictor that the branch is unlikely to be taken
3121         if(rs1[i]>=28)
3122           emit_jno_unlikely(0);
3123         else
3124         #endif
3125         emit_jno(0);
3126       }
3127     }
3128   }else{ // using tlb
3129     int x=0;
3130     if (opcode[i]==0x28) x=3; // SB
3131     if (opcode[i]==0x29) x=2; // SH
3132     map=get_reg(i_regs->regmap,TLREG);
3133     assert(map>=0);
3134     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
3135     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3136   }
3137
3138   if (opcode[i]==0x28) { // SB
3139     if(!c||memtarget) {
3140       int x=0,a=temp;
3141 #ifdef BIG_ENDIAN_MIPS
3142       if(!c) emit_xorimm(addr,3,temp);
3143       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3144 #else
3145       if(!c) a=addr;
3146 #endif
3147       //gen_tlb_addr_w(temp,map);
3148       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3149       emit_writebyte_indexed_tlb(tl,x,a,map,a);
3150     }
3151     type=STOREB_STUB;
3152   }
3153   if (opcode[i]==0x29) { // SH
3154     if(!c||memtarget) {
3155       int x=0,a=temp;
3156 #ifdef BIG_ENDIAN_MIPS
3157       if(!c) emit_xorimm(addr,2,temp);
3158       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3159 #else
3160       if(!c) a=addr;
3161 #endif
3162       //#ifdef
3163       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3164       //#else
3165       if(map>=0) {
3166         gen_tlb_addr_w(a,map);
3167         emit_writehword_indexed(tl,x,a);
3168       }else
3169         emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
3170     }
3171     type=STOREH_STUB;
3172   }
3173   if (opcode[i]==0x2B) { // SW
3174     if(!c||memtarget)
3175       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3176       emit_writeword_indexed_tlb(tl,0,addr,map,temp);
3177     type=STOREW_STUB;
3178   }
3179   if (opcode[i]==0x3F) { // SD
3180     if(!c||memtarget) {
3181       if(rs2[i]) {
3182         assert(th>=0);
3183         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3184         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3185         emit_writedword_indexed_tlb(th,tl,0,addr,map,temp);
3186       }else{
3187         // Store zero
3188         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3189         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3190         emit_writedword_indexed_tlb(tl,tl,0,addr,map,temp);
3191       }
3192     }
3193     type=STORED_STUB;
3194   }
3195   if(!using_tlb) {
3196     if(!c||memtarget) {
3197       #ifdef DESTRUCTIVE_SHIFT
3198       // The x86 shift operation is 'destructive'; it overwrites the
3199       // source register, so we need to make a copy first and use that.
3200       addr=temp;
3201       #endif
3202       #if defined(HOST_IMM8)
3203       int ir=get_reg(i_regs->regmap,INVCP);
3204       assert(ir>=0);
3205       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3206       #else
3207       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3208       #endif
3209       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3210       emit_callne(invalidate_addr_reg[addr]);
3211       #else
3212       jaddr2=(int)out;
3213       emit_jne(0);
3214       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3215       #endif
3216     }
3217   }
3218   if(jaddr) {
3219     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3220   } else if(c&&!memtarget) {
3221     inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
3222   }
3223   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3224   //if(opcode[i]==0x2B || opcode[i]==0x28)
3225   //if(opcode[i]==0x2B || opcode[i]==0x29)
3226   //if(opcode[i]==0x2B)
3227   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3228   {
3229     //emit_pusha();
3230     save_regs(0x100f);
3231         emit_readword((int)&last_count,ECX);
3232         #ifdef __i386__
3233         if(get_reg(i_regs->regmap,CCREG)<0)
3234           emit_loadreg(CCREG,HOST_CCREG);
3235         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3236         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3237         emit_writeword(HOST_CCREG,(int)&Count);
3238         #endif
3239         #ifdef __arm__
3240         if(get_reg(i_regs->regmap,CCREG)<0)
3241           emit_loadreg(CCREG,0);
3242         else
3243           emit_mov(HOST_CCREG,0);
3244         emit_add(0,ECX,0);
3245         emit_addimm(0,2*ccadj[i],0);
3246         emit_writeword(0,(int)&Count);
3247         #endif
3248     emit_call((int)memdebug);
3249     //emit_popa();
3250     restore_regs(0x100f);
3251   }/**/
3252 }
3253
3254 void storelr_assemble(int i,struct regstat *i_regs)
3255 {
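  // Emit code for the unaligned stores SWL/SWR/SDL/SDR: dispatch on the low
  // address bits (cases 0..3 below) and write only the byte/halfword/word
  // pieces each case requires, so a paired SWL/SWR (or SDL/SDR) stores a
  // complete unaligned word (or doubleword).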
3256   int s,th,tl;
3257   int temp;
3258   int temp2;
3259   int offset;
3260   int jaddr=0,jaddr2;
3261   int case1,case2,case3;
3262   int done0,done1,done2;
3263   int memtarget=0,c=0;
3264   int agr=AGEN1+(i&1);
3265   u_int hr,reglist=0;
3266   th=get_reg(i_regs->regmap,rs2[i]|64);
3267   tl=get_reg(i_regs->regmap,rs2[i]);
3268   s=get_reg(i_regs->regmap,rs1[i]);
3269   temp=get_reg(i_regs->regmap,agr);
3270   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3271   offset=imm[i];
3272   if(s>=0) {
3273     c=(i_regs->isconst>>s)&1;
3274     if(c) {
3275       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3276       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3277     }
3278   }
3279   assert(tl>=0);
3280   for(hr=0;hr<HOST_REGS;hr++) {
3281     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3282   }
3283   assert(temp>=0);
3284   if(!using_tlb) {
3285     if(!c) {
3286       emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3287       if(!offset&&s!=temp) emit_mov(s,temp);
3288       jaddr=(int)out;
3289       emit_jno(0);
3290     }
3291     else
3292     {
3293       if(!memtarget||!rs1[i]) {
3294         jaddr=(int)out;
3295         emit_jmp(0);
3296       }
3297     }
3298     #ifdef RAM_OFFSET
3299     int map=get_reg(i_regs->regmap,ROREG);
3300     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3301     gen_tlb_addr_w(temp,map);
3302     #else
3303     if((u_int)rdram!=0x80000000) 
3304       emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3305     #endif
3306   }else{ // using tlb
3307     int map=get_reg(i_regs->regmap,TLREG);
3308     assert(map>=0);
3309     map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3310     if(!c&&!offset&&s>=0) emit_mov(s,temp);
3311     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3312     if(!jaddr&&!memtarget) {
3313       jaddr=(int)out;
3314       emit_jmp(0);
3315     }
3316     gen_tlb_addr_w(temp,map);
3317   }
3318
3319   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3320     temp2=get_reg(i_regs->regmap,FTEMP);
3321     if(!rs2[i]) temp2=th=tl;
3322   }
3323
3324 #ifndef BIG_ENDIAN_MIPS
3325     emit_xorimm(temp,3,temp);
3326 #endif
3327   emit_testimm(temp,2);
3328   case2=(int)out;
3329   emit_jne(0);
3330   emit_testimm(temp,1);
3331   case1=(int)out;
3332   emit_jne(0);
3333   // 0
3334   if (opcode[i]==0x2A) { // SWL
3335     emit_writeword_indexed(tl,0,temp);
3336   }
3337   if (opcode[i]==0x2E) { // SWR
3338     emit_writebyte_indexed(tl,3,temp);
3339   }
3340   if (opcode[i]==0x2C) { // SDL
3341     emit_writeword_indexed(th,0,temp);
3342     if(rs2[i]) emit_mov(tl,temp2);
3343   }
3344   if (opcode[i]==0x2D) { // SDR
3345     emit_writebyte_indexed(tl,3,temp);
3346     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3347   }
3348   done0=(int)out;
3349   emit_jmp(0);
3350   // 1
3351   set_jump_target(case1,(int)out);
3352   if (opcode[i]==0x2A) { // SWL
3353     // Write 3 msb into three least significant bytes
3354     if(rs2[i]) emit_rorimm(tl,8,tl);
3355     emit_writehword_indexed(tl,-1,temp);
3356     if(rs2[i]) emit_rorimm(tl,16,tl);
3357     emit_writebyte_indexed(tl,1,temp);
3358     if(rs2[i]) emit_rorimm(tl,8,tl);
3359   }
3360   if (opcode[i]==0x2E) { // SWR
3361     // Write two lsb into two most significant bytes
3362     emit_writehword_indexed(tl,1,temp);
3363   }
3364   if (opcode[i]==0x2C) { // SDL
3365     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3366     // Write 3 msb into three least significant bytes
3367     if(rs2[i]) emit_rorimm(th,8,th);
3368     emit_writehword_indexed(th,-1,temp);
3369     if(rs2[i]) emit_rorimm(th,16,th);
3370     emit_writebyte_indexed(th,1,temp);
3371     if(rs2[i]) emit_rorimm(th,8,th);
3372   }
3373   if (opcode[i]==0x2D) { // SDR
3374     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3375     // Write two lsb into two most significant bytes
3376     emit_writehword_indexed(tl,1,temp);
3377   }
3378   done1=(int)out;
3379   emit_jmp(0);
3380   // 2
3381   set_jump_target(case2,(int)out);
3382   emit_testimm(temp,1);
3383   case3=(int)out;
3384   emit_jne(0);
3385   if (opcode[i]==0x2A) { // SWL
3386     // Write two msb into two least significant bytes
3387     if(rs2[i]) emit_rorimm(tl,16,tl);
3388     emit_writehword_indexed(tl,-2,temp);
3389     if(rs2[i]) emit_rorimm(tl,16,tl);
3390   }
3391   if (opcode[i]==0x2E) { // SWR
3392     // Write 3 lsb into three most significant bytes
3393     emit_writebyte_indexed(tl,-1,temp);
3394     if(rs2[i]) emit_rorimm(tl,8,tl);
3395     emit_writehword_indexed(tl,0,temp);
3396     if(rs2[i]) emit_rorimm(tl,24,tl);
3397   }
3398   if (opcode[i]==0x2C) { // SDL
3399     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3400     // Write two msb into two least significant bytes
3401     if(rs2[i]) emit_rorimm(th,16,th);
3402     emit_writehword_indexed(th,-2,temp);
3403     if(rs2[i]) emit_rorimm(th,16,th);
3404   }
3405   if (opcode[i]==0x2D) { // SDR
3406     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3407     // Write 3 lsb into three most significant bytes
3408     emit_writebyte_indexed(tl,-1,temp);
3409     if(rs2[i]) emit_rorimm(tl,8,tl);
3410     emit_writehword_indexed(tl,0,temp);
3411     if(rs2[i]) emit_rorimm(tl,24,tl);
3412   }
3413   done2=(int)out;
3414   emit_jmp(0);
3415   // 3
3416   set_jump_target(case3,(int)out);
3417   if (opcode[i]==0x2A) { // SWL
3418     // Write msb into least significant byte
3419     if(rs2[i]) emit_rorimm(tl,24,tl);
3420     emit_writebyte_indexed(tl,-3,temp);
3421     if(rs2[i]) emit_rorimm(tl,8,tl);
3422   }
3423   if (opcode[i]==0x2E) { // SWR
3424     // Write entire word
3425     emit_writeword_indexed(tl,-3,temp);
3426   }
3427   if (opcode[i]==0x2C) { // SDL
3428     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3429     // Write msb into least significant byte
3430     if(rs2[i]) emit_rorimm(th,24,th);
3431     emit_writebyte_indexed(th,-3,temp);
3432     if(rs2[i]) emit_rorimm(th,8,th);
3433   }
3434   if (opcode[i]==0x2D) { // SDR
3435     if(rs2[i]) emit_mov(th,temp2);
3436     // Write entire word
3437     emit_writeword_indexed(tl,-3,temp);
3438   }
3439   set_jump_target(done0,(int)out);
3440   set_jump_target(done1,(int)out);
3441   set_jump_target(done2,(int)out);
3442   if (opcode[i]==0x2C) { // SDL
3443     emit_testimm(temp,4);
3444     done0=(int)out;
3445     emit_jne(0);
3446     emit_andimm(temp,~3,temp);
3447     emit_writeword_indexed(temp2,4,temp);
3448     set_jump_target(done0,(int)out);
3449   }
3450   if (opcode[i]==0x2D) { // SDR
3451     emit_testimm(temp,4);
3452     done0=(int)out;
3453     emit_jeq(0);
3454     emit_andimm(temp,~3,temp);
3455     emit_writeword_indexed(temp2,-4,temp);
3456     set_jump_target(done0,(int)out);
3457   }
3458   if(!c||!memtarget)
3459     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3460   if(!using_tlb) {
3461     #ifdef RAM_OFFSET
3462     int map=get_reg(i_regs->regmap,ROREG);
3463     if(map<0) map=HOST_TEMPREG;
3464     gen_orig_addr_w(temp,map);
3465     #else
3466     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3467     #endif
3468     #if defined(HOST_IMM8)
3469     int ir=get_reg(i_regs->regmap,INVCP);
3470     assert(ir>=0);
3471     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3472     #else
3473     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3474     #endif
3475     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3476     emit_callne(invalidate_addr_reg[temp]);
3477     #else
3478     jaddr2=(int)out;
3479     emit_jne(0);
3480     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3481     #endif
3482   }
3483   /*
3484     emit_pusha();
3485     //save_regs(0x100f);
3486         emit_readword((int)&last_count,ECX);
3487         if(get_reg(i_regs->regmap,CCREG)<0)
3488           emit_loadreg(CCREG,HOST_CCREG);
3489         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3490         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3491         emit_writeword(HOST_CCREG,(int)&Count);
3492     emit_call((int)memdebug);
3493     emit_popa();
3494     //restore_regs(0x100f);
3495   /**/
3496 }
3497
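// Assemble COP1 (FPU) loads and stores: LWC1/LDC1 read memory into the
// FPU register file, SWC1/SDC1 write it back out.  The address is range
// checked (or run through the TLB mapper), out-of-range accesses are
// handled by a LOADx/STOREx stub, and stores also check invalid_code so
// self-modifying code gets invalidated.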
3498 void c1ls_assemble(int i,struct regstat *i_regs)
3499 {
3500 #ifndef DISABLE_COP1
3501   int s,th,tl;
3502   int temp,ar;
3503   int map=-1;
3504   int offset;
3505   int c=0;
3506   int jaddr,jaddr2=0,jaddr3,type;
3507   int agr=AGEN1+(i&1);
3508   u_int hr,reglist=0;
3509   th=get_reg(i_regs->regmap,FTEMP|64);
3510   tl=get_reg(i_regs->regmap,FTEMP);
3511   s=get_reg(i_regs->regmap,rs1[i]);
3512   temp=get_reg(i_regs->regmap,agr);
3513   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3514   offset=imm[i];
3515   assert(tl>=0);
3516   assert(rs1[i]>0);
3517   assert(temp>=0);
3518   for(hr=0;hr<HOST_REGS;hr++) {
3519     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3520   }
3521   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3522   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3523   {
3524     // Loads use a temporary register which we need to save
3525     reglist|=1<<temp;
3526   }
3527   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3528     ar=temp;
3529   else // LWC1/LDC1
3530     ar=tl;
3531   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3532   //else c=(i_regs->wasconst>>s)&1;
3533   if(s>=0) c=(i_regs->wasconst>>s)&1;
3534   // Check cop1 unusable
3535   if(!cop1_usable) {
3536     signed char rs=get_reg(i_regs->regmap,CSREG);
3537     assert(rs>=0);
3538     emit_testimm(rs,0x20000000);
3539     jaddr=(int)out;
3540     emit_jeq(0);
3541     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3542     cop1_usable=1;
3543   }
3544   if (opcode[i]==0x39) { // SWC1 (get float address)
3545     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3546   }
3547   if (opcode[i]==0x3D) { // SDC1 (get double address)
3548     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3549   }
3550   // Generate address + offset
3551   if(!using_tlb) {
3552     if(!c)
3553       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3554   }
3555   else
3556   {
3557     map=get_reg(i_regs->regmap,TLREG);
3558     assert(map>=0);
3559     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3560       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3561     }
3562     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3563       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3564     }
3565   }
3566   if (opcode[i]==0x39) { // SWC1 (read float)
3567     emit_readword_indexed(0,tl,tl);
3568   }
3569   if (opcode[i]==0x3D) { // SDC1 (read double)
3570     emit_readword_indexed(4,tl,th);
3571     emit_readword_indexed(0,tl,tl);
3572   }
3573   if (opcode[i]==0x31) { // LWC1 (get target address)
3574     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3575   }
3576   if (opcode[i]==0x35) { // LDC1 (get target address)
3577     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3578   }
3579   if(!using_tlb) {
3580     if(!c) {
3581       jaddr2=(int)out;
3582       emit_jno(0);
3583     }
3584     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80000000+RAM_SIZE) {
3585       jaddr2=(int)out;
3586       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3587     }
3588     #ifdef DESTRUCTIVE_SHIFT
3589     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3590       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3591     }
3592     #endif
3593   }else{
3594     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3595       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3596     }
3597     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3598       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3599     }
3600   }
3601   if (opcode[i]==0x31) { // LWC1
3602     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3603     //gen_tlb_addr_r(ar,map);
3604     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3605     #ifdef HOST_IMM_ADDR32
3606     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3607     else
3608     #endif
3609     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3610     type=LOADW_STUB;
3611   }
3612   if (opcode[i]==0x35) { // LDC1
3613     assert(th>=0);
3614     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3615     //gen_tlb_addr_r(ar,map);
3616     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3617     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3618     #ifdef HOST_IMM_ADDR32
3619     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3620     else
3621     #endif
3622     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3623     type=LOADD_STUB;
3624   }
3625   if (opcode[i]==0x39) { // SWC1
3626     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3627     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3628     type=STOREW_STUB;
3629   }
3630   if (opcode[i]==0x3D) { // SDC1
3631     assert(th>=0);
3632     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3633     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3634     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3635     type=STORED_STUB;
3636   }
3637   if(!using_tlb) {
3638     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3639       #ifndef DESTRUCTIVE_SHIFT
3640       temp=offset||c||s<0?ar:s;
3641       #endif
3642       #if defined(HOST_IMM8)
3643       int ir=get_reg(i_regs->regmap,INVCP);
3644       assert(ir>=0);
3645       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3646       #else
3647       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3648       #endif
3649       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3650       emit_callne(invalidate_addr_reg[temp]);
3651       #else
3652       jaddr3=(int)out;
3653       emit_jne(0);
3654       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3655       #endif
3656     }
3657   }
3658   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3659   if (opcode[i]==0x31) { // LWC1 (write float)
3660     emit_writeword_indexed(tl,0,temp);
3661   }
3662   if (opcode[i]==0x35) { // LDC1 (write double)
3663     emit_writeword_indexed(th,4,temp);
3664     emit_writeword_indexed(tl,0,temp);
3665   }
3666   //if(opcode[i]==0x39)
3667   /*if(opcode[i]==0x39||opcode[i]==0x31)
3668   {
3669     emit_pusha();
3670         emit_readword((int)&last_count,ECX);
3671         if(get_reg(i_regs->regmap,CCREG)<0)
3672           emit_loadreg(CCREG,HOST_CCREG);
3673         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3674         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3675         emit_writeword(HOST_CCREG,(int)&Count);
3676     emit_call((int)memdebug);
3677     emit_popa();
3678   }/**/
3679 #else
3680   cop1_unusable(i, i_regs);
3681 #endif
3682 }
3683
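// Assemble GTE (COP2) loads and stores.  SWC2 fetches the selected GTE
// data register with cop2_get_dreg and stores it; LWC2 reads a word and
// writes it back with cop2_put_dreg.  The usual RAM_SIZE check applies,
// and SWC2 also tests invalid_code to catch self-modifying code.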
3684 void c2ls_assemble(int i,struct regstat *i_regs)
3685 {
3686   int s,tl;
3687   int ar;
3688   int offset;
3689   int memtarget=0,c=0;
3690   int jaddr,jaddr2=0,jaddr3,type;
3691   int agr=AGEN1+(i&1);
3692   u_int hr,reglist=0;
3693   u_int copr=(source[i]>>16)&0x1f;
3694   s=get_reg(i_regs->regmap,rs1[i]);
3695   tl=get_reg(i_regs->regmap,FTEMP);
3696   offset=imm[i];
3697   assert(rs1[i]>0);
3698   assert(tl>=0);
3699   assert(!using_tlb);
3700
3701   for(hr=0;hr<HOST_REGS;hr++) {
3702     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3703   }
3704   if(i_regs->regmap[HOST_CCREG]==CCREG)
3705     reglist&=~(1<<HOST_CCREG);
3706
3707   // get the address
3708   if (opcode[i]==0x3a) { // SWC2
3709     ar=get_reg(i_regs->regmap,agr);
3710     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3711     reglist|=1<<ar;
3712   } else { // LWC2
3713     ar=tl;
3714   }
3715   if(s>=0) c=(i_regs->wasconst>>s)&1;
3716   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3717   if (!offset&&!c&&s>=0) ar=s;
3718   assert(ar>=0);
3719
3720   if (opcode[i]==0x3a) { // SWC2
3721     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3722     type=STOREW_STUB;
3723   }
3724   else
3725     type=LOADW_STUB;
3726
3727   if(c&&!memtarget) {
3728     jaddr2=(int)out;
3729     emit_jmp(0); // inline_readstub/inline_writestub?
3730   }
3731   else {
3732     if(!c) {
3733       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3734       jaddr2=(int)out;
3735       emit_jno(0);
3736     }
3737     if (opcode[i]==0x32) { // LWC2
3738       #ifdef HOST_IMM_ADDR32
3739       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3740       else
3741       #endif
3742       emit_readword_indexed(0,ar,tl);
3743     }
3744     if (opcode[i]==0x3a) { // SWC2
3745       #ifdef DESTRUCTIVE_SHIFT
3746       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3747       #endif
3748       emit_writeword_indexed(tl,0,ar);
3749     }
3750   }
3751   if(jaddr2)
3752     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3753   if (opcode[i]==0x3a) { // SWC2
3754 #if defined(HOST_IMM8)
3755     int ir=get_reg(i_regs->regmap,INVCP);
3756     assert(ir>=0);
3757     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3758 #else
3759     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3760 #endif
3761     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3762     emit_callne(invalidate_addr_reg[ar]);
3763     #else
3764     jaddr3=(int)out;
3765     emit_jne(0);
3766     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3767     #endif
3768   }
3769   if (opcode[i]==0x32) { // LWC2
3770     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3771   }
3772 }
3773
3774 #ifndef multdiv_assemble
3775 void multdiv_assemble(int i,struct regstat *i_regs)
3776 {
3777   printf("Need multdiv_assemble for this architecture.\n");
3778   exit(1);
3779 }
3780 #endif
3781
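// Assemble MFHI/MFLO/MTHI/MTLO: a plain register-to-register move between
// the mapped source and destination, copying both halves when the upper
// 32 bits are live.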
3782 void mov_assemble(int i,struct regstat *i_regs)
3783 {
3784   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3785   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3786   if(rt1[i]) {
3787     signed char sh,sl,th,tl;
3788     th=get_reg(i_regs->regmap,rt1[i]|64);
3789     tl=get_reg(i_regs->regmap,rt1[i]);
3790     //assert(tl>=0);
3791     if(tl>=0) {
3792       sh=get_reg(i_regs->regmap,rs1[i]|64);
3793       sl=get_reg(i_regs->regmap,rs1[i]);
3794       if(sl>=0) emit_mov(sl,tl);
3795       else emit_loadreg(rs1[i],tl);
3796       if(th>=0) {
3797         if(sh>=0) emit_mov(sh,th);
3798         else emit_loadreg(rs1[i]|64,th);
3799       }
3800     }
3801   }
3802 }
3803
3804 #ifndef fconv_assemble
3805 void fconv_assemble(int i,struct regstat *i_regs)
3806 {
3807   printf("Need fconv_assemble for this architecture.\n");
3808   exit(1);
3809 }
3810 #endif
3811
3812 #if 0
3813 void float_assemble(int i,struct regstat *i_regs)
3814 {
3815   printf("Need float_assemble for this architecture.\n");
3816   exit(1);
3817 }
3818 #endif
3819
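// SYSCALL: store the faulting PC, apply the accumulated cycle count and
// jump to the HLE syscall handler.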
3820 void syscall_assemble(int i,struct regstat *i_regs)
3821 {
3822   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3823   assert(ccreg==HOST_CCREG);
3824   assert(!is_delayslot);
3825   emit_movimm(start+i*4,EAX); // Get PC
3826   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3827   emit_jmp((int)jump_syscall_hle); // XXX
3828 }
3829
3830 void hlecall_assemble(int i,struct regstat *i_regs)
3831 {
3832   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3833   assert(ccreg==HOST_CCREG);
3834   assert(!is_delayslot);
3835   emit_movimm(start+i*4+4,0); // Get PC
3836   emit_movimm((int)psxHLEt[source[i]&7],1);
3837   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // XXX
3838   emit_jmp((int)jump_hlecall);
3839 }
3840
3841 void intcall_assemble(int i,struct regstat *i_regs)
3842 {
3843   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3844   assert(ccreg==HOST_CCREG);
3845   assert(!is_delayslot);
3846   emit_movimm(start+i*4,0); // Get PC
3847   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG);
3848   emit_jmp((int)jump_intcall);
3849 }
3850
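// Assemble the instruction in a branch delay slot.  is_delayslot is set
// around the dispatch so any stubs generated for this instruction know it
// belongs to the preceding branch.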
3851 void ds_assemble(int i,struct regstat *i_regs)
3852 {
3853   is_delayslot=1;
3854   switch(itype[i]) {
3855     case ALU:
3856       alu_assemble(i,i_regs);break;
3857     case IMM16:
3858       imm16_assemble(i,i_regs);break;
3859     case SHIFT:
3860       shift_assemble(i,i_regs);break;
3861     case SHIFTIMM:
3862       shiftimm_assemble(i,i_regs);break;
3863     case LOAD:
3864       load_assemble(i,i_regs);break;
3865     case LOADLR:
3866       loadlr_assemble(i,i_regs);break;
3867     case STORE:
3868       store_assemble(i,i_regs);break;
3869     case STORELR:
3870       storelr_assemble(i,i_regs);break;
3871     case COP0:
3872       cop0_assemble(i,i_regs);break;
3873     case COP1:
3874       cop1_assemble(i,i_regs);break;
3875     case C1LS:
3876       c1ls_assemble(i,i_regs);break;
3877     case COP2:
3878       cop2_assemble(i,i_regs);break;
3879     case C2LS:
3880       c2ls_assemble(i,i_regs);break;
3881     case C2OP:
3882       c2op_assemble(i,i_regs);break;
3883     case FCONV:
3884       fconv_assemble(i,i_regs);break;
3885     case FLOAT:
3886       float_assemble(i,i_regs);break;
3887     case FCOMP:
3888       fcomp_assemble(i,i_regs);break;
3889     case MULTDIV:
3890       multdiv_assemble(i,i_regs);break;
3891     case MOV:
3892       mov_assemble(i,i_regs);break;
3893     case SYSCALL:
3894     case HLECALL:
3895     case INTCALL:
3896     case SPAN:
3897     case UJUMP:
3898     case RJUMP:
3899     case CJUMP:
3900     case SJUMP:
3901     case FJUMP:
3902       printf("Jump in the delay slot.  This is probably a bug.\n");
3903   }
3904   is_delayslot=0;
3905 }
3906
3907 // Is the branch target a valid internal jump?
3908 int internal_branch(uint64_t i_is32,int addr)
3909 {
3910   if(addr&1) return 0; // Indirect (register) jump
3911   if(addr>=start && addr<start+slen*4-4)
3912   {
3913     int t=(addr-start)>>2;
3914     // Delay slots are not valid branch targets
3915     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3916     // 64 -> 32 bit transition requires a recompile
3917     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3918     {
3919       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3920       else printf("optimizable: yes\n");
3921     }*/
3922     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3923 #ifndef FORCE32
3924     if(requires_32bit[t]&~i_is32) return 0;
3925     else
3926 #endif
3927       return 1;
3928   }
3929   return 0;
3930 }
3931
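// Write back cached registers whose mapping changes between 'pre' and
// 'entry': dirty values that would otherwise be lost are stored back to
// the register file, and values that merely move to a different host
// register are copied instead of being reloaded from memory.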
3932 #ifndef wb_invalidate
3933 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3934   uint64_t u,uint64_t uu)
3935 {
3936   int hr;
3937   for(hr=0;hr<HOST_REGS;hr++) {
3938     if(hr!=EXCLUDE_REG) {
3939       if(pre[hr]!=entry[hr]) {
3940         if(pre[hr]>=0) {
3941           if((dirty>>hr)&1) {
3942             if(get_reg(entry,pre[hr])<0) {
3943               if(pre[hr]<64) {
3944                 if(!((u>>pre[hr])&1)) {
3945                   emit_storereg(pre[hr],hr);
3946                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3947                     emit_sarimm(hr,31,hr);
3948                     emit_storereg(pre[hr]|64,hr);
3949                   }
3950                 }
3951               }else{
3952                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3953                   emit_storereg(pre[hr],hr);
3954                 }
3955               }
3956             }
3957           }
3958         }
3959       }
3960     }
3961   }
3962   // Move from one register to another (no writeback)
3963   for(hr=0;hr<HOST_REGS;hr++) {
3964     if(hr!=EXCLUDE_REG) {
3965       if(pre[hr]!=entry[hr]) {
3966         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3967           int nr;
3968           if((nr=get_reg(entry,pre[hr]))>=0) {
3969             emit_mov(hr,nr);
3970           }
3971         }
3972       }
3973     }
3974   }
3975 }
3976 #endif
3977
3978 // Load the specified registers
3979 // This only loads the registers given as arguments because
3980 // we don't want to load things that will be overwritten
3981 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3982 {
3983   int hr;
3984   // Load 32-bit regs
3985   for(hr=0;hr<HOST_REGS;hr++) {
3986     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3987       if(entry[hr]!=regmap[hr]) {
3988         if(regmap[hr]==rs1||regmap[hr]==rs2)
3989         {
3990           if(regmap[hr]==0) {
3991             emit_zeroreg(hr);
3992           }
3993           else
3994           {
3995             emit_loadreg(regmap[hr],hr);
3996           }
3997         }
3998       }
3999     }
4000   }
4001   // Load 64-bit regs
4002   for(hr=0;hr<HOST_REGS;hr++) {
4003     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4004       if(entry[hr]!=regmap[hr]) {
4005         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
4006         {
4007           assert(regmap[hr]!=64);
4008           if((is32>>(regmap[hr]&63))&1) {
4009             int lr=get_reg(regmap,regmap[hr]-64);
4010             if(lr>=0)
4011               emit_sarimm(lr,31,hr);
4012             else
4013               emit_loadreg(regmap[hr],hr);
4014           }
4015           else
4016           {
4017             emit_loadreg(regmap[hr],hr);
4018           }
4019         }
4020       }
4021     }
4022   }
4023 }
4024
4025 // Load registers prior to the start of a loop
4026 // so that they are not loaded within the loop
4027 static void loop_preload(signed char pre[],signed char entry[])
4028 {
4029   int hr;
4030   for(hr=0;hr<HOST_REGS;hr++) {
4031     if(hr!=EXCLUDE_REG) {
4032       if(pre[hr]!=entry[hr]) {
4033         if(entry[hr]>=0) {
4034           if(get_reg(pre,entry[hr])<0) {
4035             assem_debug("loop preload:\n");
4036             //printf("loop preload: %d\n",hr);
4037             if(entry[hr]==0) {
4038               emit_zeroreg(hr);
4039             }
4040             else if(entry[hr]<TEMPREG)
4041             {
4042               emit_loadreg(entry[hr],hr);
4043             }
4044             else if(entry[hr]-64<TEMPREG)
4045             {
4046               emit_loadreg(entry[hr],hr);
4047             }
4048           }
4049         }
4050       }
4051     }
4052   }
4053 }
4054
4055 // Generate address for load/store instruction
4056 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
4057 void address_generation(int i,struct regstat *i_regs,signed char entry[])
4058 {
4059   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
4060     int ra;
4061     int agr=AGEN1+(i&1);
4062     int mgr=MGEN1+(i&1);
4063     if(itype[i]==LOAD) {
4064       ra=get_reg(i_regs->regmap,rt1[i]);
4065       if(ra<0) ra=get_reg(i_regs->regmap,-1); 
4066       assert(ra>=0);
4067     }
4068     if(itype[i]==LOADLR) {
4069       ra=get_reg(i_regs->regmap,FTEMP);
4070     }
4071     if(itype[i]==STORE||itype[i]==STORELR) {
4072       ra=get_reg(i_regs->regmap,agr);
4073       if(ra<0) ra=get_reg(i_regs->regmap,-1);
4074     }
4075     if(itype[i]==C1LS||itype[i]==C2LS) {
4076       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
4077         ra=get_reg(i_regs->regmap,FTEMP);
4078       else { // SWC1/SDC1/SWC2/SDC2
4079         ra=get_reg(i_regs->regmap,agr);
4080         if(ra<0) ra=get_reg(i_regs->regmap,-1);
4081       }
4082     }
4083     int rs=get_reg(i_regs->regmap,rs1[i]);
4084     int rm=get_reg(i_regs->regmap,TLREG);
4085     if(ra>=0) {
4086       int offset=imm[i];
4087       int c=(i_regs->wasconst>>rs)&1;
4088       if(rs1[i]==0) {
4089         // Using r0 as a base address
4090         /*if(rm>=0) {
4091           if(!entry||entry[rm]!=mgr) {
4092             generate_map_const(offset,rm);
4093           } // else did it in the previous cycle
4094         }*/
4095         if(!entry||entry[ra]!=agr) {
4096           if (opcode[i]==0x22||opcode[i]==0x26) {
4097             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4098           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4099             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4100           }else{
4101             emit_movimm(offset,ra);
4102           }
4103         } // else did it in the previous cycle
4104       }
4105       else if(rs<0) {
4106         if(!entry||entry[ra]!=rs1[i])
4107           emit_loadreg(rs1[i],ra);
4108         //if(!entry||entry[ra]!=rs1[i])
4109         //  printf("poor load scheduling!\n");
4110       }
4111       else if(c) {
4112         if(rm>=0) {
4113           if(!entry||entry[rm]!=mgr) {
4114             if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a) {
4115               // Stores to memory go through the mapper to detect self-modifying
4116               // code; loads don't.
4117               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
4118                  (unsigned int)(constmap[i][rs]+offset)<0x80000000+RAM_SIZE )
4119                 generate_map_const(constmap[i][rs]+offset,rm);
4120             }else{
4121               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
4122                 generate_map_const(constmap[i][rs]+offset,rm);
4123             }
4124           }
4125         }
4126         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
4127           if(!entry||entry[ra]!=agr) {
4128             if (opcode[i]==0x22||opcode[i]==0x26) {
4129               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4130             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4131               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4132             }else{
4133               #ifdef HOST_IMM_ADDR32
4134               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4135                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
4136               #endif
4137               emit_movimm(constmap[i][rs]+offset,ra);
4138             }
4139           } // else did it in the previous cycle
4140         } // else load_consts already did it
4141       }
4142       if(offset&&!c&&rs1[i]) {
4143         if(rs>=0) {
4144           emit_addimm(rs,offset,ra);
4145         }else{
4146           emit_addimm(ra,offset,ra);
4147         }
4148       }
4149     }
4150   }
4151   // Preload constants for next instruction
4152   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
4153     int agr,ra;
4154     #ifndef HOST_IMM_ADDR32
4155     // Mapper entry
4156     agr=MGEN1+((i+1)&1);
4157     ra=get_reg(i_regs->regmap,agr);
4158     if(ra>=0) {
4159       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4160       int offset=imm[i+1];
4161       int c=(regs[i+1].wasconst>>rs)&1;
4162       if(c) {
4163         if(itype[i+1]==STORE||itype[i+1]==STORELR
4164            ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1, SWC2/SDC2
4165           // Stores to memory go thru the mapper to detect self-modifying
4166           // Stores to memory go through the mapper to detect self-modifying
4167           // code; loads don't.
4168              (unsigned int)(constmap[i+1][rs]+offset)<0x80000000+RAM_SIZE )
4169             generate_map_const(constmap[i+1][rs]+offset,ra);
4170         }else{
4171           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
4172             generate_map_const(constmap[i+1][rs]+offset,ra);
4173         }
4174       }
4175       /*else if(rs1[i]==0) {
4176         generate_map_const(offset,ra);
4177       }*/
4178     }
4179     #endif
4180     // Actual address
4181     agr=AGEN1+((i+1)&1);
4182     ra=get_reg(i_regs->regmap,agr);
4183     if(ra>=0) {
4184       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4185       int offset=imm[i+1];
4186       int c=(regs[i+1].wasconst>>rs)&1;
4187       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
4188         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4189           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4190         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4191           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4192         }else{
4193           #ifdef HOST_IMM_ADDR32
4194           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4195              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
4196           #endif
4197           emit_movimm(constmap[i+1][rs]+offset,ra);
4198         }
4199       }
4200       else if(rs1[i+1]==0) {
4201         // Using r0 as a base address
4202         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4203           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4204         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4205           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4206         }else{
4207           emit_movimm(offset,ra);
4208         }
4209       }
4210     }
4211   }
4212 }
4213
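// Follow a constant in host register hr forward from instruction i while
// it stays mapped and constant, returning the last value it will hold (or
// a precomputed load address).  This lets load_consts emit one movimm
// instead of rematerializing the constant at every step.  Returns 0 when
// no immediate needs to be loaded here.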
4214 int get_final_value(int hr, int i, int *value)
4215 {
4216   int reg=regs[i].regmap[hr];
4217   while(i<slen-1) {
4218     if(regs[i+1].regmap[hr]!=reg) break;
4219     if(!((regs[i+1].isconst>>hr)&1)) break;
4220     if(bt[i+1]) break;
4221     i++;
4222   }
4223   if(i<slen-1) {
4224     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
4225       *value=constmap[i][hr];
4226       return 1;
4227     }
4228     if(!bt[i+1]) {
4229       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
4230         // Load in delay slot, out-of-order execution
4231         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
4232         {
4233           #ifdef HOST_IMM_ADDR32
4234           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
4235           #endif
4236           // Precompute load address
4237           *value=constmap[i][hr]+imm[i+2];
4238           return 1;
4239         }
4240       }
4241       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
4242       {
4243         #ifdef HOST_IMM_ADDR32
4244         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
4245         #endif
4246         // Precompute load address
4247         *value=constmap[i][hr]+imm[i+1];
4248         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
4249         return 1;
4250       }
4251     }
4252   }
4253   *value=constmap[i][hr];
4254   //printf("c=%x\n",(int)constmap[i][hr]);
4255   if(i==slen-1) return 1;
4256   if(reg<64) {
4257     return !((unneeded_reg[i+1]>>reg)&1);
4258   }else{
4259     return !((unneeded_reg_upper[i+1]>>reg)&1);
4260   }
4261 }
4262
4263 // Load registers with known constants
4264 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
4265 {
4266   int hr;
4267   // Load 32-bit regs
4268   for(hr=0;hr<HOST_REGS;hr++) {
4269     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4270       //if(entry[hr]!=regmap[hr]) {
4271       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4272         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4273           int value;
4274           if(get_final_value(hr,i,&value)) {
4275             if(value==0) {
4276               emit_zeroreg(hr);
4277             }
4278             else {
4279               emit_movimm(value,hr);
4280             }
4281           }
4282         }
4283       }
4284     }
4285   }
4286   // Load 64-bit regs
4287   for(hr=0;hr<HOST_REGS;hr++) {
4288     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4289       //if(entry[hr]!=regmap[hr]) {
4290       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4291         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4292           if((is32>>(regmap[hr]&63))&1) {
4293             int lr=get_reg(regmap,regmap[hr]-64);
4294             assert(lr>=0);
4295             emit_sarimm(lr,31,hr);
4296           }
4297           else
4298           {
4299             int value;
4300             if(get_final_value(hr,i,&value)) {
4301               if(value==0) {
4302                 emit_zeroreg(hr);
4303               }
4304               else {
4305                 emit_movimm(value,hr);
4306               }
4307             }
4308           }
4309         }
4310       }
4311     }
4312   }
4313 }
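// Like load_consts, but materializes the current constant for every dirty
// mapped register regardless of what the previous instruction did.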
4314 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4315 {
4316   int hr;
4317   // Load 32-bit regs
4318   for(hr=0;hr<HOST_REGS;hr++) {
4319     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4320       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4321         int value=constmap[i][hr];
4322         if(value==0) {
4323           emit_zeroreg(hr);
4324         }
4325         else {
4326           emit_movimm(value,hr);
4327         }
4328       }
4329     }
4330   }
4331   // Load 64-bit regs
4332   for(hr=0;hr<HOST_REGS;hr++) {
4333     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4334       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4335         if((is32>>(regmap[hr]&63))&1) {
4336           int lr=get_reg(regmap,regmap[hr]-64);
4337           assert(lr>=0);
4338           emit_sarimm(lr,31,hr);
4339         }
4340         else
4341         {
4342           int value=constmap[i][hr];
4343           if(value==0) {
4344             emit_zeroreg(hr);
4345           }
4346           else {
4347             emit_movimm(value,hr);
4348           }
4349         }
4350       }
4351     }
4352   }
4353 }
4354
4355 // Write out all dirty registers (except cycle count)
4356 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4357 {
4358   int hr;
4359   for(hr=0;hr<HOST_REGS;hr++) {
4360     if(hr!=EXCLUDE_REG) {
4361       if(i_regmap[hr]>0) {
4362         if(i_regmap[hr]!=CCREG) {
4363           if((i_dirty>>hr)&1) {
4364             if(i_regmap[hr]<64) {
4365               emit_storereg(i_regmap[hr],hr);
4366 #ifndef FORCE32
4367               if( ((i_is32>>i_regmap[hr])&1) ) {
4368                 #ifdef DESTRUCTIVE_WRITEBACK
4369                 emit_sarimm(hr,31,hr);
4370                 emit_storereg(i_regmap[hr]|64,hr);
4371                 #else
4372                 emit_sarimm(hr,31,HOST_TEMPREG);
4373                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4374                 #endif
4375               }
4376 #endif
4377             }else{
4378               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4379                 emit_storereg(i_regmap[hr],hr);
4380               }
4381             }
4382           }
4383         }
4384       }
4385     }
4386   }
4387 }
4388 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4389 // This writes the registers not written by store_regs_bt
4390 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4391 {
4392   int hr;
4393   int t=(addr-start)>>2;
4394   for(hr=0;hr<HOST_REGS;hr++) {
4395     if(hr!=EXCLUDE_REG) {
4396       if(i_regmap[hr]>0) {
4397         if(i_regmap[hr]!=CCREG) {
4398           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4399             if((i_dirty>>hr)&1) {
4400               if(i_regmap[hr]<64) {
4401                 emit_storereg(i_regmap[hr],hr);
4402 #ifndef FORCE32
4403                 if( ((i_is32>>i_regmap[hr])&1) ) {
4404                   #ifdef DESTRUCTIVE_WRITEBACK
4405                   emit_sarimm(hr,31,hr);
4406                   emit_storereg(i_regmap[hr]|64,hr);
4407                   #else
4408                   emit_sarimm(hr,31,HOST_TEMPREG);
4409                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4410                   #endif
4411                 }
4412 #endif
4413               }else{
4414                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4415                   emit_storereg(i_regmap[hr],hr);
4416                 }
4417               }
4418             }
4419           }
4420         }
4421       }
4422     }
4423   }
4424 }
4425
4426 // Load all registers (except cycle count)
4427 void load_all_regs(signed char i_regmap[])
4428 {
4429   int hr;
4430   for(hr=0;hr<HOST_REGS;hr++) {
4431     if(hr!=EXCLUDE_REG) {
4432       if(i_regmap[hr]==0) {
4433         emit_zeroreg(hr);
4434       }
4435       else
4436       if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4437       {
4438         emit_loadreg(i_regmap[hr],hr);
4439       }
4440     }
4441   }
4442 }
4443
4444 // Load all current registers also needed by next instruction
4445 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4446 {
4447   int hr;
4448   for(hr=0;hr<HOST_REGS;hr++) {
4449     if(hr!=EXCLUDE_REG) {
4450       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4451         if(i_regmap[hr]==0) {
4452           emit_zeroreg(hr);
4453         }
4454         else
4455         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4456         {
4457           emit_loadreg(i_regmap[hr],hr);
4458         }
4459       }
4460     }
4461   }
4462 }
4463
4464 // Load all regs, storing cycle count if necessary
4465 void load_regs_entry(int t)
4466 {
4467   int hr;
4468   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG);
4469   else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG);
4470   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4471     emit_storereg(CCREG,HOST_CCREG);
4472   }
4473   // Load 32-bit regs
4474   for(hr=0;hr<HOST_REGS;hr++) {
4475     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4476       if(regs[t].regmap_entry[hr]==0) {
4477         emit_zeroreg(hr);
4478       }
4479       else if(regs[t].regmap_entry[hr]!=CCREG)
4480       {
4481         emit_loadreg(regs[t].regmap_entry[hr],hr);
4482       }
4483     }
4484   }
4485   // Load 64-bit regs
4486   for(hr=0;hr<HOST_REGS;hr++) {
4487     if(regs[t].regmap_entry[hr]>=64) {
4488       assert(regs[t].regmap_entry[hr]!=64);
4489       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4490         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4491         if(lr<0) {
4492           emit_loadreg(regs[t].regmap_entry[hr],hr);
4493         }
4494         else
4495         {
4496           emit_sarimm(lr,31,hr);
4497         }
4498       }
4499       else
4500       {
4501         emit_loadreg(regs[t].regmap_entry[hr],hr);
4502       }
4503     }
4504   }
4505 }
4506
4507 // Store dirty registers prior to branch
4508 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4509 {
4510   if(internal_branch(i_is32,addr))
4511   {
4512     int t=(addr-start)>>2;
4513     int hr;
4514     for(hr=0;hr<HOST_REGS;hr++) {
4515       if(hr!=EXCLUDE_REG) {
4516         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4517           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4518             if((i_dirty>>hr)&1) {
4519               if(i_regmap[hr]<64) {
4520                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4521                   emit_storereg(i_regmap[hr],hr);
4522                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4523                     #ifdef DESTRUCTIVE_WRITEBACK
4524                     emit_sarimm(hr,31,hr);
4525                     emit_storereg(i_regmap[hr]|64,hr);
4526                     #else
4527                     emit_sarimm(hr,31,HOST_TEMPREG);
4528                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4529                     #endif
4530                   }
4531                 }
4532               }else{
4533                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4534                   emit_storereg(i_regmap[hr],hr);
4535                 }
4536               }
4537             }
4538           }
4539         }
4540       }
4541     }
4542   }
4543   else
4544   {
4545     // Branch out of this block, write out all dirty regs
4546     wb_dirtys(i_regmap,i_is32,i_dirty);
4547   }
4548 }
4549
4550 // Load all needed registers for branch target
4551 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4552 {
4553   //if(addr>=start && addr<(start+slen*4))
4554   if(internal_branch(i_is32,addr))
4555   {
4556     int t=(addr-start)>>2;
4557     int hr;
4558     // Store the cycle count before loading something else
4559     if(i_regmap[HOST_CCREG]!=CCREG) {
4560       assert(i_regmap[HOST_CCREG]==-1);
4561     }
4562     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4563       emit_storereg(CCREG,HOST_CCREG);
4564     }
4565     // Load 32-bit regs
4566     for(hr=0;hr<HOST_REGS;hr++) {
4567       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4568         #ifdef DESTRUCTIVE_WRITEBACK
4569         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4570         #else
4571         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4572         #endif
4573           if(regs[t].regmap_entry[hr]==0) {
4574             emit_zeroreg(hr);
4575           }
4576           else if(regs[t].regmap_entry[hr]!=CCREG)
4577           {
4578             emit_loadreg(regs[t].regmap_entry[hr],hr);
4579           }
4580         }
4581       }
4582     }
4583     // Load 64-bit regs
4584     for(hr=0;hr<HOST_REGS;hr++) {
4585       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64) {
4586         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4587           assert(regs[t].regmap_entry[hr]!=64);
4588           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4589             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4590             if(lr<0) {
4591               emit_loadreg(regs[t].regmap_entry[hr],hr);
4592             }
4593             else
4594             {
4595               emit_sarimm(lr,31,hr);
4596             }
4597           }
4598           else
4599           {
4600             emit_loadreg(regs[t].regmap_entry[hr],hr);
4601           }
4602         }
4603         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4604           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4605           assert(lr>=0);
4606           emit_sarimm(lr,31,hr);
4607         }
4608       }
4609     }
4610   }
4611 }
4612
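// Check whether the current register state (mapping, 32-bit status, dirty
// bits) is compatible with the recorded entry state of the block at addr,
// so a branch there can be taken directly without any writeback or reload.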
4613 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4614 {
4615   if(addr>=start && addr<start+slen*4-4)
4616   {
4617     int t=(addr-start)>>2;
4618     int hr;
4619     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4620     for(hr=0;hr<HOST_REGS;hr++)
4621     {
4622       if(hr!=EXCLUDE_REG)
4623       {
4624         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4625         {
4626           if(regs[t].regmap_entry[hr]!=-1)
4627           {
4628             return 0;
4629           }
4630           else 
4631           if((i_dirty>>hr)&1)
4632           {
4633             if(i_regmap[hr]<64)
4634             {
4635               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4636                 return 0;
4637             }
4638             else
4639             {
4640               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4641                 return 0;
4642             }
4643           }
4644         }
4645         else // Same register but is it 32-bit or dirty?
4646         if(i_regmap[hr]>=0)
4647         {
4648           if(!((regs[t].dirty>>hr)&1))
4649           {
4650             if((i_dirty>>hr)&1)
4651             {
4652               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4653               {
4654                 //printf("%x: dirty no match\n",addr);
4655                 return 0;
4656               }
4657             }
4658           }
4659           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4660           {
4661             //printf("%x: is32 no match\n",addr);
4662             return 0;
4663           }
4664         }
4665       }
4666     }
4667     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4668 #ifndef FORCE32
4669     if(requires_32bit[t]&~i_is32) return 0;
4670 #endif
4671     // Delay slots are not valid branch targets
4672     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4673     // Delay slots require additional processing, so do not match
4674     if(is_ds[t]) return 0;
4675   }
4676   else
4677   {
4678     int hr;
4679     for(hr=0;hr<HOST_REGS;hr++)
4680     {
4681       if(hr!=EXCLUDE_REG)
4682       {
4683         if(i_regmap[hr]>=0)
4684         {
4685           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4686           {
4687             if((i_dirty>>hr)&1)
4688             {
4689               return 0;
4690             }
4691           }
4692         }
4693       }
4694     }
4695   }
4696   return 1;
4697 }
4698
4699 // Used when a branch jumps into the delay slot of another branch
4700 void ds_assemble_entry(int i)
4701 {
4702   int t=(ba[i]-start)>>2;
4703   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4704   assem_debug("Assemble delay slot at %x\n",ba[i]);
4705   assem_debug("<->\n");
4706   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4707     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4708   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4709   address_generation(t,&regs[t],regs[t].regmap_entry);
4710   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4711     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4712   cop1_usable=0;
4713   is_delayslot=0;
4714   switch(itype[t]) {
4715     case ALU:
4716       alu_assemble(t,&regs[t]);break;
4717     case IMM16:
4718       imm16_assemble(t,&regs[t]);break;
4719     case SHIFT:
4720       shift_assemble(t,&regs[t]);break;
4721     case SHIFTIMM:
4722       shiftimm_assemble(t,&regs[t]);break;
4723     case LOAD:
4724       load_assemble(t,&regs[t]);break;
4725     case LOADLR:
4726       loadlr_assemble(t,&regs[t]);break;
4727     case STORE:
4728       store_assemble(t,&regs[t]);break;
4729     case STORELR:
4730       storelr_assemble(t,&regs[t]);break;
4731     case COP0:
4732       cop0_assemble(t,&regs[t]);break;
4733     case COP1:
4734       cop1_assemble(t,&regs[t]);break;
4735     case C1LS:
4736       c1ls_assemble(t,&regs[t]);break;
4737     case COP2:
4738       cop2_assemble(t,&regs[t]);break;
4739     case C2LS:
4740       c2ls_assemble(t,&regs[t]);break;
4741     case C2OP:
4742       c2op_assemble(t,&regs[t]);break;
4743     case FCONV:
4744       fconv_assemble(t,&regs[t]);break;
4745     case FLOAT:
4746       float_assemble(t,&regs[t]);break;
4747     case FCOMP:
4748       fcomp_assemble(t,&regs[t]);break;
4749     case MULTDIV:
4750       multdiv_assemble(t,&regs[t]);break;
4751     case MOV:
4752       mov_assemble(t,&regs[t]);break;
4753     case SYSCALL:
4754     case HLECALL:
4755     case INTCALL:
4756     case SPAN:
4757     case UJUMP:
4758     case RJUMP:
4759     case CJUMP:
4760     case SJUMP:
4761     case FJUMP:
4762       printf("Jump in the delay slot.  This is probably a bug.\n");
4763   }
4764   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4765   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4766   if(internal_branch(regs[t].is32,ba[i]+4))
4767     assem_debug("branch: internal\n");
4768   else
4769     assem_debug("branch: external\n");
4770   assert(internal_branch(regs[t].is32,ba[i]+4));
4771   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4772   emit_jmp(0);
4773 }
4774
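// Emit the cycle-count check that precedes a branch.  *adj receives the
// cycle adjustment already accounted for at an internal target, an idle
// loop (a branch to itself with a NOP in the delay slot) is special-cased,
// and a CC_STUB is added to handle an expired count.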
4775 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4776 {
4777   int count;
4778   int jaddr;
4779   int idle=0;
4780   if(itype[i]==RJUMP)
4781   {
4782     *adj=0;
4783   }
4784   //if(ba[i]>=start && ba[i]<(start+slen*4))
4785   if(internal_branch(branch_regs[i].is32,ba[i]))
4786   {
4787     int t=(ba[i]-start)>>2;
4788     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4789     else *adj=ccadj[t];
4790   }
4791   else
4792   {
4793     *adj=0;
4794   }
4795   count=ccadj[i];
4796   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4797     // Idle loop
4798     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4799     idle=(int)out;
4800     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4801     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4802     jaddr=(int)out;
4803     emit_jmp(0);
4804   }
4805   else if(*adj==0||invert) {
4806     emit_addimm_and_set_flags(CLOCK_DIVIDER*(count+2),HOST_CCREG);
4807     jaddr=(int)out;
4808     emit_jns(0);
4809   }
4810   else
4811   {
4812     emit_cmpimm(HOST_CCREG,-2*(count+2));
4813     jaddr=(int)out;
4814     emit_jns(0);
4815   }
4816   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4817 }
4818
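// Out-of-line code taken when the cycle count expires at a branch: write
// back dirty registers, compute and store the return PC (re-evaluating
// conditional branches with cmovs), call cc_interrupt, then reload
// registers and jump back to the compiled code.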
4819 void do_ccstub(int n)
4820 {
4821   literal_pool(256);
4822   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4823   set_jump_target(stubs[n][1],(int)out);
4824   int i=stubs[n][4];
4825   if(stubs[n][6]==NULLDS) {
4826     // Delay slot instruction is nullified ("likely" branch)
4827     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4828   }
4829   else if(stubs[n][6]!=TAKEN) {
4830     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4831   }
4832   else {
4833     if(internal_branch(branch_regs[i].is32,ba[i]))
4834       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4835   }
4836   if(stubs[n][5]!=-1)
4837   {
4838     // Save PC as return address
4839     emit_movimm(stubs[n][5],EAX);
4840     emit_writeword(EAX,(int)&pcaddr);
4841   }
4842   else
4843   {
4844     // Return address depends on which way the branch goes
4845     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4846     {
4847       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4848       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4849       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4850       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4851       if(rs1[i]==0)
4852       {
4853         s1l=s2l;s1h=s2h;
4854         s2l=s2h=-1;
4855       }
4856       else if(rs2[i]==0)
4857       {
4858         s2l=s2h=-1;
4859       }
4860       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4861         s1h=s2h=-1;
4862       }
4863       assert(s1l>=0);
4864       #ifdef DESTRUCTIVE_WRITEBACK
4865       if(rs1[i]) {
4866         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4867           emit_loadreg(rs1[i],s1l);
4868       } 
4869       else {
4870         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4871           emit_loadreg(rs2[i],s1l);
4872       }
4873       if(s2l>=0)
4874         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4875           emit_loadreg(rs2[i],s2l);
4876       #endif
4877       int hr=0;
4878       int addr,alt,ntaddr;
4879       while(hr<HOST_REGS)
4880       {
4881         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4882            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4883            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4884         {
4885           addr=hr++;break;
4886         }
4887         hr++;
4888       }
4889       while(hr<HOST_REGS)
4890       {
4891         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4892            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4893            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4894         {
4895           alt=hr++;break;
4896         }
4897         hr++;
4898       }
4899       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4900       {
4901         while(hr<HOST_REGS)
4902         {
4903           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4904              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4905              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4906           {
4907             ntaddr=hr;break;
4908           }
4909           hr++;
4910         }
4911         assert(hr<HOST_REGS);
4912       }
4913       if((opcode[i]&0x2f)==4) // BEQ
4914       {
4915         #ifdef HAVE_CMOV_IMM
4916         if(s1h<0) {
4917           if(s2l>=0) emit_cmp(s1l,s2l);
4918           else emit_test(s1l,s1l);
4919           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4920         }
4921         else
4922         #endif
4923         {
4924           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4925           if(s1h>=0) {
4926             if(s2h>=0) emit_cmp(s1h,s2h);
4927             else emit_test(s1h,s1h);
4928             emit_cmovne_reg(alt,addr);
4929           }
4930           if(s2l>=0) emit_cmp(s1l,s2l);
4931           else emit_test(s1l,s1l);
4932           emit_cmovne_reg(alt,addr);
4933         }
4934       }
4935       if((opcode[i]&0x2f)==5) // BNE
4936       {
4937         #ifdef HAVE_CMOV_IMM
4938         if(s1h<0) {
4939           if(s2l>=0) emit_cmp(s1l,s2l);
4940           else emit_test(s1l,s1l);
4941           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4942         }
4943         else
4944         #endif
4945         {
4946           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4947           if(s1h>=0) {
4948             if(s2h>=0) emit_cmp(s1h,s2h);
4949             else emit_test(s1h,s1h);
4950             emit_cmovne_reg(alt,addr);
4951           }
4952           if(s2l>=0) emit_cmp(s1l,s2l);
4953           else emit_test(s1l,s1l);
4954           emit_cmovne_reg(alt,addr);
4955         }
4956       }
4957       if((opcode[i]&0x2f)==6) // BLEZ
4958       {
4959         //emit_movimm(ba[i],alt);
4960         //emit_movimm(start+i*4+8,addr);
4961         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4962         emit_cmpimm(s1l,1);
4963         if(s1h>=0) emit_mov(addr,ntaddr);
4964         emit_cmovl_reg(alt,addr);
4965         if(s1h>=0) {
4966           emit_test(s1h,s1h);
4967           emit_cmovne_reg(ntaddr,addr);
4968           emit_cmovs_reg(alt,addr);
4969         }
4970       }
4971       if((opcode[i]&0x2f)==7) // BGTZ
4972       {
4973         //emit_movimm(ba[i],addr);
4974         //emit_movimm(start+i*4+8,ntaddr);
4975         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4976         emit_cmpimm(s1l,1);
4977         if(s1h>=0) emit_mov(addr,alt);
4978         emit_cmovl_reg(ntaddr,addr);
4979         if(s1h>=0) {
4980           emit_test(s1h,s1h);
4981           emit_cmovne_reg(alt,addr);
4982           emit_cmovs_reg(ntaddr,addr);
4983         }
4984       }
4985       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4986       {
4987         //emit_movimm(ba[i],alt);
4988         //emit_movimm(start+i*4+8,addr);
4989         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4990         if(s1h>=0) emit_test(s1h,s1h);
4991         else emit_test(s1l,s1l);
4992         emit_cmovs_reg(alt,addr);
4993       }
4994       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4995       {
4996         //emit_movimm(ba[i],addr);
4997         //emit_movimm(start+i*4+8,alt);
4998         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4999         if(s1h>=0) emit_test(s1h,s1h);
5000         else emit_test(s1l,s1l);
5001         emit_cmovs_reg(alt,addr);
5002       }
5003       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
5004         if(source[i]&0x10000) // BC1T
5005         {
5006           //emit_movimm(ba[i],alt);
5007           //emit_movimm(start+i*4+8,addr);
5008           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5009           emit_testimm(s1l,0x800000);
5010           emit_cmovne_reg(alt,addr);
5011         }
5012         else // BC1F
5013         {
5014           //emit_movimm(ba[i],addr);
5015           //emit_movimm(start+i*4+8,alt);
5016           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5017           emit_testimm(s1l,0x800000);
5018           emit_cmovne_reg(alt,addr);
5019         }
5020       }
5021       emit_writeword(addr,(int)&pcaddr);
5022     }
5023     else
5024     if(itype[i]==RJUMP)
5025     {
5026       int r=get_reg(branch_regs[i].regmap,rs1[i]);
5027       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5028         r=get_reg(branch_regs[i].regmap,RTEMP);
5029       }
5030       emit_writeword(r,(int)&pcaddr);
5031     }
5032     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
5033   }
5034   // Update cycle count
5035   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
5036   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
5037   emit_call((int)cc_interrupt);
5038   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
5039   if(stubs[n][6]==TAKEN) {
5040     if(internal_branch(branch_regs[i].is32,ba[i]))
5041       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
5042     else if(itype[i]==RJUMP) {
5043       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
5044         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
5045       else
5046         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
5047     }
5048   }else if(stubs[n][6]==NOTTAKEN) {
5049     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
5050     else load_all_regs(branch_regs[i].regmap);
5051   }else if(stubs[n][6]==NULLDS) {
5052     // Delay slot instruction is nullified ("likely" branch)
5053     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
5054     else load_all_regs(regs[i].regmap);
5055   }else{
5056     load_all_regs(branch_regs[i].regmap);
5057   }
5058   emit_jmp(stubs[n][2]); // return address
5059   
5060   /* This works but uses a lot of memory...
5061   emit_readword((int)&last_count,ECX);
5062   emit_add(HOST_CCREG,ECX,EAX);
5063   emit_writeword(EAX,(int)&Count);
5064   emit_call((int)gen_interupt);
5065   emit_readword((int)&Count,HOST_CCREG);
5066   emit_readword((int)&next_interupt,EAX);
5067   emit_readword((int)&pending_exception,EBX);
5068   emit_writeword(EAX,(int)&last_count);
5069   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
5070   emit_test(EBX,EBX);
5071   int jne_instr=(int)out;
5072   emit_jne(0);
5073   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
5074   load_all_regs(branch_regs[i].regmap);
5075   emit_jmp(stubs[n][2]); // return address
5076   set_jump_target(jne_instr,(int)out);
5077   emit_readword((int)&pcaddr,EAX);
5078   // Call get_addr_ht instead of doing the hash table here.
5079   // This code is executed infrequently and takes up a lot of space
5080   // so smaller is better.
5081   emit_storereg(CCREG,HOST_CCREG);
5082   emit_pushreg(EAX);
5083   emit_call((int)get_addr_ht);
5084   emit_loadreg(CCREG,HOST_CCREG);
5085   emit_addimm(ESP,4,ESP);
5086   emit_jmpreg(EAX);*/
5087 }
5088
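// Queue a direct jump emitted at 'addr' so it can later be patched to point
// at the (virtual) branch target 'target' once assembly is finished.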
5089 void add_to_linker(int addr,int target,int ext)
5090 {
5091   link_addr[linkcount][0]=addr;
5092   link_addr[linkcount][1]=target;
5093   link_addr[linkcount][2]=ext;  
5094   linkcount++;
5095 }
5096
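// J/JAL: the delay slot is assembled first, the return address is written to
// r31 for JAL, then either a special entry is assembled when the target is
// another branch's delay slot, or a jump is emitted to be resolved by the
// linker (add_to_linker).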
5097 void ujump_assemble(int i,struct regstat *i_regs)
5098 {
5099   signed char *i_regmap=i_regs->regmap;
5100   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5101   address_generation(i+1,i_regs,regs[i].regmap_entry);
5102   #ifdef REG_PREFETCH
5103   int temp=get_reg(branch_regs[i].regmap,PTEMP);
5104   if(rt1[i]==31&&temp>=0) 
5105   {
5106     int return_address=start+i*4+8;
5107     if(get_reg(branch_regs[i].regmap,31)>0) 
5108     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5109   }
5110   #endif
5111   ds_assemble(i+1,i_regs);
5112   uint64_t bc_unneeded=branch_regs[i].u;
5113   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5114   bc_unneeded|=1|(1LL<<rt1[i]);
5115   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5116   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5117                 bc_unneeded,bc_unneeded_upper);
5118   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5119   if(rt1[i]==31) {
5120     int rt;
5121     unsigned int return_address;
5122     assert(rt1[i+1]!=31);
5123     assert(rt2[i+1]!=31);
5124     rt=get_reg(branch_regs[i].regmap,31);
5125     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5126     //assert(rt>=0);
5127     return_address=start+i*4+8;
5128     if(rt>=0) {
5129       #ifdef USE_MINI_HT
5130       if(internal_branch(branch_regs[i].is32,return_address)) {
5131         int temp=rt+1;
5132         if(temp==EXCLUDE_REG||temp>=HOST_REGS||
5133            branch_regs[i].regmap[temp]>=0)
5134         {
5135           temp=get_reg(branch_regs[i].regmap,-1);
5136         }
5137         #ifdef HOST_TEMPREG
5138         if(temp<0) temp=HOST_TEMPREG;
5139         #endif
5140         if(temp>=0) do_miniht_insert(return_address,rt,temp);
5141         else emit_movimm(return_address,rt);
5142       }
5143       else
5144       #endif
5145       {
5146         #ifdef REG_PREFETCH
5147         if(temp>=0) 
5148         {
5149           if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5150         }
5151         #endif
5152         emit_movimm(return_address,rt); // PC into link register
5153         #ifdef IMM_PREFETCH
5154         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5155         #endif
5156       }
5157     }
5158   }
5159   int cc,adj;
5160   cc=get_reg(branch_regs[i].regmap,CCREG);
5161   assert(cc==HOST_CCREG);
5162   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5163   #ifdef REG_PREFETCH
5164   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5165   #endif
5166   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5167   if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5168   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5169   if(internal_branch(branch_regs[i].is32,ba[i]))
5170     assem_debug("branch: internal\n");
5171   else
5172     assem_debug("branch: external\n");
5173   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
5174     ds_assemble_entry(i);
5175   }
5176   else {
5177     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
5178     emit_jmp(0);
5179   }
5180 }
5181
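// JR/JALR: if the delay slot overwrites the branch-address register, the
// address is copied to RTEMP first; the jump is then dispatched through
// jump_vaddr_reg (or the mini hash table when USE_MINI_HT is defined).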
5182 void rjump_assemble(int i,struct regstat *i_regs)
5183 {
5184   signed char *i_regmap=i_regs->regmap;
5185   int temp;
5186   int rs,cc,adj;
5187   rs=get_reg(branch_regs[i].regmap,rs1[i]);
5188   assert(rs>=0);
5189   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5190     // Delay slot abuse, make a copy of the branch address register
5191     temp=get_reg(branch_regs[i].regmap,RTEMP);
5192     assert(temp>=0);
5193     assert(regs[i].regmap[temp]==RTEMP);
5194     emit_mov(rs,temp);
5195     rs=temp;
5196   }
5197   address_generation(i+1,i_regs,regs[i].regmap_entry);
5198   #ifdef REG_PREFETCH
5199   if(rt1[i]==31) 
5200   {
5201     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
5202       int return_address=start+i*4+8;
5203       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5204     }
5205   }
5206   #endif
5207   #ifdef USE_MINI_HT
5208   if(rs1[i]==31) {
5209     int rh=get_reg(regs[i].regmap,RHASH);
5210     if(rh>=0) do_preload_rhash(rh);
5211   }
5212   #endif
5213   ds_assemble(i+1,i_regs);
5214   uint64_t bc_unneeded=branch_regs[i].u;
5215   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5216   bc_unneeded|=1|(1LL<<rt1[i]);
5217   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5218   bc_unneeded&=~(1LL<<rs1[i]);
5219   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5220                 bc_unneeded,bc_unneeded_upper);
5221   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
5222   if(rt1[i]!=0) {
5223     int rt,return_address;
5224     assert(rt1[i+1]!=rt1[i]);
5225     assert(rt2[i+1]!=rt1[i]);
5226     rt=get_reg(branch_regs[i].regmap,rt1[i]);
5227     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5228     assert(rt>=0);
5229     return_address=start+i*4+8;
5230     #ifdef REG_PREFETCH
5231     if(temp>=0) 
5232     {
5233       if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5234     }
5235     #endif
5236     emit_movimm(return_address,rt); // PC into link register
5237     #ifdef IMM_PREFETCH
5238     emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5239     #endif
5240   }
5241   cc=get_reg(branch_regs[i].regmap,CCREG);
5242   assert(cc==HOST_CCREG);
5243   #ifdef USE_MINI_HT
5244   int rh=get_reg(branch_regs[i].regmap,RHASH);
5245   int ht=get_reg(branch_regs[i].regmap,RHTBL);
5246   if(rs1[i]==31) {
5247     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
5248     do_preload_rhtbl(ht);
5249     do_rhash(rs,rh);
5250   }
5251   #endif
5252   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5253   #ifdef DESTRUCTIVE_WRITEBACK
5254   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
5255     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
5256       emit_loadreg(rs1[i],rs);
5257     }
5258   }
5259   #endif
5260   #ifdef REG_PREFETCH
5261   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5262   #endif
5263   #ifdef USE_MINI_HT
5264   if(rs1[i]==31) {
5265     do_miniht_load(ht,rh);
5266   }
5267   #endif
5268   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
5269   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
5270   //assert(adj==0);
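  // Add this block's cycles to the cycle counter; if the result is no longer
  // negative an event is due, so the jns below is patched to enter the
  // CC_STUB (which calls cc_interrupt) before dispatching to the target.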
5271   emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5272   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
5273   emit_jns(0);
5274   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5275   #ifdef USE_MINI_HT
5276   if(rs1[i]==31) {
5277     do_miniht_jump(rs,rh,ht);
5278   }
5279   else
5280   #endif
5281   {
5282     //if(rs!=EAX) emit_mov(rs,EAX);
5283     //emit_jmp((int)jump_vaddr_eax);
5284     emit_jmp(jump_vaddr_reg[rs]);
5285   }
5286   /* Check hash table
5287   temp=!rs;
5288   emit_mov(rs,temp);
5289   emit_shrimm(rs,16,rs);
5290   emit_xor(temp,rs,rs);
5291   emit_movzwl_reg(rs,rs);
5292   emit_shlimm(rs,4,rs);
5293   emit_cmpmem_indexed((int)hash_table,rs,temp);
5294   emit_jne((int)out+14);
5295   emit_readword_indexed((int)hash_table+4,rs,rs);
5296   emit_jmpreg(rs);
5297   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
5298   emit_addimm_no_flags(8,rs);
5299   emit_jeq((int)out-17);
5300   // No hit on hash table, call compiler
5301   emit_pushreg(temp);
5302 //DEBUG >
5303 #ifdef DEBUG_CYCLE_COUNT
5304   emit_readword((int)&last_count,ECX);
5305   emit_add(HOST_CCREG,ECX,HOST_CCREG);
5306   emit_readword((int)&next_interupt,ECX);
5307   emit_writeword(HOST_CCREG,(int)&Count);
5308   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
5309   emit_writeword(ECX,(int)&last_count);
5310 #endif
5311 //DEBUG <
5312   emit_storereg(CCREG,HOST_CCREG);
5313   emit_call((int)get_addr);
5314   emit_loadreg(CCREG,HOST_CCREG);
5315   emit_addimm(ESP,4,ESP);
5316   emit_jmpreg(EAX);*/
5317   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5318   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5319   #endif
5320 }
5321
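// BEQ/BNE/BLEZ/BGTZ and their "likely" forms.  Depending on ooo[] the delay
// slot is assembled either before or after the compare, and the condition is
// inverted when the register mapping at the target doesn't match, so the
// register writeback for the taken path can be jumped over.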
5322 void cjump_assemble(int i,struct regstat *i_regs)
5323 {
5324   signed char *i_regmap=i_regs->regmap;
5325   int cc;
5326   int match;
5327   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5328   assem_debug("match=%d\n",match);
5329   int s1h,s1l,s2h,s2l;
5330   int prev_cop1_usable=cop1_usable;
5331   int unconditional=0,nop=0;
5332   int only32=0;
5333   int invert=0;
5334   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5335   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5336   if(!match) invert=1;
5337   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5338   if(i>(ba[i]-start)>>2) invert=1;
5339   #endif
5340   
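  // Source registers: with the delay slot scheduled first (ooo) they must be
  // read from the post-delay-slot mapping (branch_regs), otherwise from the
  // mapping at the branch itself.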
5341   if(ooo[i]) {
5342     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5343     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5344     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5345     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5346   }
5347   else {
5348     s1l=get_reg(i_regmap,rs1[i]);
5349     s1h=get_reg(i_regmap,rs1[i]|64);
5350     s2l=get_reg(i_regmap,rs2[i]);
5351     s2h=get_reg(i_regmap,rs2[i]|64);
5352   }
5353   if(rs1[i]==0&&rs2[i]==0)
5354   {
5355     if(opcode[i]&1) nop=1;
5356     else unconditional=1;
5357     //assert(opcode[i]!=5);
5358     //assert(opcode[i]!=7);
5359     //assert(opcode[i]!=0x15);
5360     //assert(opcode[i]!=0x17);
5361   }
5362   else if(rs1[i]==0)
5363   {
5364     s1l=s2l;s1h=s2h;
5365     s2l=s2h=-1;
5366     only32=(regs[i].was32>>rs2[i])&1;
5367   }
5368   else if(rs2[i]==0)
5369   {
5370     s2l=s2h=-1;
5371     only32=(regs[i].was32>>rs1[i])&1;
5372   }
5373   else {
5374     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5375   }
5376
5377   if(ooo[i]) {
5378     // Out of order execution (delay slot first)
5379     //printf("OOOE\n");
5380     address_generation(i+1,i_regs,regs[i].regmap_entry);
5381     ds_assemble(i+1,i_regs);
5382     int adj;
5383     uint64_t bc_unneeded=branch_regs[i].u;
5384     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5385     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5386     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5387     bc_unneeded|=1;
5388     bc_unneeded_upper|=1;
5389     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5390                   bc_unneeded,bc_unneeded_upper);
5391     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5392     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5393     cc=get_reg(branch_regs[i].regmap,CCREG);
5394     assert(cc==HOST_CCREG);
5395     if(unconditional) 
5396       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5397     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5398     //assem_debug("cycle count (adj)\n");
5399     if(unconditional) {
5400       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5401       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5402         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5403         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5404         if(internal)
5405           assem_debug("branch: internal\n");
5406         else
5407           assem_debug("branch: external\n");
5408         if(internal&&is_ds[(ba[i]-start)>>2]) {
5409           ds_assemble_entry(i);
5410         }
5411         else {
5412           add_to_linker((int)out,ba[i],internal);
5413           emit_jmp(0);
5414         }
5415         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5416         if(((u_int)out)&7) emit_addnop(0);
5417         #endif
5418       }
5419     }
5420     else if(nop) {
5421       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5422       int jaddr=(int)out;
5423       emit_jns(0);
5424       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5425     }
5426     else {
5427       int taken=0,nottaken=0,nottaken1=0;
5428       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5429       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5430       if(!only32)
5431       {
5432         assert(s1h>=0);
5433         if(opcode[i]==4) // BEQ
5434         {
5435           if(s2h>=0) emit_cmp(s1h,s2h);
5436           else emit_test(s1h,s1h);
5437           nottaken1=(int)out;
5438           emit_jne(1);
5439         }
5440         if(opcode[i]==5) // BNE
5441         {
5442           if(s2h>=0) emit_cmp(s1h,s2h);
5443           else emit_test(s1h,s1h);
5444           if(invert) taken=(int)out;
5445           else add_to_linker((int)out,ba[i],internal);
5446           emit_jne(0);
5447         }
5448         if(opcode[i]==6) // BLEZ
5449         {
5450           emit_test(s1h,s1h);
5451           if(invert) taken=(int)out;
5452           else add_to_linker((int)out,ba[i],internal);
5453           emit_js(0);
5454           nottaken1=(int)out;
5455           emit_jne(1);
5456         }
5457         if(opcode[i]==7) // BGTZ
5458         {
5459           emit_test(s1h,s1h);
5460           nottaken1=(int)out;
5461           emit_js(1);
5462           if(invert) taken=(int)out;
5463           else add_to_linker((int)out,ba[i],internal);
5464           emit_jne(0);
5465         }
5466       } // if(!only32)
5467           
5468       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5469       assert(s1l>=0);
5470       if(opcode[i]==4) // BEQ
5471       {
5472         if(s2l>=0) emit_cmp(s1l,s2l);
5473         else emit_test(s1l,s1l);
5474         if(invert){
5475           nottaken=(int)out;
5476           emit_jne(1);
5477         }else{
5478           add_to_linker((int)out,ba[i],internal);
5479           emit_jeq(0);
5480         }
5481       }
5482       if(opcode[i]==5) // BNE
5483       {
5484         if(s2l>=0) emit_cmp(s1l,s2l);
5485         else emit_test(s1l,s1l);
5486         if(invert){
5487           nottaken=(int)out;
5488           emit_jeq(1);
5489         }else{
5490           add_to_linker((int)out,ba[i],internal);
5491           emit_jne(0);
5492         }
5493       }
5494       if(opcode[i]==6) // BLEZ
5495       {
5496         emit_cmpimm(s1l,1);
5497         if(invert){
5498           nottaken=(int)out;
5499           emit_jge(1);
5500         }else{
5501           add_to_linker((int)out,ba[i],internal);
5502           emit_jl(0);
5503         }
5504       }
5505       if(opcode[i]==7) // BGTZ
5506       {
5507         emit_cmpimm(s1l,1);
5508         if(invert){
5509           nottaken=(int)out;
5510           emit_jl(1);
5511         }else{
5512           add_to_linker((int)out,ba[i],internal);
5513           emit_jge(0);
5514         }
5515       }
5516       if(invert) {
5517         if(taken) set_jump_target(taken,(int)out);
5518         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5519         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5520           if(adj) {
5521             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5522             add_to_linker((int)out,ba[i],internal);
5523           }else{
5524             emit_addnop(13);
5525             add_to_linker((int)out,ba[i],internal*2);
5526           }
5527           emit_jmp(0);
5528         }else
5529         #endif
5530         {
5531           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5532           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5533           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5534           if(internal)
5535             assem_debug("branch: internal\n");
5536           else
5537             assem_debug("branch: external\n");
5538           if(internal&&is_ds[(ba[i]-start)>>2]) {
5539             ds_assemble_entry(i);
5540           }
5541           else {
5542             add_to_linker((int)out,ba[i],internal);
5543             emit_jmp(0);
5544           }
5545         }
5546         set_jump_target(nottaken,(int)out);
5547       }
5548
5549       if(nottaken1) set_jump_target(nottaken1,(int)out);
5550       if(adj) {
5551         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5552       }
5553     } // (!unconditional)
5554   } // if(ooo)
5555   else
5556   {
5557     // In-order execution (branch first)
5558     //if(likely[i]) printf("IOL\n");
5559     //else
5560     //printf("IOE\n");
5561     int taken=0,nottaken=0,nottaken1=0;
5562     if(!unconditional&&!nop) {
5563       if(!only32)
5564       {
5565         assert(s1h>=0);
5566         if((opcode[i]&0x2f)==4) // BEQ
5567         {
5568           if(s2h>=0) emit_cmp(s1h,s2h);
5569           else emit_test(s1h,s1h);
5570           nottaken1=(int)out;
5571           emit_jne(2);
5572         }
5573         if((opcode[i]&0x2f)==5) // BNE
5574         {
5575           if(s2h>=0) emit_cmp(s1h,s2h);
5576           else emit_test(s1h,s1h);
5577           taken=(int)out;
5578           emit_jne(1);
5579         }
5580         if((opcode[i]&0x2f)==6) // BLEZ
5581         {
5582           emit_test(s1h,s1h);
5583           taken=(int)out;
5584           emit_js(1);
5585           nottaken1=(int)out;
5586           emit_jne(2);
5587         }
5588         if((opcode[i]&0x2f)==7) // BGTZ
5589         {
5590           emit_test(s1h,s1h);
5591           nottaken1=(int)out;
5592           emit_js(2);
5593           taken=(int)out;
5594           emit_jne(1);
5595         }
5596       } // if(!only32)
5597           
5598       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5599       assert(s1l>=0);
5600       if((opcode[i]&0x2f)==4) // BEQ
5601       {
5602         if(s2l>=0) emit_cmp(s1l,s2l);
5603         else emit_test(s1l,s1l);
5604         nottaken=(int)out;
5605         emit_jne(2);
5606       }
5607       if((opcode[i]&0x2f)==5) // BNE
5608       {
5609         if(s2l>=0) emit_cmp(s1l,s2l);
5610         else emit_test(s1l,s1l);
5611         nottaken=(int)out;
5612         emit_jeq(2);
5613       }
5614       if((opcode[i]&0x2f)==6) // BLEZ
5615       {
5616         emit_cmpimm(s1l,1);
5617         nottaken=(int)out;
5618         emit_jge(2);
5619       }
5620       if((opcode[i]&0x2f)==7) // BGTZ
5621       {
5622         emit_cmpimm(s1l,1);
5623         nottaken=(int)out;
5624         emit_jl(2);
5625       }
5626     } // if(!unconditional)
5627     int adj;
5628     uint64_t ds_unneeded=branch_regs[i].u;
5629     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5630     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5631     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5632     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5633     ds_unneeded|=1;
5634     ds_unneeded_upper|=1;
5635     // branch taken
5636     if(!nop) {
5637       if(taken) set_jump_target(taken,(int)out);
5638       assem_debug("1:\n");
5639       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5640                     ds_unneeded,ds_unneeded_upper);
5641       // load regs
5642       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5643       address_generation(i+1,&branch_regs[i],0);
5644       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5645       ds_assemble(i+1,&branch_regs[i]);
5646       cc=get_reg(branch_regs[i].regmap,CCREG);
5647       if(cc==-1) {
5648         emit_loadreg(CCREG,cc=HOST_CCREG);
5649         // CHECK: Is the following instruction (fall thru) allocated ok?
5650       }
5651       assert(cc==HOST_CCREG);
5652       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5653       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5654       assem_debug("cycle count (adj)\n");
5655       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5656       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5657       if(internal)
5658         assem_debug("branch: internal\n");
5659       else
5660         assem_debug("branch: external\n");
5661       if(internal&&is_ds[(ba[i]-start)>>2]) {
5662         ds_assemble_entry(i);
5663       }
5664       else {
5665         add_to_linker((int)out,ba[i],internal);
5666         emit_jmp(0);
5667       }
5668     }
5669     // branch not taken
5670     cop1_usable=prev_cop1_usable;
5671     if(!unconditional) {
5672       if(nottaken1) set_jump_target(nottaken1,(int)out);
5673       set_jump_target(nottaken,(int)out);
5674       assem_debug("2:\n");
5675       if(!likely[i]) {
5676         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5677                       ds_unneeded,ds_unneeded_upper);
5678         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5679         address_generation(i+1,&branch_regs[i],0);
5680         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5681         ds_assemble(i+1,&branch_regs[i]);
5682       }
5683       cc=get_reg(branch_regs[i].regmap,CCREG);
5684       if(cc==-1&&!likely[i]) {
5685         // Cycle count isn't in a register, temporarily load it then write it out
5686         emit_loadreg(CCREG,HOST_CCREG);
5687         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5688         int jaddr=(int)out;
5689         emit_jns(0);
5690         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5691         emit_storereg(CCREG,HOST_CCREG);
5692       }
5693       else{
5694         cc=get_reg(i_regmap,CCREG);
5695         assert(cc==HOST_CCREG);
5696         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5697         int jaddr=(int)out;
5698         emit_jns(0);
5699         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5700       }
5701     }
5702   }
5703 }
5704
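// REGIMM branches that test the sign of one register (BLTZ/BGEZ plus the
// AL/likely variants); the BxxAL forms also write the return address to r31.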
5705 void sjump_assemble(int i,struct regstat *i_regs)
5706 {
5707   signed char *i_regmap=i_regs->regmap;
5708   int cc;
5709   int match;
5710   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5711   assem_debug("smatch=%d\n",match);
5712   int s1h,s1l;
5713   int prev_cop1_usable=cop1_usable;
5714   int unconditional=0,nevertaken=0;
5715   int only32=0;
5716   int invert=0;
5717   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5718   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5719   if(!match) invert=1;
5720   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5721   if(i>(ba[i]-start)>>2) invert=1;
5722   #endif
5723
5724   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5725   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5726
5727   if(ooo[i]) {
5728     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5729     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5730   }
5731   else {
5732     s1l=get_reg(i_regmap,rs1[i]);
5733     s1h=get_reg(i_regmap,rs1[i]|64);
5734   }
5735   if(rs1[i]==0)
5736   {
5737     if(opcode2[i]&1) unconditional=1;
5738     else nevertaken=1;
5739     // These are never taken (r0 is never less than zero)
5740     //assert(opcode2[i]!=0);
5741     //assert(opcode2[i]!=2);
5742     //assert(opcode2[i]!=0x10);
5743     //assert(opcode2[i]!=0x12);
5744   }
5745   else {
5746     only32=(regs[i].was32>>rs1[i])&1;
5747   }
5748
5749   if(ooo[i]) {
5750     // Out of order execution (delay slot first)
5751     //printf("OOOE\n");
5752     address_generation(i+1,i_regs,regs[i].regmap_entry);
5753     ds_assemble(i+1,i_regs);
5754     int adj;
5755     uint64_t bc_unneeded=branch_regs[i].u;
5756     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5757     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5758     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5759     bc_unneeded|=1;
5760     bc_unneeded_upper|=1;
5761     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5762                   bc_unneeded,bc_unneeded_upper);
5763     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5764     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5765     if(rt1[i]==31) {
5766       int rt,return_address;
5767       rt=get_reg(branch_regs[i].regmap,31);
5768       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5769       if(rt>=0) {
5770         // Save the PC even if the branch is not taken
5771         return_address=start+i*4+8;
5772         emit_movimm(return_address,rt); // PC into link register
5773         #ifdef IMM_PREFETCH
5774         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5775         #endif
5776       }
5777     }
5778     cc=get_reg(branch_regs[i].regmap,CCREG);
5779     assert(cc==HOST_CCREG);
5780     if(unconditional) 
5781       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5782     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5783     assem_debug("cycle count (adj)\n");
5784     if(unconditional) {
5785       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5786       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5787         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5788         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5789         if(internal)
5790           assem_debug("branch: internal\n");
5791         else
5792           assem_debug("branch: external\n");
5793         if(internal&&is_ds[(ba[i]-start)>>2]) {
5794           ds_assemble_entry(i);
5795         }
5796         else {
5797           add_to_linker((int)out,ba[i],internal);
5798           emit_jmp(0);
5799         }
5800         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5801         if(((u_int)out)&7) emit_addnop(0);
5802         #endif
5803       }
5804     }
5805     else if(nevertaken) {
5806       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5807       int jaddr=(int)out;
5808       emit_jns(0);
5809       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5810     }
5811     else {
5812       int nottaken=0;
5813       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5814       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5815       if(!only32)
5816       {
5817         assert(s1h>=0);
5818         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5819         {
5820           emit_test(s1h,s1h);
5821           if(invert){
5822             nottaken=(int)out;
5823             emit_jns(1);
5824           }else{
5825             add_to_linker((int)out,ba[i],internal);
5826             emit_js(0);
5827           }
5828         }
5829         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5830         {
5831           emit_test(s1h,s1h);
5832           if(invert){
5833             nottaken=(int)out;
5834             emit_js(1);
5835           }else{
5836             add_to_linker((int)out,ba[i],internal);
5837             emit_jns(0);
5838           }
5839         }
5840       } // if(!only32)
5841       else
5842       {
5843         assert(s1l>=0);
5844         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5845         {
5846           emit_test(s1l,s1l);
5847           if(invert){
5848             nottaken=(int)out;
5849             emit_jns(1);
5850           }else{
5851             add_to_linker((int)out,ba[i],internal);
5852             emit_js(0);
5853           }
5854         }
5855         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5856         {
5857           emit_test(s1l,s1l);
5858           if(invert){
5859             nottaken=(int)out;
5860             emit_js(1);
5861           }else{
5862             add_to_linker((int)out,ba[i],internal);
5863             emit_jns(0);
5864           }
5865         }
5866       } // if(!only32)
5867           
5868       if(invert) {
5869         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5870         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5871           if(adj) {
5872             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5873             add_to_linker((int)out,ba[i],internal);
5874           }else{
5875             emit_addnop(13);
5876             add_to_linker((int)out,ba[i],internal*2);
5877           }
5878           emit_jmp(0);
5879         }else
5880         #endif
5881         {
5882           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5883           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5884           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5885           if(internal)
5886             assem_debug("branch: internal\n");
5887           else
5888             assem_debug("branch: external\n");
5889           if(internal&&is_ds[(ba[i]-start)>>2]) {
5890             ds_assemble_entry(i);
5891           }
5892           else {
5893             add_to_linker((int)out,ba[i],internal);
5894             emit_jmp(0);
5895           }
5896         }
5897         set_jump_target(nottaken,(int)out);
5898       }
5899
5900       if(adj) {
5901         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5902       }
5903     } // (!unconditional)
5904   } // if(ooo)
5905   else
5906   {
5907     // In-order execution (branch first)
5908     //printf("IOE\n");
5909     int nottaken=0;
5910     if(rt1[i]==31) {
5911       int rt,return_address;
5912       rt=get_reg(branch_regs[i].regmap,31);
5913       if(rt>=0) {
5914         // Save the PC even if the branch is not taken
5915         return_address=start+i*4+8;
5916         emit_movimm(return_address,rt); // PC into link register
5917         #ifdef IMM_PREFETCH
5918         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5919         #endif
5920       }
5921     }
5922     if(!unconditional) {
5923       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5924       if(!only32)
5925       {
5926         assert(s1h>=0);
5927         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5928         {
5929           emit_test(s1h,s1h);
5930           nottaken=(int)out;
5931           emit_jns(1);
5932         }
5933         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5934         {
5935           emit_test(s1h,s1h);
5936           nottaken=(int)out;
5937           emit_js(1);
5938         }
5939       } // if(!only32)
5940       else
5941       {
5942         assert(s1l>=0);
5943         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5944         {
5945           emit_test(s1l,s1l);
5946           nottaken=(int)out;
5947           emit_jns(1);
5948         }
5949         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5950         {
5951           emit_test(s1l,s1l);
5952           nottaken=(int)out;
5953           emit_js(1);
5954         }
5955       }
5956     } // if(!unconditional)
5957     int adj;
5958     uint64_t ds_unneeded=branch_regs[i].u;
5959     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5960     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5961     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5962     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5963     ds_unneeded|=1;
5964     ds_unneeded_upper|=1;
5965     // branch taken
5966     if(!nevertaken) {
5967       //assem_debug("1:\n");
5968       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5969                     ds_unneeded,ds_unneeded_upper);
5970       // load regs
5971       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5972       address_generation(i+1,&branch_regs[i],0);
5973       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5974       ds_assemble(i+1,&branch_regs[i]);
5975       cc=get_reg(branch_regs[i].regmap,CCREG);
5976       if(cc==-1) {
5977         emit_loadreg(CCREG,cc=HOST_CCREG);
5978         // CHECK: Is the following instruction (fall thru) allocated ok?
5979       }
5980       assert(cc==HOST_CCREG);
5981       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5982       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5983       assem_debug("cycle count (adj)\n");
5984       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5985       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5986       if(internal)
5987         assem_debug("branch: internal\n");
5988       else
5989         assem_debug("branch: external\n");
5990       if(internal&&is_ds[(ba[i]-start)>>2]) {
5991         ds_assemble_entry(i);
5992       }
5993       else {
5994         add_to_linker((int)out,ba[i],internal);
5995         emit_jmp(0);
5996       }
5997     }
5998     // branch not taken
5999     cop1_usable=prev_cop1_usable;
6000     if(!unconditional) {
6001       set_jump_target(nottaken,(int)out);
6002       assem_debug("1:\n");
6003       if(!likely[i]) {
6004         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6005                       ds_unneeded,ds_unneeded_upper);
6006         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6007         address_generation(i+1,&branch_regs[i],0);
6008         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6009         ds_assemble(i+1,&branch_regs[i]);
6010       }
6011       cc=get_reg(branch_regs[i].regmap,CCREG);
6012       if(cc==-1&&!likely[i]) {
6013         // Cycle count isn't in a register, temporarily load it then write it out
6014         emit_loadreg(CCREG,HOST_CCREG);
6015         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6016         int jaddr=(int)out;
6017         emit_jns(0);
6018         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6019         emit_storereg(CCREG,HOST_CCREG);
6020       }
6021       else{
6022         cc=get_reg(i_regmap,CCREG);
6023         assert(cc==HOST_CCREG);
6024         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
6025         int jaddr=(int)out;
6026         emit_jns(0);
6027         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6028       }
6029     }
6030   }
6031 }
6032
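// BC1T/BC1F: branch on the COP1 condition bit (0x800000 in the FP status
// register).  If COP1 hasn't been checked yet in this block, a test of the
// Status register is emitted first, trapping to an FP_STUB when the
// coprocessor is unusable.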
6033 void fjump_assemble(int i,struct regstat *i_regs)
6034 {
6035   signed char *i_regmap=i_regs->regmap;
6036   int cc;
6037   int match;
6038   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6039   assem_debug("fmatch=%d\n",match);
6040   int fs,cs;
6041   int eaddr;
6042   int invert=0;
6043   int internal=internal_branch(branch_regs[i].is32,ba[i]);
6044   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
6045   if(!match) invert=1;
6046   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6047   if(i>(ba[i]-start)>>2) invert=1;
6048   #endif
6049
6050   if(ooo[i]) {
6051     fs=get_reg(branch_regs[i].regmap,FSREG);
6052     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
6053   }
6054   else {
6055     fs=get_reg(i_regmap,FSREG);
6056   }
6057
6058   // Check cop1 unusable
6059   if(!cop1_usable) {
6060     cs=get_reg(i_regmap,CSREG);
6061     assert(cs>=0);
6062     emit_testimm(cs,0x20000000);
6063     eaddr=(int)out;
6064     emit_jeq(0);
6065     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
6066     cop1_usable=1;
6067   }
6068
6069   if(ooo[i]) {
6070     // Out of order execution (delay slot first)
6071     //printf("OOOE\n");
6072     ds_assemble(i+1,i_regs);
6073     int adj;
6074     uint64_t bc_unneeded=branch_regs[i].u;
6075     uint64_t bc_unneeded_upper=branch_regs[i].uu;
6076     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6077     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
6078     bc_unneeded|=1;
6079     bc_unneeded_upper|=1;
6080     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6081                   bc_unneeded,bc_unneeded_upper);
6082     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
6083     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6084     cc=get_reg(branch_regs[i].regmap,CCREG);
6085     assert(cc==HOST_CCREG);
6086     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
6087     assem_debug("cycle count (adj)\n");
6088     if(1) {
6089       int nottaken=0;
6090       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6091       if(1) {
6092         assert(fs>=0);
6093         emit_testimm(fs,0x800000);
6094         if(source[i]&0x10000) // BC1T
6095         {
6096           if(invert){
6097             nottaken=(int)out;
6098             emit_jeq(1);
6099           }else{
6100             add_to_linker((int)out,ba[i],internal);
6101             emit_jne(0);
6102           }
6103         }
6104         else // BC1F
6105         {
6106           if(invert){
6107             nottaken=(int)out;
6108             emit_jne(1);
6109           }else{
6110             add_to_linker((int)out,ba[i],internal);
6111             emit_jeq(0);
6112           }
6113         }
6114       } // if(!only32)
6115           
6116       if(invert) {
6117         if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
6118         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6119         else if(match) emit_addnop(13);
6120         #endif
6121         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6122         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6123         if(internal)
6124           assem_debug("branch: internal\n");
6125         else
6126           assem_debug("branch: external\n");
6127         if(internal&&is_ds[(ba[i]-start)>>2]) {
6128           ds_assemble_entry(i);
6129         }
6130         else {
6131           add_to_linker((int)out,ba[i],internal);
6132           emit_jmp(0);
6133         }
6134         set_jump_target(nottaken,(int)out);
6135       }
6136
6137       if(adj) {
6138         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
6139       }
6140     } // (!unconditional)
6141   } // if(ooo)
6142   else
6143   {
6144     // In-order execution (branch first)
6145     //printf("IOE\n");
6146     int nottaken=0;
6147     if(1) {
6148       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6149       if(1) {
6150         assert(fs>=0);
6151         emit_testimm(fs,0x800000);
6152         if(source[i]&0x10000) // BC1T
6153         {
6154           nottaken=(int)out;
6155           emit_jeq(1);
6156         }
6157         else // BC1F
6158         {
6159           nottaken=(int)out;
6160           emit_jne(1);
6161         }
6162       }
6163     } // if(!unconditional)
6164     int adj;
6165     uint64_t ds_unneeded=branch_regs[i].u;
6166     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6167     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6168     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6169     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6170     ds_unneeded|=1;
6171     ds_unneeded_upper|=1;
6172     // branch taken
6173     //assem_debug("1:\n");
6174     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6175                   ds_unneeded,ds_unneeded_upper);
6176     // load regs
6177     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6178     address_generation(i+1,&branch_regs[i],0);
6179     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6180     ds_assemble(i+1,&branch_regs[i]);
6181     cc=get_reg(branch_regs[i].regmap,CCREG);
6182     if(cc==-1) {
6183       emit_loadreg(CCREG,cc=HOST_CCREG);
6184       // CHECK: Is the following instruction (fall thru) allocated ok?
6185     }
6186     assert(cc==HOST_CCREG);
6187     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6188     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6189     assem_debug("cycle count (adj)\n");
6190     if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6191     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6192     if(internal)
6193       assem_debug("branch: internal\n");
6194     else
6195       assem_debug("branch: external\n");
6196     if(internal&&is_ds[(ba[i]-start)>>2]) {
6197       ds_assemble_entry(i);
6198     }
6199     else {
6200       add_to_linker((int)out,ba[i],internal);
6201       emit_jmp(0);
6202     }
6203
6204     // branch not taken
6205     if(1) { // <- FIXME (don't need this)
6206       set_jump_target(nottaken,(int)out);
6207       assem_debug("1:\n");
6208       if(!likely[i]) {
6209         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6210                       ds_unneeded,ds_unneeded_upper);
6211         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6212         address_generation(i+1,&branch_regs[i],0);
6213         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6214         ds_assemble(i+1,&branch_regs[i]);
6215       }
6216       cc=get_reg(branch_regs[i].regmap,CCREG);
6217       if(cc==-1&&!likely[i]) {
6218         // Cycle count isn't in a register, temporarily load it then write it out
6219         emit_loadreg(CCREG,HOST_CCREG);
6220         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6221         int jaddr=(int)out;
6222         emit_jns(0);
6223         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6224         emit_storereg(CCREG,HOST_CCREG);
6225       }
6226       else{
6227         cc=get_reg(i_regmap,CCREG);
6228         assert(cc==HOST_CCREG);
6229         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
6230         int jaddr=(int)out;
6231         emit_jns(0);
6232         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6233       }
6234     }
6235   }
6236 }
6237
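// Branch whose delay slot falls in the next virtual page: compute the target
// into HOST_BTREG here and jump to the delay-slot entry, which is assembled
// separately by pagespan_ds() below.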
6238 static void pagespan_assemble(int i,struct regstat *i_regs)
6239 {
6240   int s1l=get_reg(i_regs->regmap,rs1[i]);
6241   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
6242   int s2l=get_reg(i_regs->regmap,rs2[i]);
6243   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
6244   void *nt_branch=NULL;
6245   int taken=0;
6246   int nottaken=0;
6247   int unconditional=0;
6248   if(rs1[i]==0)
6249   {
6250     s1l=s2l;s1h=s2h;
6251     s2l=s2h=-1;
6252   }
6253   else if(rs2[i]==0)
6254   {
6255     s2l=s2h=-1;
6256   }
6257   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
6258     s1h=s2h=-1;
6259   }
6260   int hr=0;
6261   int addr,alt,ntaddr;
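  // Pick scratch host registers (addr, alt and, for BLEZ/BGTZ, ntaddr) that
  // hold neither of the branch sources, the cycle count nor BTREG.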
6262   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
6263   else {
6264     while(hr<HOST_REGS)
6265     {
6266       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
6267          (i_regs->regmap[hr]&63)!=rs1[i] &&
6268          (i_regs->regmap[hr]&63)!=rs2[i] )
6269       {
6270         addr=hr++;break;
6271       }
6272       hr++;
6273     }
6274   }
6275   while(hr<HOST_REGS)
6276   {
6277     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6278        (i_regs->regmap[hr]&63)!=rs1[i] &&
6279        (i_regs->regmap[hr]&63)!=rs2[i] )
6280     {
6281       alt=hr++;break;
6282     }
6283     hr++;
6284   }
6285   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
6286   {
6287     while(hr<HOST_REGS)
6288     {
6289       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6290          (i_regs->regmap[hr]&63)!=rs1[i] &&
6291          (i_regs->regmap[hr]&63)!=rs2[i] )
6292       {
6293         ntaddr=hr;break;
6294       }
6295       hr++;
6296     }
6297   }
6298   assert(hr<HOST_REGS);
6299   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6300     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6301   }
6302   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6303   if(opcode[i]==2) // J
6304   {
6305     unconditional=1;
6306   }
6307   if(opcode[i]==3) // JAL
6308   {
6309     // TODO: mini_ht
6310     int rt=get_reg(i_regs->regmap,31);
6311     emit_movimm(start+i*4+8,rt);
6312     unconditional=1;
6313   }
6314   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6315   {
6316     emit_mov(s1l,addr);
6317     if(opcode2[i]==9) // JALR
6318     {
6319       int rt=get_reg(i_regs->regmap,rt1[i]);
6320       emit_movimm(start+i*4+8,rt);
6321     }
6322   }
6323   if((opcode[i]&0x3f)==4) // BEQ
6324   {
6325     if(rs1[i]==rs2[i])
6326     {
6327       unconditional=1;
6328     }
6329     else
6330     #ifdef HAVE_CMOV_IMM
6331     if(s1h<0) {
6332       if(s2l>=0) emit_cmp(s1l,s2l);
6333       else emit_test(s1l,s1l);
6334       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6335     }
6336     else
6337     #endif
6338     {
6339       assert(s1l>=0);
6340       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6341       if(s1h>=0) {
6342         if(s2h>=0) emit_cmp(s1h,s2h);
6343         else emit_test(s1h,s1h);
6344         emit_cmovne_reg(alt,addr);
6345       }
6346       if(s2l>=0) emit_cmp(s1l,s2l);
6347       else emit_test(s1l,s1l);
6348       emit_cmovne_reg(alt,addr);
6349     }
6350   }
6351   if((opcode[i]&0x3f)==5) // BNE
6352   {
6353     #ifdef HAVE_CMOV_IMM
6354     if(s1h<0) {
6355       if(s2l>=0) emit_cmp(s1l,s2l);
6356       else emit_test(s1l,s1l);
6357       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6358     }
6359     else
6360     #endif
6361     {
6362       assert(s1l>=0);
6363       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6364       if(s1h>=0) {
6365         if(s2h>=0) emit_cmp(s1h,s2h);
6366         else emit_test(s1h,s1h);
6367         emit_cmovne_reg(alt,addr);
6368       }
6369       if(s2l>=0) emit_cmp(s1l,s2l);
6370       else emit_test(s1l,s1l);
6371       emit_cmovne_reg(alt,addr);
6372     }
6373   }
6374   if((opcode[i]&0x3f)==0x14) // BEQL
6375   {
6376     if(s1h>=0) {
6377       if(s2h>=0) emit_cmp(s1h,s2h);
6378       else emit_test(s1h,s1h);
6379       nottaken=(int)out;
6380       emit_jne(0);
6381     }
6382     if(s2l>=0) emit_cmp(s1l,s2l);
6383     else emit_test(s1l,s1l);
6384     if(nottaken) set_jump_target(nottaken,(int)out);
6385     nottaken=(int)out;
6386     emit_jne(0);
6387   }
6388   if((opcode[i]&0x3f)==0x15) // BNEL
6389   {
6390     if(s1h>=0) {
6391       if(s2h>=0) emit_cmp(s1h,s2h);
6392       else emit_test(s1h,s1h);
6393       taken=(int)out;
6394       emit_jne(0);
6395     }
6396     if(s2l>=0) emit_cmp(s1l,s2l);
6397     else emit_test(s1l,s1l);
6398     nottaken=(int)out;
6399     emit_jeq(0);
6400     if(taken) set_jump_target(taken,(int)out);
6401   }
6402   if((opcode[i]&0x3f)==6) // BLEZ
6403   {
6404     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6405     emit_cmpimm(s1l,1);
6406     if(s1h>=0) emit_mov(addr,ntaddr);
6407     emit_cmovl_reg(alt,addr);
6408     if(s1h>=0) {
6409       emit_test(s1h,s1h);
6410       emit_cmovne_reg(ntaddr,addr);
6411       emit_cmovs_reg(alt,addr);
6412     }
6413   }
6414   if((opcode[i]&0x3f)==7) // BGTZ
6415   {
6416     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6417     emit_cmpimm(s1l,1);
6418     if(s1h>=0) emit_mov(addr,alt);
6419     emit_cmovl_reg(ntaddr,addr);
6420     if(s1h>=0) {
6421       emit_test(s1h,s1h);
6422       emit_cmovne_reg(alt,addr);
6423       emit_cmovs_reg(ntaddr,addr);
6424     }
6425   }
6426   if((opcode[i]&0x3f)==0x16) // BLEZL
6427   {
6428     assert((opcode[i]&0x3f)!=0x16);
6429   }
6430   if((opcode[i]&0x3f)==0x17) // BGTZL
6431   {
6432     assert((opcode[i]&0x3f)!=0x17);
6433   }
6434   assert(opcode[i]!=1); // BLTZ/BGEZ
6435
6436   //FIXME: Check CSREG
6437   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6438     if((source[i]&0x30000)==0) // BC1F
6439     {
6440       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6441       emit_testimm(s1l,0x800000);
6442       emit_cmovne_reg(alt,addr);
6443     }
6444     if((source[i]&0x30000)==0x10000) // BC1T
6445     {
6446       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6447       emit_testimm(s1l,0x800000);
6448       emit_cmovne_reg(alt,addr);
6449     }
6450     if((source[i]&0x30000)==0x20000) // BC1FL
6451     {
6452       emit_testimm(s1l,0x800000);
6453       nottaken=(int)out;
6454       emit_jne(0);
6455     }
6456     if((source[i]&0x30000)==0x30000) // BC1TL
6457     {
6458       emit_testimm(s1l,0x800000);
6459       nottaken=(int)out;
6460       emit_jeq(0);
6461     }
6462   }
6463
6464   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6465   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6466   if(likely[i]||unconditional)
6467   {
6468     emit_movimm(ba[i],HOST_BTREG);
6469   }
6470   else if(addr!=HOST_BTREG)
6471   {
6472     emit_mov(addr,HOST_BTREG);
6473   }
6474   void *branch_addr=out;
6475   emit_jmp(0);
6476   int target_addr=start+i*4+5;
6477   void *stub=out;
6478   void *compiled_target_addr=check_addr(target_addr);
6479   emit_extjump_ds((int)branch_addr,target_addr);
6480   if(compiled_target_addr) {
6481     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6482     add_link(target_addr,stub);
6483   }
6484   else set_jump_target((int)branch_addr,(int)stub);
6485   if(likely[i]) {
6486     // Not-taken path
6487     set_jump_target((int)nottaken,(int)out);
6488     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6489     void *branch_addr=out;
6490     emit_jmp(0);
6491     int target_addr=start+i*4+8;
6492     void *stub=out;
6493     void *compiled_target_addr=check_addr(target_addr);
6494     emit_extjump_ds((int)branch_addr,target_addr);
6495     if(compiled_target_addr) {
6496       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6497       add_link(target_addr,stub);
6498     }
6499     else set_jump_target((int)branch_addr,(int)stub);
6500   }
6501 }
6502
6503 // Assemble the delay slot for the above
6504 static void pagespan_ds()
6505 {
6506   assem_debug("initial delay slot:\n");
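  // The entry is registered under start+1; the odd address keeps it distinct
  // from a normal block entry at start.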
6507   u_int vaddr=start+1;
6508   u_int page=get_page(vaddr);
6509   u_int vpage=get_vpage(vaddr);
6510   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6511   do_dirty_stub_ds();
6512   ll_add(jump_in+page,vaddr,(void *)out);
6513   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6514   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6515     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6516   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6517     emit_writeword(HOST_BTREG,(int)&branch_target);
6518   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6519   address_generation(0,&regs[0],regs[0].regmap_entry);
6520   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6521     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6522   cop1_usable=0;
6523   is_delayslot=0;
6524   switch(itype[0]) {
6525     case ALU:
6526       alu_assemble(0,&regs[0]);break;
6527     case IMM16:
6528       imm16_assemble(0,&regs[0]);break;
6529     case SHIFT:
6530       shift_assemble(0,&regs[0]);break;
6531     case SHIFTIMM:
6532       shiftimm_assemble(0,&regs[0]);break;
6533     case LOAD:
6534       load_assemble(0,&regs[0]);break;
6535     case LOADLR:
6536       loadlr_assemble(0,&regs[0]);break;
6537     case STORE:
6538       store_assemble(0,&regs[0]);break;
6539     case STORELR:
6540       storelr_assemble(0,&regs[0]);break;
6541     case COP0:
6542       cop0_assemble(0,&regs[0]);break;
6543     case COP1:
6544       cop1_assemble(0,&regs[0]);break;
6545     case C1LS:
6546       c1ls_assemble(0,&regs[0]);break;
6547     case COP2:
6548       cop2_assemble(0,&regs[0]);break;
6549     case C2LS:
6550       c2ls_assemble(0,&regs[0]);break;
6551     case C2OP:
6552       c2op_assemble(0,&regs[0]);break;
6553     case FCONV:
6554       fconv_assemble(0,&regs[0]);break;
6555     case FLOAT:
6556       float_assemble(0,&regs[0]);break;
6557     case FCOMP:
6558       fcomp_assemble(0,&regs[0]);break;
6559     case MULTDIV:
6560       multdiv_assemble(0,&regs[0]);break;
6561     case MOV:
6562       mov_assemble(0,&regs[0]);break;
6563     case SYSCALL:
6564     case HLECALL:
6565     case INTCALL:
6566     case SPAN:
6567     case UJUMP:
6568     case RJUMP:
6569     case CJUMP:
6570     case SJUMP:
6571     case FJUMP:
6572       printf("Jump in the delay slot.  This is probably a bug.\n");
6573   }
6574   int btaddr=get_reg(regs[0].regmap,BTREG);
6575   if(btaddr<0) {
6576     btaddr=get_reg(regs[0].regmap,-1);
6577     emit_readword((int)&branch_target,btaddr);
6578   }
6579   assert(btaddr!=HOST_CCREG);
6580   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6581 #ifdef HOST_IMM8
6582   emit_movimm(start+4,HOST_TEMPREG);
6583   emit_cmp(btaddr,HOST_TEMPREG);
6584 #else
6585   emit_cmpimm(btaddr,start+4);
6586 #endif
6587   int branch=(int)out;
6588   emit_jeq(0);
6589   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6590   emit_jmp(jump_vaddr_reg[btaddr]);
6591   set_jump_target(branch,(int)out);
6592   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6593   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6594 }
6595
6596 // Basic liveness analysis for MIPS registers
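// unneeded_reg[]/unneeded_reg_upper[] are per-instruction bitmasks over the
// MIPS registers: a set bit means the lower/upper 32 bits of that register
// are dead at that point (overwritten before they are read again), so the
// value doesn't have to be preserved.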
6597 void unneeded_registers(int istart,int iend,int r)
6598 {
6599   int i;
6600   uint64_t u,uu,b,bu;
6601   uint64_t temp_u,temp_uu;
6602   uint64_t tdep;
6603   if(iend==slen-1) {
6604     u=1;uu=1;
6605   }else{
6606     u=unneeded_reg[iend+1];
6607     uu=unneeded_reg_upper[iend+1];
6608     u=1;uu=1;
6609   }
6610   for (i=iend;i>=istart;i--)
6611   {
6612     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6613     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6614     {
6615       // If subroutine call, flag return address as a possible branch target
6616       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6617       
6618       if(ba[i]<start || ba[i]>=(start+slen*4))
6619       {
6620         // Branch out of this block, flush all regs
6621         u=1;
6622         uu=1;
6623         /* Hexagon hack 
6624         if(itype[i]==UJUMP&&rt1[i]==31)
6625         {
6626           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6627         }
6628         if(itype[i]==RJUMP&&rs1[i]==31)
6629         {
6630           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6631         }
6632         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6633           if(itype[i]==UJUMP&&rt1[i]==31)
6634           {
6635             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6636             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6637           }
6638           if(itype[i]==RJUMP&&rs1[i]==31)
6639           {
6640             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6641             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6642           }
6643         }*/
6644         branch_unneeded_reg[i]=u;
6645         branch_unneeded_reg_upper[i]=uu;
6646         // Merge in delay slot
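        // tdep is 1 when the upper half of the delay-slot target rt1[i+1]
        // is still needed downstream (its bit is clear in uu); in that case
        // the extension sources dep1/dep2 of that instruction must keep
        // their upper halves live too, hence the uu&=~(tdep<<dep...) below.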
6647         tdep=(~uu>>rt1[i+1])&1;
6648         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6649         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6650         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6651         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6652         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6653         u|=1;uu|=1;
6654         // If branch is "likely" (and conditional)
6655         // then we skip the delay slot on the fall-thru path
6656         if(likely[i]) {
6657           if(i<slen-1) {
6658             u&=unneeded_reg[i+2];
6659             uu&=unneeded_reg_upper[i+2];
6660           }
6661           else
6662           {
6663             u=1;
6664             uu=1;
6665           }
6666         }
6667       }
6668       else
6669       {
6670         // Internal branch, flag target
6671         bt[(ba[i]-start)>>2]=1;
6672         if(ba[i]<=start+i*4) {
6673           // Backward branch
6674           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6675           {
6676             // Unconditional branch
6677             temp_u=1;temp_uu=1;
6678           } else {
6679             // Conditional branch (not taken case)
6680             temp_u=unneeded_reg[i+2];
6681             temp_uu=unneeded_reg_upper[i+2];
6682           }
6683           // Merge in delay slot
6684           tdep=(~temp_uu>>rt1[i+1])&1;
6685           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6686           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6687           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6688           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6689           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6690           temp_u|=1;temp_uu|=1;
6691           // If branch is "likely" (and conditional)
6692           // then we skip the delay slot on the fall-thru path
6693           if(likely[i]) {
6694             if(i<slen-1) {
6695               temp_u&=unneeded_reg[i+2];
6696               temp_uu&=unneeded_reg_upper[i+2];
6697             }
6698             else
6699             {
6700               temp_u=1;
6701               temp_uu=1;
6702             }
6703           }
6704           tdep=(~temp_uu>>rt1[i])&1;
6705           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6706           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6707           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6708           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6709           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6710           temp_u|=1;temp_uu|=1;
6711           unneeded_reg[i]=temp_u;
6712           unneeded_reg_upper[i]=temp_uu;
6713           // Only go three levels deep.  This recursion can take an
6714           // excessive amount of time if there are a lot of nested loops.
6715           if(r<2) {
6716             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6717           }else{
6718             unneeded_reg[(ba[i]-start)>>2]=1;
6719             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6720           }
6721         } /*else*/ if(1) {
6722           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6723           {
6724             // Unconditional branch
6725             u=unneeded_reg[(ba[i]-start)>>2];
6726             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6727             branch_unneeded_reg[i]=u;
6728             branch_unneeded_reg_upper[i]=uu;
6729         //u=1;
6730         //uu=1;
6731         //branch_unneeded_reg[i]=u;
6732         //branch_unneeded_reg_upper[i]=uu;
6733             // Merge in delay slot
6734             tdep=(~uu>>rt1[i+1])&1;
6735             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6736             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6737             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6738             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6739             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6740             u|=1;uu|=1;
6741           } else {
6742             // Conditional branch
6743             b=unneeded_reg[(ba[i]-start)>>2];
6744             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6745             branch_unneeded_reg[i]=b;
6746             branch_unneeded_reg_upper[i]=bu;
6747         //b=1;
6748         //bu=1;
6749         //branch_unneeded_reg[i]=b;
6750         //branch_unneeded_reg_upper[i]=bu;
6751             // Branch delay slot
6752             tdep=(~uu>>rt1[i+1])&1;
6753             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6754             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6755             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6756             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6757             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6758             b|=1;bu|=1;
6759             // If branch is "likely" then we skip the
6760             // delay slot on the fall-thru path
6761             if(likely[i]) {
6762               u=b;
6763               uu=bu;
6764               if(i<slen-1) {
6765                 u&=unneeded_reg[i+2];
6766                 uu&=unneeded_reg_upper[i+2];
6767         //u=1;
6768         //uu=1;
6769               }
6770             } else {
6771               u&=b;
6772               uu&=bu;
6773         //u=1;
6774         //uu=1;
6775             }
6776             if(i<slen-1) {
6777               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6778               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6779         //branch_unneeded_reg[i]=1;
6780         //branch_unneeded_reg_upper[i]=1;
6781             } else {
6782               branch_unneeded_reg[i]=1;
6783               branch_unneeded_reg_upper[i]=1;
6784             }
6785           }
6786         }
6787       }
6788     }
6789     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6790     {
6791       // SYSCALL instruction (software interrupt)
6792       u=1;
6793       uu=1;
6794     }
6795     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6796     {
6797       // ERET instruction (return from interrupt)
6798       u=1;
6799       uu=1;
6800     }
6801     //u=uu=1; // DEBUG
6802     tdep=(~uu>>rt1[i])&1;
6803     // Written registers are unneeded
6804     u|=1LL<<rt1[i];
6805     u|=1LL<<rt2[i];
6806     uu|=1LL<<rt1[i];
6807     uu|=1LL<<rt2[i];
6808     // Accessed registers are needed
6809     u&=~(1LL<<rs1[i]);
6810     u&=~(1LL<<rs2[i]);
6811     uu&=~(1LL<<us1[i]);
6812     uu&=~(1LL<<us2[i]);
6813     // Source-target dependencies
6814     uu&=~(tdep<<dep1[i]);
6815     uu&=~(tdep<<dep2[i]);
6816     // R0 is always unneeded
6817     u|=1;uu|=1;
6818     // Save it
6819     unneeded_reg[i]=u;
6820     unneeded_reg_upper[i]=uu;
6821     /*
6822     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6823     printf("U:");
6824     int r;
6825     for(r=1;r<=CCREG;r++) {
6826       if((unneeded_reg[i]>>r)&1) {
6827         if(r==HIREG) printf(" HI");
6828         else if(r==LOREG) printf(" LO");
6829         else printf(" r%d",r);
6830       }
6831     }
6832     printf(" UU:");
6833     for(r=1;r<=CCREG;r++) {
6834       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6835         if(r==HIREG) printf(" HI");
6836         else if(r==LOREG) printf(" LO");
6837         else printf(" r%d",r);
6838       }
6839     }
6840     printf("\n");*/
6841   }
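// On FORCE32 builds (32-bit-only target, as used for the PSX R3000) there
// are no 64-bit GPR halves to track, so every upper half is simply marked
// unneeded.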
6842 #ifdef FORCE32
6843   for (i=iend;i>=istart;i--)
6844   {
6845     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6846   }
6847 #endif
6848 }
6849
6850 // Identify registers which are likely to contain 32-bit values
6851 // This is used to predict whether any branches will jump to a
6852 // location with 64-bit values in registers.
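// Roughly: is32 carries one bit per MIPS register, set when that register is
// provisionally believed to hold a 32-bit (sign-extended) value at this
// point; bit 0 is always set and HI/LO are tracked via the HIREG/LOREG bits.
// The per-instruction snapshot is stored in p32[i], which the bt[i]
// branch-target merge above consults for incoming edges.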
6853 static void provisional_32bit()
6854 {
6855   int i,j;
6856   uint64_t is32=1;
6857   uint64_t lastbranch=1;
6858   
6859   for(i=0;i<slen;i++)
6860   {
6861     if(i>0) {
6862       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
6863         if(i>1) is32=lastbranch;
6864         else is32=1;
6865       }
6866     }
6867     if(i>1)
6868     {
6869       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
6870         if(likely[i-2]) {
6871           if(i>2) is32=lastbranch;
6872           else is32=1;
6873         }
6874       }
6875       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
6876       {
6877         if(rs1[i-2]==0||rs2[i-2]==0)
6878         {
6879           if(rs1[i-2]) {
6880             is32|=1LL<<rs1[i-2];
6881           }
6882           if(rs2[i-2]) {
6883             is32|=1LL<<rs2[i-2];
6884           }
6885         }
6886       }
6887     }
6888     // If something jumps here with 64-bit values
6889     // then promote those registers to 64 bits
6890     if(bt[i])
6891     {
6892       uint64_t temp_is32=is32;
6893       for(j=i-1;j>=0;j--)
6894       {
6895         if(ba[j]==start+i*4) 
6896           //temp_is32&=branch_regs[j].is32;
6897           temp_is32&=p32[j];
6898       }
6899       for(j=i;j<slen;j++)
6900       {
6901         if(ba[j]==start+i*4) 
6902           temp_is32=1;
6903       }
6904       is32=temp_is32;
6905     }
6906     int type=itype[i];
6907     int op=opcode[i];
6908     int op2=opcode2[i];
6909     int rt=rt1[i];
6910     int s1=rs1[i];
6911     int s2=rs2[i];
6912     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
6913       // Branches don't write registers; consider the delay slot instead.
6914       type=itype[i+1];
6915       op=opcode[i+1];
6916       op2=opcode2[i+1];
6917       rt=rt1[i+1];
6918       s1=rs1[i+1];
6919       s2=rs2[i+1];
6920       lastbranch=is32;
6921     }
6922     switch(type) {
6923       case LOAD:
6924         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
6925            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
6926           is32&=~(1LL<<rt);
6927         else
6928           is32|=1LL<<rt;
6929         break;
6930       case STORE:
6931       case STORELR:
6932         break;
6933       case LOADLR:
6934         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
6935         if(op==0x22) is32|=1LL<<rt; // LWL
6936         break;
6937       case IMM16:
6938         if (op==0x08||op==0x09|| // ADDI/ADDIU
6939             op==0x0a||op==0x0b|| // SLTI/SLTIU
6940             op==0x0c|| // ANDI
6941             op==0x0f)  // LUI
6942         {
6943           is32|=1LL<<rt;
6944         }
6945         if(op==0x18||op==0x19) { // DADDI/DADDIU
6946           is32&=~(1LL<<rt);
6947           //if(imm[i]==0)
6948           //  is32|=((is32>>s1)&1LL)<<rt;
6949         }
6950         if(op==0x0d||op==0x0e) { // ORI/XORI
6951           uint64_t sr=((is32>>s1)&1LL);
6952           is32&=~(1LL<<rt);
6953           is32|=sr<<rt;
6954         }
6955         break;
6956       case UJUMP:
6957         break;
6958       case RJUMP:
6959         break;
6960       case CJUMP:
6961         break;
6962       case SJUMP:
6963         break;
6964       case FJUMP:
6965         break;
6966       case ALU:
6967         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
6968           is32|=1LL<<rt;
6969         }
6970         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
6971           is32|=1LL<<rt;
6972         }
6973         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
6974           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
6975           is32&=~(1LL<<rt);
6976           is32|=sr<<rt;
6977         }
6978         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
6979           if(s1==0&&s2==0) {
6980             is32|=1LL<<rt;
6981           }
6982           else if(s2==0) {
6983             uint64_t sr=((is32>>s1)&1LL);
6984             is32&=~(1LL<<rt);
6985             is32|=sr<<rt;
6986           }
6987           else if(s1==0) {
6988             uint64_t sr=((is32>>s2)&1LL);
6989             is32&=~(1LL<<rt);
6990             is32|=sr<<rt;
6991           }
6992           else {
6993             is32&=~(1LL<<rt);
6994           }
6995         }
6996         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
6997           if(s1==0&&s2==0) {
6998             is32|=1LL<<rt;
6999           }
7000           else if(s2==0) {
7001             uint64_t sr=((is32>>s1)&1LL);
7002             is32&=~(1LL<<rt);
7003             is32|=sr<<rt;
7004           }
7005           else {
7006             is32&=~(1LL<<rt);
7007           }
7008         }
7009         break;
7010       case MULTDIV:
7011         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7012           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
7013         }
7014         else {
7015           is32|=(1LL<<HIREG)|(1LL<<LOREG);
7016         }
7017         break;
7018       case MOV:
7019         {
7020           uint64_t sr=((is32>>s1)&1LL);
7021           is32&=~(1LL<<rt);
7022           is32|=sr<<rt;
7023         }
7024         break;
7025       case SHIFT:
7026         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
7027         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
7028         break;
7029       case SHIFTIMM:
7030         is32|=1LL<<rt;
7031         // DSLL/DSRL/DSRA/DSLL32/DSRL32 but not DSRA32 have 64-bit result
7032         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
7033         break;
7034       case COP0:
7035         if(op2==0) is32|=1LL<<rt; // MFC0
7036         break;
7037       case COP1:
7038       case COP2:
7039         if(op2==0) is32|=1LL<<rt; // MFC1
7040         if(op2==1) is32&=~(1LL<<rt); // DMFC1
7041         if(op2==2) is32|=1LL<<rt; // CFC1
7042         break;
7043       case C1LS:
7044       case C2LS:
7045         break;
7046       case FLOAT:
7047       case FCONV:
7048         break;
7049       case FCOMP:
7050         break;
7051       case C2OP:
7052       case SYSCALL:
7053       case HLECALL:
7054         break;
7055       default:
7056         break;
7057     }
7058     is32|=1;
7059     p32[i]=is32;
7060
7061     if(i>0)
7062     {
7063       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
7064       {
7065         if(rt1[i-1]==31) // JAL/JALR
7066         {
7067           // Subroutine call will return here, don't alloc any registers
7068           is32=1;
7069         }
7070         else if(i+1<slen)
7071         {
7072           // Internal branch will jump here, match registers to caller
7073           is32=0x3FFFFFFFFLL;
7074         }
7075       }
7076     }
7077   }
7078 }
7079
7080 // Identify registers which may be assumed to contain 32-bit values
7081 // and where optimizations will rely on this.
7082 // This is used to determine whether backward branches can safely
7083 // jump to a location with 64-bit values in registers.
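// pr32[i] is the resulting per-instruction mask (the commented-out name
// requires_32bit): bit r set roughly means something at or after instruction
// i relies on register r still holding a 32-bit value there.  The scan runs
// backwards, clearing bits for registers that are rewritten (rt1/rt2) and
// re-adding bits for sources that are read while 32-bit (us1/us2, dep1/dep2).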
7084 static void provisional_r32()
7085 {
7086   u_int r32=0;
7087   int i;
7088   
7089   for (i=slen-1;i>=0;i--)
7090   {
7091     int hr;
7092     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7093     {
7094       if(ba[i]<start || ba[i]>=(start+slen*4))
7095       {
7096         // Branch out of this block, don't need anything
7097         r32=0;
7098       }
7099       else
7100       {
7101         // Internal branch
7102         // Need whatever matches the target
7103         // (and doesn't get overwritten by the delay slot instruction)
7104         r32=0;
7105         int t=(ba[i]-start)>>2;
7106         if(ba[i]>start+i*4) {
7107           // Forward branch
7108           //if(!(requires_32bit[t]&~regs[i].was32))
7109           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7110           if(!(pr32[t]&~regs[i].was32))
7111             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7112         }else{
7113           // Backward branch
7114           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
7115             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7116         }
7117       }
7118       // Conditional branch may need registers for following instructions
7119       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7120       {
7121         if(i<slen-2) {
7122           //r32|=requires_32bit[i+2];
7123           r32|=pr32[i+2];
7124           r32&=regs[i].was32;
7125           // Mark this address as a branch target since it may be called
7126           // upon return from interrupt
7127           //bt[i+2]=1;
7128         }
7129       }
7130       // Merge in delay slot
7131       if(!likely[i]) {
7132         // These are overwritten unless the branch is "likely"
7133         // and the delay slot is nullified if not taken
7134         r32&=~(1LL<<rt1[i+1]);
7135         r32&=~(1LL<<rt2[i+1]);
7136       }
7137       // Assume these are needed (delay slot)
7138       if(us1[i+1]>0)
7139       {
7140         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
7141       }
7142       if(us2[i+1]>0)
7143       {
7144         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
7145       }
7146       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
7147       {
7148         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
7149       }
7150       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
7151       {
7152         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
7153       }
7154     }
7155     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7156     {
7157       // SYSCALL instruction (software interrupt)
7158       r32=0;
7159     }
7160     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7161     {
7162       // ERET instruction (return from interrupt)
7163       r32=0;
7164     }
7165     // Check 32 bits
7166     r32&=~(1LL<<rt1[i]);
7167     r32&=~(1LL<<rt2[i]);
7168     if(us1[i]>0)
7169     {
7170       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
7171     }
7172     if(us2[i]>0)
7173     {
7174       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
7175     }
7176     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
7177     {
7178       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
7179     }
7180     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
7181     {
7182       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
7183     }
7184     //requires_32bit[i]=r32;
7185     pr32[i]=r32;
7186     
7187     // Dirty registers which are 32-bit require 32-bit input,
7188     // as they will be written as 32-bit values
7189     for(hr=0;hr<HOST_REGS;hr++)
7190     {
7191       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
7192         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
7193           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
7194           pr32[i]|=1LL<<regs[i].regmap_entry[hr];
7195           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
7196         }
7197       }
7198     }
7199   }
7200 }
7201
7202 // Write back dirty registers as soon as we will no longer modify them,
7203 // so that we don't end up with lots of writes at the branches.
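// will_dirty[i] / wont_dirty[i] are bitmasks over host registers (1<<hr),
// matched to guest registers via regs[i].regmap[hr]&63 and computed by a
// backward scan that merges branch targets and delay slots much like
// unneeded_registers above.  When wr==0 only the masks are computed (used
// when recursing into backward branch targets); when wr!=0 they are also
// folded into regs[i].dirty / branch_regs[i].dirty and the wasdirty state
// below.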
7204 void clean_registers(int istart,int iend,int wr)
7205 {
7206   int i;
7207   int r;
7208   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
7209   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
7210   if(iend==slen-1) {
7211     will_dirty_i=will_dirty_next=0;
7212     wont_dirty_i=wont_dirty_next=0;
7213   }else{
7214     will_dirty_i=will_dirty_next=will_dirty[iend+1];
7215     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
7216   }
7217   for (i=iend;i>=istart;i--)
7218   {
7219     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7220     {
7221       if(ba[i]<start || ba[i]>=(start+slen*4))
7222       {
7223         // Branch out of this block, flush all regs
7224         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7225         {
7226           // Unconditional branch
7227           will_dirty_i=0;
7228           wont_dirty_i=0;
7229           // Merge in delay slot (will dirty)
7230           for(r=0;r<HOST_REGS;r++) {
7231             if(r!=EXCLUDE_REG) {
7232               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7233               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7234               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7235               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7236               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7237               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7238               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7239               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7240               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7241               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7242               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7243               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7244               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7245               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7246             }
7247           }
7248         }
7249         else
7250         {
7251           // Conditional branch
7252           will_dirty_i=0;
7253           wont_dirty_i=wont_dirty_next;
7254           // Merge in delay slot (will dirty)
7255           for(r=0;r<HOST_REGS;r++) {
7256             if(r!=EXCLUDE_REG) {
7257               if(!likely[i]) {
7258                 // Might not dirty if likely branch is not taken
7259                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7260                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7261                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7262                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7263                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7264                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
7265                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7266                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7267                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7268                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7269                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7270                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7271                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7272                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7273               }
7274             }
7275           }
7276         }
7277         // Merge in delay slot (won't dirty)
7278         for(r=0;r<HOST_REGS;r++) {
7279           if(r!=EXCLUDE_REG) {
7280             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7281             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7282             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7283             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7284             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7285             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7286             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7287             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7288             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7289             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7290           }
7291         }
7292         if(wr) {
7293           #ifndef DESTRUCTIVE_WRITEBACK
7294           branch_regs[i].dirty&=wont_dirty_i;
7295           #endif
7296           branch_regs[i].dirty|=will_dirty_i;
7297         }
7298       }
7299       else
7300       {
7301         // Internal branch
7302         if(ba[i]<=start+i*4) {
7303           // Backward branch
7304           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7305           {
7306             // Unconditional branch
7307             temp_will_dirty=0;
7308             temp_wont_dirty=0;
7309             // Merge in delay slot (will dirty)
7310             for(r=0;r<HOST_REGS;r++) {
7311               if(r!=EXCLUDE_REG) {
7312                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7313                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7314                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7315                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7316                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7317                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7318                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7319                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7320                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7321                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7322                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7323                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7324                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7325                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7326               }
7327             }
7328           } else {
7329             // Conditional branch (not taken case)
7330             temp_will_dirty=will_dirty_next;
7331             temp_wont_dirty=wont_dirty_next;
7332             // Merge in delay slot (will dirty)
7333             for(r=0;r<HOST_REGS;r++) {
7334               if(r!=EXCLUDE_REG) {
7335                 if(!likely[i]) {
7336                   // Will not dirty if likely branch is not taken
7337                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7338                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7339                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7340                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7341                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7342                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7343                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7344                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7345                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7346                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7347                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7348                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7349                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7350                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7351                 }
7352               }
7353             }
7354           }
7355           // Merge in delay slot (won't dirty)
7356           for(r=0;r<HOST_REGS;r++) {
7357             if(r!=EXCLUDE_REG) {
7358               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7359               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7360               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7361               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7362               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7363               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7364               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7365               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7366               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7367               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7368             }
7369           }
7370           // Deal with changed mappings
7371           if(i<iend) {
7372             for(r=0;r<HOST_REGS;r++) {
7373               if(r!=EXCLUDE_REG) {
7374                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7375                   temp_will_dirty&=~(1<<r);
7376                   temp_wont_dirty&=~(1<<r);
7377                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7378                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7379                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7380                   } else {
7381                     temp_will_dirty|=1<<r;
7382                     temp_wont_dirty|=1<<r;
7383                   }
7384                 }
7385               }
7386             }
7387           }
7388           if(wr) {
7389             will_dirty[i]=temp_will_dirty;
7390             wont_dirty[i]=temp_wont_dirty;
7391             clean_registers((ba[i]-start)>>2,i-1,0);
7392           }else{
7393             // Limit recursion.  It can take an excessive amount
7394             // of time if there are a lot of nested loops.
7395             will_dirty[(ba[i]-start)>>2]=0;
7396             wont_dirty[(ba[i]-start)>>2]=-1;
7397           }
7398         }
7399         /*else*/ if(1)
7400         {
7401           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7402           {
7403             // Unconditional branch
7404             will_dirty_i=0;
7405             wont_dirty_i=0;
7406           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7407             for(r=0;r<HOST_REGS;r++) {
7408               if(r!=EXCLUDE_REG) {
7409                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7410                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7411                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7412                 }
7413               }
7414             }
7415           //}
7416             // Merge in delay slot
7417             for(r=0;r<HOST_REGS;r++) {
7418               if(r!=EXCLUDE_REG) {
7419                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7420                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7421                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7422                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7423                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7424                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7425                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7426                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7427                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7428                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7429                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7430                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7431                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7432                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7433               }
7434             }
7435           } else {
7436             // Conditional branch
7437             will_dirty_i=will_dirty_next;
7438             wont_dirty_i=wont_dirty_next;
7439           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7440             for(r=0;r<HOST_REGS;r++) {
7441               if(r!=EXCLUDE_REG) {
7442                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7443                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7444                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7445                 }
7446                 else
7447                 {
7448                   will_dirty_i&=~(1<<r);
7449                 }
7450                 // Treat delay slot as part of branch too
7451                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7452                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7453                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7454                 }
7455                 else
7456                 {
7457                   will_dirty[i+1]&=~(1<<r);
7458                 }*/
7459               }
7460             }
7461           //}
7462             // Merge in delay slot
7463             for(r=0;r<HOST_REGS;r++) {
7464               if(r!=EXCLUDE_REG) {
7465                 if(!likely[i]) {
7466                   // Might not dirty if likely branch is not taken
7467                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7468                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7469                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7470                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7471                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7472                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7473                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7474                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7475                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7476                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7477                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7478                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7479                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7480                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7481                 }
7482               }
7483             }
7484           }
7485           // Merge in delay slot
7486           for(r=0;r<HOST_REGS;r++) {
7487             if(r!=EXCLUDE_REG) {
7488               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7489               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7490               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7491               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7492               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7493               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7494               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7495               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7496               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7497               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7498             }
7499           }
7500           if(wr) {
7501             #ifndef DESTRUCTIVE_WRITEBACK
7502             branch_regs[i].dirty&=wont_dirty_i;
7503             #endif
7504             branch_regs[i].dirty|=will_dirty_i;
7505           }
7506         }
7507       }
7508     }
7509     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7510     {
7511       // SYSCALL instruction (software interrupt)
7512       will_dirty_i=0;
7513       wont_dirty_i=0;
7514     }
7515     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7516     {
7517       // ERET instruction (return from interrupt)
7518       will_dirty_i=0;
7519       wont_dirty_i=0;
7520     }
7521     will_dirty_next=will_dirty_i;
7522     wont_dirty_next=wont_dirty_i;
7523     for(r=0;r<HOST_REGS;r++) {
7524       if(r!=EXCLUDE_REG) {
7525         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7526         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7527         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7528         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7529         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7530         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7531         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7532         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7533         if(i>istart) {
7534           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7535           {
7536             // Don't store a register immediately after writing it,
7537             // as doing so may prevent dual-issue.
7538             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7539             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7540           }
7541         }
7542       }
7543     }
7544     // Save it
7545     will_dirty[i]=will_dirty_i;
7546     wont_dirty[i]=wont_dirty_i;
7547     // Mark registers that won't be dirtied as not dirty
7548     if(wr) {
7549       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7550       for(r=0;r<HOST_REGS;r++) {
7551         if((will_dirty_i>>r)&1) {
7552           printf(" r%d",r);
7553         }
7554       }
7555       printf("\n");*/
7556
7557       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7558         regs[i].dirty|=will_dirty_i;
7559         #ifndef DESTRUCTIVE_WRITEBACK
7560         regs[i].dirty&=wont_dirty_i;
7561         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7562         {
7563           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7564             for(r=0;r<HOST_REGS;r++) {
7565               if(r!=EXCLUDE_REG) {
7566                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7567                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7568                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7569               }
7570             }
7571           }
7572         }
7573         else
7574         {
7575           if(i<iend) {
7576             for(r=0;r<HOST_REGS;r++) {
7577               if(r!=EXCLUDE_REG) {
7578                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7579                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7580                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7581               }
7582             }
7583           }
7584         }
7585         #endif
7586       //}
7587     }
7588     // Deal with changed mappings
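    // If instruction i changed which host register holds a guest register,
    // the dirty-tracking bits have to follow: the will/wont bits computed
    // for the register's new home (found with get_reg() below) are copied
    // back to the host register that held it before, and registers whose
    // guest value is unneeded anyway are treated as don't-care.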
7589     temp_will_dirty=will_dirty_i;
7590     temp_wont_dirty=wont_dirty_i;
7591     for(r=0;r<HOST_REGS;r++) {
7592       if(r!=EXCLUDE_REG) {
7593         int nr;
7594         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7595           if(wr) {
7596             #ifndef DESTRUCTIVE_WRITEBACK
7597             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7598             #endif
7599             regs[i].wasdirty|=will_dirty_i&(1<<r);
7600           }
7601         }
7602         else if((nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7603           // Register moved to a different register
7604           will_dirty_i&=~(1<<r);
7605           wont_dirty_i&=~(1<<r);
7606           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7607           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7608           if(wr) {
7609             #ifndef DESTRUCTIVE_WRITEBACK
7610             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7611             #endif
7612             regs[i].wasdirty|=will_dirty_i&(1<<r);
7613           }
7614         }
7615         else {
7616           will_dirty_i&=~(1<<r);
7617           wont_dirty_i&=~(1<<r);
7618           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7619             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7620             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7621           } else {
7622             wont_dirty_i|=1<<r;
7623             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);*/ /*assert(!((will_dirty>>r)&1));*/
7624           }
7625         }
7626       }
7627     }
7628   }
7629 }
7630
7631   /* disassembly */
7632 void disassemble_inst(int i)
7633 {
7634     if (bt[i]) printf("*"); else printf(" ");
7635     switch(itype[i]) {
7636       case UJUMP:
7637         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7638       case CJUMP:
7639         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7640       case SJUMP:
7641         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7642       case FJUMP:
7643         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7644       case RJUMP:
7645         if (opcode[i]==0x9&&rt1[i]!=31)
7646           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
7647         else
7648           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7649         break;
7650       case SPAN:
7651         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7652       case IMM16:
7653         if(opcode[i]==0xf) //LUI
7654           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7655         else
7656           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7657         break;
7658       case LOAD:
7659       case LOADLR:
7660         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7661         break;
7662       case STORE:
7663       case STORELR:
7664         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7665         break;
7666       case ALU:
7667       case SHIFT:
7668         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7669         break;
7670       case MULTDIV:
7671         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7672         break;
7673       case SHIFTIMM:
7674         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7675         break;
7676       case MOV:
7677         if((opcode2[i]&0x1d)==0x10)
7678           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7679         else if((opcode2[i]&0x1d)==0x11)
7680           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7681         else
7682           printf (" %x: %s\n",start+i*4,insn[i]);
7683         break;
7684       case COP0:
7685         if(opcode2[i]==0)
7686           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7687         else if(opcode2[i]==4)
7688           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7689         else printf (" %x: %s\n",start+i*4,insn[i]);
7690         break;
7691       case COP1:
7692         if(opcode2[i]<3)
7693           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7694         else if(opcode2[i]>3)
7695           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7696         else printf (" %x: %s\n",start+i*4,insn[i]);
7697         break;
7698       case COP2:
7699         if(opcode2[i]<3)
7700           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7701         else if(opcode2[i]>3)
7702           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7703         else printf (" %x: %s\n",start+i*4,insn[i]);
7704         break;
7705       case C1LS:
7706         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7707         break;
7708       case C2LS:
7709         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7710         break;
7711       case INTCALL:
7712         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7713         break;
7714       default:
7715         //printf (" %s %8x\n",insn[i],source[i]);
7716         printf (" %x: %s\n",start+i*4,insn[i]);
7717     }
7718 }
7719
7720 // clear the state completely, instead of just marking
7721 // things invalid like invalidate_all_pages() does
7722 void new_dynarec_clear_full()
7723 {
7724   int n;
7725   for(n=0x80000;n<0x80800;n++)
7726     invalid_code[n]=1;
7727   for(n=0;n<65536;n++)
7728     hash_table[n][0]=hash_table[n][2]=-1;
7729   memset(mini_ht,-1,sizeof(mini_ht));
7730   memset(restore_candidate,0,sizeof(restore_candidate));
7731   memset(shadow,0,sizeof(shadow));
7732   copy=shadow;
7733   expirep=16384; // Expiry pointer, +2 blocks
7734   pending_exception=0;
7735   literalcount=0;
7736   stop_after_jal=0;
7737   // TLB
7738 #ifndef DISABLE_TLB
7739   using_tlb=0;
7740 #endif
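  // memory_map has one entry per 4KB guest page; -1 means unmapped, any
  // other value is (host_base - guest_page_base)>>2 so that
  //   host = guest + (memory_map[guest>>12]<<2)
  // as used by new_recompile_block() below when locating block sources.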
7741   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7742     memory_map[n]=-1;
7743   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7744     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7745   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7746     memory_map[n]=-1;
7747   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7748   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7749   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7750 }
7751
7752 void new_dynarec_init()
7753 {
7754   printf("Init new dynarec\n");
7755   out=(u_char *)BASE_ADDR;
7756   if (mmap (out, 1<<TARGET_SIZE_2,
7757             PROT_READ | PROT_WRITE | PROT_EXEC,
7758             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7759             -1, 0) <= 0) {printf("mmap() failed\n");}
7760 #ifdef MUPEN64
7761   rdword=&readmem_dword;
7762   fake_pc.f.r.rs=&readmem_dword;
7763   fake_pc.f.r.rt=&readmem_dword;
7764   fake_pc.f.r.rd=&readmem_dword;
7765 #endif
7766   int n;
7767   new_dynarec_clear_full();
7768 #ifdef HOST_IMM8
7769   // Copy this into local area so we don't have to put it in every literal pool
7770   invc_ptr=invalid_code;
7771 #endif
7772 #ifdef MUPEN64
7773   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
7774     writemem[n] = write_nomem_new;
7775     writememb[n] = write_nomemb_new;
7776     writememh[n] = write_nomemh_new;
7777 #ifndef FORCE32
7778     writememd[n] = write_nomemd_new;
7779 #endif
7780     readmem[n] = read_nomem_new;
7781     readmemb[n] = read_nomemb_new;
7782     readmemh[n] = read_nomemh_new;
7783 #ifndef FORCE32
7784     readmemd[n] = read_nomemd_new;
7785 #endif
7786   }
7787   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
7788     writemem[n] = write_rdram_new;
7789     writememb[n] = write_rdramb_new;
7790     writememh[n] = write_rdramh_new;
7791 #ifndef FORCE32
7792     writememd[n] = write_rdramd_new;
7793 #endif
7794   }
7795   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
7796     writemem[n] = write_nomem_new;
7797     writememb[n] = write_nomemb_new;
7798     writememh[n] = write_nomemh_new;
7799 #ifndef FORCE32
7800     writememd[n] = write_nomemd_new;
7801 #endif
7802     readmem[n] = read_nomem_new;
7803     readmemb[n] = read_nomemb_new;
7804     readmemh[n] = read_nomemh_new;
7805 #ifndef FORCE32
7806     readmemd[n] = read_nomemd_new;
7807 #endif
7808   }
7809 #endif
7810   tlb_hacks();
7811   arch_init();
7812 }
7813
7814 void new_dynarec_cleanup()
7815 {
7816   int n;
7817   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
7818   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7819   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7820   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7821   #ifdef ROM_COPY
7822   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
7823   #endif
7824 }
7825
7826 int new_recompile_block(int addr)
7827 {
7828 /*
7829   if(addr==0x800cd050) {
7830     int block;
7831     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
7832     int n;
7833     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
7834   }
7835 */
7836   //if(Count==365117028) tracedebug=1;
7837   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7838   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7839   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7840   //if(debug) 
7841   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7842   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7843   /*if(Count>=312978186) {
7844     rlist();
7845   }*/
7846   //rlist();
7847   start = (u_int)addr&~3;
7848   //assert(((u_int)addr&1)==0);
7849 #ifdef PCSX
7850   if (Config.HLE && start == 0x80001000) // hlecall
7851   {
7852     // XXX: is this enough? Maybe check hleSoftCall?
7853     u_int beginning=(u_int)out;
7854     u_int page=get_page(start);
7855     invalid_code[start>>12]=0;
7856     emit_movimm(start,0);
7857     emit_writeword(0,(int)&pcaddr);
7858     emit_jmp((int)new_dyna_leave);
7859 #ifdef __arm__
7860     __clear_cache((void *)beginning,out);
7861 #endif
7862     ll_add(jump_in+page,start,(void *)beginning);
7863     return 0;
7864   }
7865   else if ((u_int)addr < 0x00200000 ||
7866     (0xa0000000 <= addr && addr < 0xa0200000)) {
7867     // used for BIOS calls mostly?
7868     source = (u_int *)((u_int)rdram+(start&0x1fffff));
7869     pagelimit = (addr&0xa0000000)|0x00200000;
7870   }
7871   else if (!Config.HLE && (
7872 /*    (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7873     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7874     // BIOS
7875     source = (u_int *)((u_int)psxR+(start&0x7ffff));
7876     pagelimit = (addr&0xfff00000)|0x80000;
7877   }
7878   else
7879 #endif
7880 #ifdef MUPEN64
7881   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
7882     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
7883     pagelimit = 0xa4001000;
7884   }
7885   else
7886 #endif
7887   if ((int)addr >= 0x80000000 && (int)addr < 0x80000000+RAM_SIZE) {
7888     source = (u_int *)((u_int)rdram+start-0x80000000);
7889     pagelimit = 0x80000000+RAM_SIZE;
7890   }
7891 #ifndef DISABLE_TLB
7892   else if ((signed int)addr >= (signed int)0xC0000000) {
7893     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
7894     //if(tlb_LUT_r[start>>12])
7895       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
7896     if((signed int)memory_map[start>>12]>=0) {
7897       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
7898       pagelimit=(start+4096)&0xFFFFF000;
7899       int map=memory_map[start>>12];
7900       int i;
7901       for(i=0;i<5;i++) {
7902         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
7903         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
7904       }
7905       assem_debug("pagelimit=%x\n",pagelimit);
7906       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
7907     }
7908     else {
7909       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
7910       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
7911       return -1; // Caller will invoke exception handler
7912     }
7913     //printf("source= %x\n",(int)source);
7914   }
7915 #endif
7916   else {
7917     printf("Compile at bogus memory address: %x \n", (int)addr);
7918     exit(1);
7919   }
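  // At this point source points at the host memory backing the guest PC,
  // and pagelimit is (roughly) the first guest address past which pass 1
  // below must not fetch further instructions for this block.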
7920
7921   /* Pass 1: disassemble */
7922   /* Pass 2: register dependencies, branch targets */
7923   /* Pass 3: register allocation */
7924   /* Pass 4: branch dependencies */
7925   /* Pass 5: pre-alloc */
7926   /* Pass 6: optimize clean/dirty state */
7927   /* Pass 7: flag 32-bit registers */
7928   /* Pass 8: assembly */
7929   /* Pass 9: linker */
7930   /* Pass 10: garbage collection / free memory */
7931
7932   int i,j;
7933   int done=0;
7934   unsigned int type,op,op2;
7935
7936   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7937   
7938   /* Pass 1 disassembly */
7939
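  // Decode fields: op is the primary opcode (bits 31..26 of the word);
  // op2 is the secondary field, taken from the function bits (source&0x3f)
  // for SPECIAL, the rt field (>>16) for REGIMM, or the rs field (>>21)
  // for the coprocessor opcodes, as the cases below show.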
7940   for(i=0;!done;i++) {
7941     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7942     minimum_free_regs[i]=0;
7943     opcode[i]=op=source[i]>>26;
7944     switch(op)
7945     {
7946       case 0x00: strcpy(insn[i],"special"); type=NI;
7947         op2=source[i]&0x3f;
7948         switch(op2)
7949         {
7950           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7951           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7952           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7953           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7954           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7955           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7956           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7957           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7958           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7959           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7960           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7961           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7962           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7963           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7964           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7965           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7966           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7967           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7968           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7969           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7970           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7971           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7972           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7973           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7974           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7975           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7976           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7977           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7978           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7979           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7980           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7981           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7982           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7983           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7984           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7985           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7986           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7987           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7988           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7989           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7990           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7991           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7992           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7993           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7994           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7995           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7996           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7997           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7998           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7999           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
8000           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
8001           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
8002         }
8003         break;
8004       case 0x01: strcpy(insn[i],"regimm"); type=NI;
8005         op2=(source[i]>>16)&0x1f;
8006         switch(op2)
8007         {
8008           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
8009           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
8010           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
8011           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
8012           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
8013           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
8014           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
8015           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
8016           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
8017           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
8018           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
8019           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
8020           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
8021           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
8022         }
8023         break;
8024       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
8025       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
8026       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
8027       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
8028       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
8029       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
8030       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
8031       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
8032       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
8033       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
8034       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
8035       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
8036       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
8037       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
8038       case 0x10: strcpy(insn[i],"cop0"); type=NI;
8039         op2=(source[i]>>21)&0x1f;
8040         switch(op2)
8041         {
8042           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
8043           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
8044           case 0x10: strcpy(insn[i],"tlb"); type=NI;
8045           switch(source[i]&0x3f)
8046           {
8047             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
8048             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
8049             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
8050             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
8051 #ifdef PCSX
8052             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
8053 #else
8054             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
8055 #endif
8056           }
8057         }
8058         break;
8059       case 0x11: strcpy(insn[i],"cop1"); type=NI;
8060         op2=(source[i]>>21)&0x1f;
8061         switch(op2)
8062         {
8063           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
8064           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
8065           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
8066           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
8067           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
8068           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
8069           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
8070           switch((source[i]>>16)&0x3)
8071           {
8072             case 0x00: strcpy(insn[i],"BC1F"); break;
8073             case 0x01: strcpy(insn[i],"BC1T"); break;
8074             case 0x02: strcpy(insn[i],"BC1FL"); break;
8075             case 0x03: strcpy(insn[i],"BC1TL"); break;
8076           }
8077           break;
8078           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
8079           switch(source[i]&0x3f)
8080           {
8081             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
8082             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
8083             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
8084             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
8085             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
8086             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
8087             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
8088             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
8089             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
8090             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
8091             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
8092             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
8093             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
8094             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
8095             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
8096             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
8097             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
8098             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
8099             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
8100             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
8101             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
8102             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
8103             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
8104             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
8105             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
8106             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
8107             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
8108             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
8109             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
8110             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
8111             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
8112             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
8113             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
8114             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
8115             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
8116           }
8117           break;
8118           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
8119           switch(source[i]&0x3f)
8120           {
8121             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
8122             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
8123             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
8124             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
8125             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
8126             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
8127             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
8128             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
8129             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
8130             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
8131             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
8132             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
8133             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
8134             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
8135             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
8136             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
8137             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
8138             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
8139             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
8140             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
8141             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
8142             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
8143             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
8144             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
8145             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
8146             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
8147             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
8148             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
8149             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
8150             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
8151             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
8152             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
8153             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
8154             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
8155             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
8156           }
8157           break;
8158           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
8159           switch(source[i]&0x3f)
8160           {
8161             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
8162             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
8163           }
8164           break;
8165           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
8166           switch(source[i]&0x3f)
8167           {
8168             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
8169             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
8170           }
8171           break;
8172         }
8173         break;
8174 #ifndef FORCE32
8175       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
8176       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
8177       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
8178       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
8179       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
8180       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
8181       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
8182       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
8183 #endif
8184       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
8185       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
8186       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
8187       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
8188       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
8189       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
8190       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
8191       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
8192       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
8193       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
8194       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
8195       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
8196 #ifndef FORCE32
8197       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
8198       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
8199 #endif
8200       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
8201       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
8202       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
8203       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
8204 #ifndef FORCE32
8205       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
8206       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
8207       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
8208 #endif
8209       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
8210       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
8211 #ifndef FORCE32
8212       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
8213       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
8214       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
8215 #endif
8216 #ifdef PCSX
8217       case 0x12: strcpy(insn[i],"COP2"); type=NI;
8218         // note: COP MIPS-1 encoding differs from MIPS32
8219         op2=(source[i]>>21)&0x1f;
8220         if (source[i]&0x3f) {
8221           if (gte_handlers[source[i]&0x3f]!=NULL) {
8222             snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
8223             type=C2OP;
8224           }
8225         }
8226         else switch(op2)
8227         {
8228           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
8229           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
8230           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
8231           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
8232         }
8233         break;
8234       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
8235       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
8236       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
8237 #endif
8238       default: strcpy(insn[i],"???"); type=NI;
8239         printf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
8240         break;
8241     }
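    // Note: the switch above is a two-level decode of each 32-bit MIPS word:
    // bits 31..26 give the primary opcode; for SPECIAL (0x00) the function
    // field in bits 5..0, for REGIMM (0x01) the rt field in bits 20..16, and
    // for the coprocessor opcodes the rs field in bits 25..21 select op2.
    // Illustrative sketch of the same field extraction (hypothetical helper,
    // not used by the recompiler):
    //
    //   static void decode_fields(u_int word, u_int *op_, u_int *funct_,
    //                             u_int *rt_, u_int *rs_)
    //   {
    //     *op_    = word >> 26;          // primary opcode
    //     *funct_ = word & 0x3f;         // SPECIAL function field
    //     *rt_    = (word >> 16) & 0x1f; // REGIMM sub-opcode
    //     *rs_    = (word >> 21) & 0x1f; // COPz sub-opcode
    //   }
    //   // e.g. 0x00851021 decodes as op=0, funct=0x21, i.e. ADDU $2,$4,$5.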
8242 #ifdef PCSX
8243     /* detect branch in delay slot early */
8244     if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
8245       opcode[i+1]=source[i+1]>>26;
8246       opcode2[i+1]=source[i+1]&0x3f;
8247       if((0<opcode[i+1]&&opcode[i+1]<8)||(opcode[i+1]==0&&(opcode2[i+1]==8||opcode2[i+1]==9))) {
8248         printf("branch in delay slot @%08x (%08x)\n", addr + i*4+4, addr);
8249         // don't compile the first branch; have the interpreter handle it if it is reached
8250         type=INTCALL;
8251       }
8252     }
8253 #endif
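    // Note: the check above flags a branch sitting in another branch's delay
    // slot, a case this recompiler does not model, so the block falls back to
    // the interpreter (INTCALL).  Primary opcodes 1..7 are REGIMM/J/JAL/BEQ/
    // BNE/BLEZ/BGTZ and SPECIAL functs 8/9 are JR/JALR; opcode 1 also holds
    // the REGIMM traps, so the test is deliberately conservative.  The same
    // predicate as a hypothetical standalone helper:
    //
    //   static int looks_like_branch(u_int word)
    //   {
    //     u_int op_=word>>26, funct_=word&0x3f;
    //     return (op_>=1&&op_<=7)||(op_==0&&(funct_==8||funct_==9));
    //   }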
8254     itype[i]=type;
8255     opcode2[i]=op2;
8256     /* Get registers/immediates */
8257     lt1[i]=0;
8258     us1[i]=0;
8259     us2[i]=0;
8260     dep1[i]=0;
8261     dep2[i]=0;
8262     switch(type) {
8263       case LOAD:
8264         rs1[i]=(source[i]>>21)&0x1f;
8265         rs2[i]=0;
8266         rt1[i]=(source[i]>>16)&0x1f;
8267         rt2[i]=0;
8268         imm[i]=(short)source[i];
8269         break;
8270       case STORE:
8271       case STORELR:
8272         rs1[i]=(source[i]>>21)&0x1f;
8273         rs2[i]=(source[i]>>16)&0x1f;
8274         rt1[i]=0;
8275         rt2[i]=0;
8276         imm[i]=(short)source[i];
8277         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
8278         break;
8279       case LOADLR:
8280         // LWL/LWR only load part of the register,
8281         // therefore the target register must be treated as a source too
8282         rs1[i]=(source[i]>>21)&0x1f;
8283         rs2[i]=(source[i]>>16)&0x1f;
8284         rt1[i]=(source[i]>>16)&0x1f;
8285         rt2[i]=0;
8286         imm[i]=(short)source[i];
8287         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
8288         if(op==0x26) dep1[i]=rt1[i]; // LWR
8289         break;
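      // Note: since LWL/LWR merge loaded bytes into the existing contents of
      // rt, the old value of rt is itself an input; listing it as rs2 keeps
      // it live across the instruction.  As a rough illustration (assuming
      // the little-endian layout the PSX uses; the real emitted code differs),
      // LWR performs a merge like:
      //
      //   static u_int lwr_merge(u_int old_rt, u_int mem_word, u_int addr_low2)
      //   {
      //     u_int shift=addr_low2*8;           // 0, 8, 16 or 24
      //     u_int mask=0xffffffffu>>shift;     // bytes taken from memory
      //     return (old_rt&~mask)|((mem_word>>shift)&mask);
      //   }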
8290       case IMM16:
8291         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
8292         else rs1[i]=(source[i]>>21)&0x1f;
8293         rs2[i]=0;
8294         rt1[i]=(source[i]>>16)&0x1f;
8295         rt2[i]=0;
8296         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
8297           imm[i]=(unsigned short)source[i];
8298         }else{
8299           imm[i]=(short)source[i];
8300         }
8301         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
8302         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
8303         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
8304         break;
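      // Note: ANDI/ORI/XORI (0x0c..0x0e) zero-extend their 16-bit immediate
      // while the arithmetic/compare forms sign-extend it, hence the switch
      // between the unsigned short and short casts above.  Hypothetical
      // illustration:
      //
      //   static int decode_imm16(u_int word, int zero_extend)
      //   {
      //     return zero_extend ? (int)(unsigned short)word : (int)(short)word;
      //   }
      //   // decode_imm16(0xffff,0) == -1, decode_imm16(0xffff,1) == 65535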
8305       case UJUMP:
8306         rs1[i]=0;
8307         rs2[i]=0;
8308         rt1[i]=0;
8309         rt2[i]=0;
8310         // The JAL instruction writes to r31.
8311         if (op&1) {
8312           rt1[i]=31;
8313         }
8314         rs2[i]=CCREG;
8315         break;
8316       case RJUMP:
8317         rs1[i]=(source[i]>>21)&0x1f;
8318         rs2[i]=0;
8319         rt1[i]=0;
8320         rt2[i]=0;
8321         // The JALR instruction writes to rd.
8322         if (op2&1) {
8323           rt1[i]=(source[i]>>11)&0x1f;
8324         }
8325         rs2[i]=CCREG;
8326         break;
8327       case CJUMP:
8328         rs1[i]=(source[i]>>21)&0x1f;
8329         rs2[i]=(source[i]>>16)&0x1f;
8330         rt1[i]=0;
8331         rt2[i]=0;
8332         if(op&2) { // BGTZ/BLEZ
8333           rs2[i]=0;
8334         }
8335         us1[i]=rs1[i];
8336         us2[i]=rs2[i];
8337         likely[i]=op>>4;
8338         break;
8339       case SJUMP:
8340         rs1[i]=(source[i]>>21)&0x1f;
8341         rs2[i]=CCREG;
8342         rt1[i]=0;
8343         rt2[i]=0;
8344         us1[i]=rs1[i];
8345         if(op2&0x10) { // BxxAL
8346           rt1[i]=31;
8347           // NOTE: If the branch is not taken, r31 is still overwritten
8348         }
8349         likely[i]=(op2&2)>>1;
8350         break;
8351       case FJUMP:
8352         rs1[i]=FSREG;
8353         rs2[i]=CSREG;
8354         rt1[i]=0;
8355         rt2[i]=0;
8356         likely[i]=((source[i])>>17)&1;
8357         break;
8358       case ALU:
8359         rs1[i]=(source[i]>>21)&0x1f; // source
8360         rs2[i]=(source[i]>>16)&0x1f; // second operand (subtrahend for SUB/SUBU)
8361         rt1[i]=(source[i]>>11)&0x1f; // destination
8362         rt2[i]=0;
8363         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
8364           us1[i]=rs1[i];us2[i]=rs2[i];
8365         }
8366         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
8367           dep1[i]=rs1[i];dep2[i]=rs2[i];
8368         }
8369         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
8370           dep1[i]=rs1[i];dep2[i]=rs2[i];
8371         }
8372         break;
8373       case MULTDIV:
8374         rs1[i]=(source[i]>>21)&0x1f; // source
8375         rs2[i]=(source[i]>>16)&0x1f; // second operand (multiplier or divisor)
8376         rt1[i]=HIREG;
8377         rt2[i]=LOREG;
8378         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
8379           us1[i]=rs1[i];us2[i]=rs2[i];
8380         }
8381         break;
8382       case MOV:
8383         rs1[i]=0;
8384         rs2[i]=0;
8385         rt1[i]=0;
8386         rt2[i]=0;
8387         if(op2==0x10) rs1[i]=HIREG; // MFHI
8388         if(op2==0x11) rt1[i]=HIREG; // MTHI
8389         if(op2==0x12) rs1[i]=LOREG; // MFLO
8390         if(op2==0x13) rt1[i]=LOREG; // MTLO
8391         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
8392         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
8393         dep1[i]=rs1[i];
8394         break;
8395       case SHIFT:
8396         rs1[i]=(source[i]>>16)&0x1f; // value to be shifted
8397         rs2[i]=(source[i]>>21)&0x1f; // shift amount
8398         rt1[i]=(source[i]>>11)&0x1f; // destination
8399         rt2[i]=0;
8400         // DSLLV/DSRLV/DSRAV are 64-bit
8401         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
8402         break;
8403       case SHIFTIMM:
8404         rs1[i]=(source[i]>>16)&0x1f;
8405         rs2[i]=0;
8406         rt1[i]=(source[i]>>11)&0x1f;
8407         rt2[i]=0;
8408         imm[i]=(source[i]>>6)&0x1f;
8409         // DSxx32 instructions
8410         if(op2>=0x3c) imm[i]|=0x20;
8411         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
8412         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
8413         break;
8414       case COP0:
8415         rs1[i]=0;
8416         rs2[i]=0;
8417         rt1[i]=0;
8418         rt2[i]=0;
8419         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
8420         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
8421         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8422         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8423         break;
8424       case COP1:
8425       case COP2:
8426         rs1[i]=0;
8427         rs2[i]=0;
8428         rt1[i]=0;
8429         rt2[i]=0;
8430         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8431         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8432         if(op2==5) us1[i]=rs1[i]; // DMTC1
8433         rs2[i]=CSREG;
8434         break;
8435       case C1LS:
8436         rs1[i]=(source[i]>>21)&0x1F;
8437         rs2[i]=CSREG;
8438         rt1[i]=0;
8439         rt2[i]=0;
8440         imm[i]=(short)source[i];
8441         break;
8442       case C2LS:
8443         rs1[i]=(source[i]>>21)&0x1F;
8444         rs2[i]=0;
8445         rt1[i]=0;
8446         rt2[i]=0;
8447         imm[i]=(short)source[i];
8448         break;
8449       case FLOAT:
8450       case FCONV:
8451         rs1[i]=0;
8452         rs2[i]=CSREG;
8453         rt1[i]=0;
8454         rt2[i]=0;
8455         break;
8456       case FCOMP:
8457         rs1[i]=FSREG;
8458         rs2[i]=CSREG;
8459         rt1[i]=FSREG;
8460         rt2[i]=0;
8461         break;
8462       case SYSCALL:
8463       case HLECALL:
8464       case INTCALL:
8465         rs1[i]=CCREG;
8466         rs2[i]=0;
8467         rt1[i]=0;
8468         rt2[i]=0;
8469         break;
8470       default:
8471         rs1[i]=0;
8472         rs2[i]=0;
8473         rt1[i]=0;
8474         rt2[i]=0;
8475     }
8476     /* Calculate branch target addresses */
8477     if(type==UJUMP)
8478       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8479     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8480       ba[i]=start+i*4+8; // Ignore never taken branch
8481     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8482       ba[i]=start+i*4+8; // Ignore never taken branch
8483     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8484       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8485     else ba[i]=-1;
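    // Note on the target arithmetic above: for J/JAL the 26-bit instruction
    // index is shifted left by 2 and merged with the top 4 bits of the delay
    // slot's address; ((u_int)source[i]<<6)>>4 is just (source[i]&0x3ffffff)<<2.
    // For conditional branches the 16-bit immediate is sign-extended and
    // shifted left by 2 in one step by ((signed int)((u_int)source[i]<<16))>>14.
    // Worked example (values chosen only for illustration): with
    // start=0x80010000, i=4 and an immediate of 0xfffe (-2 words), the target
    // is 0x80010014 + (-8) = 0x8001000c.  Equivalent hypothetical helper,
    // relying on an arithmetic right shift exactly as the code above does:
    //
    //   static u_int cond_branch_target(u_int branch_pc, u_int word)
    //   {
    //     return branch_pc+4+(((signed int)(word<<16))>>14);
    //   }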
8486     /* Is this the end of the block? */
8487     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8488 #ifdef PCSX
8489       // check for link register access in delay slot
8490       int rt1_=rt1[i-1];
8491       if(rt1_!=0&&(rs1[i]==rt1_||rs2[i]==rt1_||rt1[i]==rt1_||rt2[i]==rt1_)) {
8492         printf("link access in delay slot @%08x (%08x)\n", addr + i*4, addr);
8493         ba[i-1]=-1;
8494         itype[i-1]=INTCALL;
8495         done=2;
8496       }
8497       else
8498 #endif
8499       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
8500         done=2;
8501       }
8502       else {
8503         if(stop_after_jal) done=1;
8504         // Stop on BREAK
8505         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8506       }
8507       // Don't recompile stuff that's already compiled
8508       if(check_addr(start+i*4+4)) done=1;
8509       // Don't get too close to the limit
8510       if(i>MAXBLOCK/2) done=1;
8511     }
8512     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
8513     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
8514     if(done==2) {
8515       // Does the block continue due to a branch?
8516       for(j=i-1;j>=0;j--)
8517       {
8518         if(ba[j]==start+i*4+4) done=j=0;
8519         if(ba[j]==start+i*4+8) done=j=0;
8520       }
8521     }
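    // Note: done==2 means "stop unless something still branches past here";
    // the backward scan above keeps the block open whenever an earlier branch
    // targets the next instruction (start+i*4+4) or the one after it.
    // Hypothetical standalone form of the same test:
    //
    //   static int block_continues(const u_int *ba_, int i_, u_int start_)
    //   {
    //     int j_;
    //     for(j_=i_-1;j_>=0;j_--)
    //       if(ba_[j_]==start_+i_*4+4||ba_[j_]==start_+i_*4+8) return 1;
    //     return 0;
    //   }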
8522     //assert(i<MAXBLOCK-1);
8523     if(start+i*4==pagelimit-4) done=1;
8524     assert(start+i*4<pagelimit);
8525     if (i==MAXBLOCK-1) done=1;
8526     // Stop if we're compiling junk
8527     if(itype[i]==NI&&opcode[i]==0x11) {
8528       done=stop_after_jal=1;
8529       printf("Disabled speculative precompilation\n");
8530     }
8531   }
8532   slen=i;
8533   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8534     if(start+i*4==pagelimit) {
8535       itype[i-1]=SPAN;
8536     }
8537   }
8538   assert(slen>0);
8539
8540   /* Pass 2 - Register dependencies and branch targets */
8541
8542   unneeded_registers(0,slen-1,0);
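  // Note: unneeded_registers() fills unneeded_reg[]/unneeded_reg_upper[] with
  // one bitmask per instruction; bit r set means the current value of guest
  // register r (or of its upper 32 bits) is never read again from that point
  // on, so it does not have to be kept in a host register or written back.
  // These masks seed current.u/current.uu in pass 3 below, e.g.:
  //
  //   int still_needed = !((unneeded_reg[i]>>r)&1);   // illustration only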
8543   
8544   /* Pass 3 - Register allocation */
8545
8546   struct regstat current; // Current register allocations/status
8547   current.is32=1;
8548   current.dirty=0;
8549   current.u=unneeded_reg[0];
8550   current.uu=unneeded_reg_upper[0];
8551   clear_all_regs(current.regmap);
8552   alloc_reg(&current,0,CCREG);
8553   dirty_reg(&current,CCREG);
8554   current.isconst=0;
8555   current.wasconst=0;
8556   int ds=0;
8557   int cc=0;
8558   int hr;
8559
8560 #ifndef FORCE32
8561   provisional_32bit();
8562 #endif
8563   if((u_int)addr&1) {
8564     // First instruction is delay slot
8565     cc=-1;
8566     bt[1]=1;
8567     ds=1;
8568     unneeded_reg[0]=1;
8569     unneeded_reg_upper[0]=1;
8570     current.regmap[HOST_BTREG]=BTREG;
8571   }
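  // Note: the low bit of addr appears to act as a flag meaning "this block
  // starts at the delay slot of a branch compiled in another block"; in that
  // case the first instruction is treated as a delay slot (ds=1, bt[1]=1),
  // the liveness masks are made conservative (unneeded_reg[0]=1), and the
  // branch target is expected to arrive in HOST_BTREG (BTREG).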
8572   
8573   for(i=0;i<slen;i++)
8574   {
8575     if(bt[i])
8576     {
8577       int hr;
8578       for(hr=0;hr<HOST_REGS;hr++)
8579       {
8580         // Is this really necessary?
8581         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8582       }
8583       current.isconst=0;
8584     }
8585     if(i>1)
8586     {
8587       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8588       {
8589         if(rs1[i-2]==0||rs2[i-2]==0)
8590         {
8591           if(rs1[i-2]) {
8592             current.is32|=1LL<<rs1[i-2];
8593             int hr=get_reg(current.regmap,rs1[i-2]|64);
8594             if(hr>=0) current.regmap[hr]=-1;
8595           }
8596           if(rs2[i-2]) {
8597             current.is32|=1LL<<rs2[i-2];
8598             int hr=get_reg(current.regmap,rs2[i-2]|64);
8599             if(hr>=0) current.regmap[hr]=-1;
8600           }
8601         }
8602       }
8603     }
8604 #ifndef FORCE32
8605     // If something jumps here with 64-bit values
8606     // then promote those registers to 64 bits
8607     if(bt[i])
8608     {
8609       uint64_t temp_is32=current.is32;
8610       for(j=i-1;j>=0;j--)
8611       {
8612         if(ba[j]==start+i*4) 
8613           temp_is32&=branch_regs[j].is32;
8614       }
8615       for(j=i;j<slen;j++)
8616       {
8617         if(ba[j]==start+i*4) 
8618           //temp_is32=1;
8619           temp_is32&=p32[j];
8620       }
8621       if(temp_is32!=current.is32) {
8622         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8623         #ifdef DESTRUCTIVE_WRITEBACK
8624         for(hr=0;hr<HOST_REGS;hr++)
8625         {
8626           int r=current.regmap[hr];
8627           if(r>0&&r<64)
8628           {
8629             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8630               temp_is32|=1LL<<r;
8631               //printf("restore %d\n",r);
8632             }
8633           }
8634         }
8635         #endif
8636         current.is32=temp_is32;
8637       }
8638     }
8639 #else
8640     current.is32=-1LL;
8641 #endif
8642
8643     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8644     regs[i].wasconst=current.isconst;
8645     regs[i].was32=current.is32;
8646     regs[i].wasdirty=current.dirty;
8647     #if defined(DESTRUCTIVE_WRITEBACK) && !defined(FORCE32)
8648     // To change a dirty register from 32 to 64 bits, we must write
8649     // it out during the previous cycle (for branches, 2 cycles)
8650     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8651     {
8652       uint64_t temp_is32=current.is32;
8653       for(j=i-1;j>=0;j--)
8654       {
8655         if(ba[j]==start+i*4+4) 
8656           temp_is32&=branch_regs[j].is32;
8657       }
8658       for(j=i;j<slen;j++)
8659       {
8660         if(ba[j]==start+i*4+4) 
8661           //temp_is32=1;
8662           temp_is32&=p32[j];
8663       }
8664       if(temp_is32!=current.is32) {
8665         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8666         for(hr=0;hr<HOST_REGS;hr++)
8667         {
8668           int r=current.regmap[hr];
8669           if(r>0)
8670           {
8671             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8672               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
8673               {
8674                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
8675                 {
8676                   //printf("dump %d/r%d\n",hr,r);
8677                   current.regmap[hr]=-1;
8678                   if(get_reg(current.regmap,r|64)>=0) 
8679                     current.regmap[get_reg(current.regmap,r|64)]=-1;
8680                 }
8681               }
8682             }
8683           }
8684         }
8685       }
8686     }
8687     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
8688     {
8689       uint64_t temp_is32=current.is32;
8690       for(j=i-1;j>=0;j--)
8691       {
8692         if(ba[j]==start+i*4+8) 
8693           temp_is32&=branch_regs[j].is32;
8694       }
8695       for(j=i;j<slen;j++)
8696       {
8697         if(ba[j]==start+i*4+8) 
8698           //temp_is32=1;
8699           temp_is32&=p32[j];
8700       }
8701       if(temp_is32!=current.is32) {
8702         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8703         for(hr=0;hr<HOST_REGS;hr++)
8704         {
8705           int r=current.regmap[hr];
8706           if(r>0)
8707           {
8708             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8709               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
8710               {
8711                 //printf("dump %d/r%d\n",hr,r);
8712                 current.regmap[hr]=-1;
8713                 if(get_reg(current.regmap,r|64)>=0) 
8714                   current.regmap[get_reg(current.regmap,r|64)]=-1;
8715               }
8716             }
8717           }
8718         }
8719       }
8720     }
8721     #endif
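    // Note: both blocks above handle the case where an upcoming branch target
    // (bt[i+1] or, around a branch, bt[i+2]) expects a register to be 64-bit
    // that is currently held dirty as 32-bit.  With DESTRUCTIVE_WRITEBACK the
    // register is dropped from the map one or two instructions early, so it
    // is written back (and later reloaded at full width) before the target is
    // reached, matching the "write it out during the previous cycle" comment
    // above.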
8722     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8723       if(i+1<slen) {
8724         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8725         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8726         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8727         current.u|=1;
8728         current.uu|=1;
8729       } else {
8730         current.u=1;
8731         current.uu=1;
8732       }
8733     } else {
8734       if(i+1<slen) {
8735         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8736         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8737         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8738         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8739         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8740         current.u|=1;
8741         current.uu|=1;
8742       } else { printf("oops, branch at end of block with no delay slot\n"); exit(1); }
8743     }
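    // Note: current.u/current.uu are the complement of liveness, propagated
    // backwards one instruction at a time: a register stays "unneeded" only
    // if it is unneeded after this instruction and is not read here (rs1/rs2
    // for the low half, us1/us2 plus the dep1/dep2 upper-half dependencies
    // for the high half).  Bit 0 ($zero) is forced on since r0 never has to
    // be preserved.  E.g. before ADDU $2,$4,$5 the bits for $4 and $5 are
    // cleared because the add reads them.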
8744     is_ds[i]=ds;
8745     if(ds) {
8746       ds=0; // Skip delay slot, already allocated as part of branch
8747       // ...but we need to alloc it in case something jumps here
8748       if(i+1<slen) {
8749         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8750         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8751       }else{
8752         current.u=branch_unneeded_reg[i-1];
8753         current.uu=branch_unneeded_reg_upper[i-1];
8754       }
8755       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8756       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8757       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8758       current.u|=1;
8759       current.uu|=1;
8760       struct regstat temp;
8761       memcpy(&temp,&current,sizeof(current));
8762       temp.wasdirty=temp.dirty;
8763       temp.was32=temp.is32;
8764       // TODO: Take into account unconditional branches, as below
8765       delayslot_alloc(&temp,i);
8766       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8767       regs[i].wasdirty=temp.wasdirty;
8768       regs[i].was32=temp.was32;
8769       regs[i].dirty=temp.dirty;
8770       regs[i].is32=temp.is32;
8771       regs[i].isconst=0;
8772       regs[i].wasconst=0;
8773       current.isconst=0;
8774       // Create entry (branch target) regmap
8775       for(hr=0;hr<HOST_REGS;hr++)
8776       {
8777         int r=temp.regmap[hr];
8778         if(r>=0) {
8779           if(r!=regmap_pre[i][hr]) {
8780             regs[i].regmap_entry[hr]=-1;
8781           }
8782           else
8783           {
8784             if(r<64){
8785               if((current.u>>r)&1) {
8786                 regs[i].regmap_entry[hr]=-1;
8787                 regs[i].regmap[hr]=-1;
8788                 //Don't clear regs in the delay slot as the branch might need them
8789                 //current.regmap[hr]=-1;
8790               }else
8791                 regs[i].regmap_entry[hr]=r;
8792             }
8793             else {
8794               if((current.uu>>(r&63))&1) {
8795                 regs[i].regmap_entry[hr]=-1;
8796                 regs[i].regmap[hr]=-1;
8797                 //Don't clear regs in the delay slot as the branch might need them
8798                 //current.regmap[hr]=-1;
8799               }else
8800                 regs[i].regmap_entry[hr]=r;
8801             }
8802           }
8803         } else {
8804           // First instruction expects CCREG to be allocated
8805           if(i==0&&hr==HOST_CCREG) 
8806             regs[i].regmap_entry[hr]=CCREG;
8807           else
8808             regs[i].regmap_entry[hr]=-1;
8809         }
8810       }
8811     }
8812     else { // Not delay slot
8813       switch(itype[i]) {
8814         case UJUMP:
8815           //current.isconst=0; // DEBUG
8816           //current.wasconst=0; // DEBUG
8817           //regs[i].wasconst=0; // DEBUG
8818           clear_const(&current,rt1[i]);
8819           alloc_cc(&current,i);
8820           dirty_reg(&current,CCREG);
8821           if (rt1[i]==31) {
8822             alloc_reg(&current,i,31);
8823             dirty_reg(&current,31);
8824             assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8825             assert(rt1[i+1]!=rt1[i]);
8826             #ifdef REG_PREFETCH
8827             alloc_reg(&current,i,PTEMP);
8828             #endif
8829             //current.is32|=1LL<<rt1[i];
8830           }
8831           ooo[i]=1;
8832           delayslot_alloc(&current,i+1);
8833           //current.isconst=0; // DEBUG
8834           ds=1;
8835           //printf("i=%d, isconst=%x\n",i,current.isconst);
8836           break;
8837         case RJUMP:
8838           //current.isconst=0;
8839           //current.wasconst=0;
8840           //regs[i].wasconst=0;
8841           clear_const(&current,rs1[i]);
8842           clear_const(&current,rt1[i]);
8843           alloc_cc(&current,i);
8844           dirty_reg(&current,CCREG);
8845           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8846             alloc_reg(&current,i,rs1[i]);
8847             if (rt1[i]!=0) {
8848               alloc_reg(&current,i,rt1[i]);
8849               dirty_reg(&current,rt1[i]);
8850               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8851               assert(rt1[i+1]!=rt1[i]);
8852               #ifdef REG_PREFETCH
8853               alloc_reg(&current,i,PTEMP);
8854               #endif
8855             }
8856             #ifdef USE_MINI_HT
8857             if(rs1[i]==31) { // JALR
8858               alloc_reg(&current,i,RHASH);
8859               #ifndef HOST_IMM_ADDR32
8860               alloc_reg(&current,i,RHTBL);
8861               #endif
8862             }
8863             #endif
8864             delayslot_alloc(&current,i+1);
8865           } else {
8866             // The delay slot overwrites our source register,
8867             // so allocate a temporary register to hold the old value.
8868             current.isconst=0;
8869             current.wasconst=0;
8870             regs[i].wasconst=0;
8871             delayslot_alloc(&current,i+1);
8872             current.isconst=0;
8873             alloc_reg(&current,i,RTEMP);
8874           }
8875           //current.isconst=0; // DEBUG
8876           ooo[i]=1;
8877           ds=1;
8878           break;
8879         case CJUMP:
8880           //current.isconst=0;
8881           //current.wasconst=0;
8882           //regs[i].wasconst=0;
8883           clear_const(&current,rs1[i]);
8884           clear_const(&current,rs2[i]);
8885           if((opcode[i]&0x3E)==4) // BEQ/BNE
8886           {
8887             alloc_cc(&current,i);
8888             dirty_reg(&current,CCREG);
8889             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8890             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8891             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8892             {
8893               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8894               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8895             }
8896             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8897                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8898               // The delay slot overwrites one of our conditions.
8899               // Allocate the branch condition registers instead.
8900               current.isconst=0;
8901               current.wasconst=0;
8902               regs[i].wasconst=0;
8903               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8904               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8905               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8906               {
8907                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8908                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8909               }
8910             }
8911             else
8912             {
8913               ooo[i]=1;
8914               delayslot_alloc(&current,i+1);
8915             }
8916           }
8917           else
8918           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8919           {
8920             alloc_cc(&current,i);
8921             dirty_reg(&current,CCREG);
8922             alloc_reg(&current,i,rs1[i]);
8923             if(!(current.is32>>rs1[i]&1))
8924             {
8925               alloc_reg64(&current,i,rs1[i]);
8926             }
8927             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8928               // The delay slot overwrites one of our conditions.
8929               // Allocate the branch condition registers instead.
8930               current.isconst=0;
8931               current.wasconst=0;
8932               regs[i].wasconst=0;
8933               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8934               if(!((current.is32>>rs1[i])&1))
8935               {
8936                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8937               }
8938             }
8939             else
8940             {
8941               ooo[i]=1;
8942               delayslot_alloc(&current,i+1);
8943             }
8944           }
8945           else
8946           // Don't alloc the delay slot yet because we might not execute it
8947           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8948           {
8949             current.isconst=0;
8950             current.wasconst=0;
8951             regs[i].wasconst=0;
8952             alloc_cc(&current,i);
8953             dirty_reg(&current,CCREG);
8954             alloc_reg(&current,i,rs1[i]);
8955             alloc_reg(&current,i,rs2[i]);
8956             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8957             {
8958               alloc_reg64(&current,i,rs1[i]);
8959               alloc_reg64(&current,i,rs2[i]);
8960             }
8961           }
8962           else
8963           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8964           {
8965             current.isconst=0;
8966             current.wasconst=0;
8967             regs[i].wasconst=0;
8968             alloc_cc(&current,i);
8969             dirty_reg(&current,CCREG);
8970             alloc_reg(&current,i,rs1[i]);
8971             if(!(current.is32>>rs1[i]&1))
8972             {
8973               alloc_reg64(&current,i,rs1[i]);
8974             }
8975           }
8976           ds=1;
8977           //current.isconst=0;
8978           break;
8979         case SJUMP:
8980           //current.isconst=0;
8981           //current.wasconst=0;
8982           //regs[i].wasconst=0;
8983           clear_const(&current,rs1[i]);
8984           clear_const(&current,rt1[i]);
8985           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8986           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8987           {
8988             alloc_cc(&current,i);
8989             dirty_reg(&current,CCREG);
8990             alloc_reg(&current,i,rs1[i]);
8991             if(!(current.is32>>rs1[i]&1))
8992             {
8993               alloc_reg64(&current,i,rs1[i]);
8994             }
8995             if (rt1[i]==31) { // BLTZAL/BGEZAL
8996               alloc_reg(&current,i,31);
8997               dirty_reg(&current,31);
8998               //#ifdef REG_PREFETCH
8999               //alloc_reg(&current,i,PTEMP);
9000               //#endif
9001               //current.is32|=1LL<<rt1[i];
9002             }
9003             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
9004                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
9005               // Allocate the branch condition registers instead.
9006               current.isconst=0;
9007               current.wasconst=0;
9008               regs[i].wasconst=0;
9009               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9010               if(!((current.is32>>rs1[i])&1))
9011               {
9012                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9013               }
9014             }
9015             else
9016             {
9017               ooo[i]=1;
9018               delayslot_alloc(&current,i+1);
9019             }
9020           }
9021           else
9022           // Don't alloc the delay slot yet because we might not execute it
9023           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
9024           {
9025             current.isconst=0;
9026             current.wasconst=0;
9027             regs[i].wasconst=0;
9028             alloc_cc(&current,i);
9029             dirty_reg(&current,CCREG);
9030             alloc_reg(&current,i,rs1[i]);
9031             if(!(current.is32>>rs1[i]&1))
9032             {
9033               alloc_reg64(&current,i,rs1[i]);
9034             }
9035           }
9036           ds=1;
9037           //current.isconst=0;
9038           break;
9039         case FJUMP:
9040           current.isconst=0;
9041           current.wasconst=0;
9042           regs[i].wasconst=0;
9043           if(likely[i]==0) // BC1F/BC1T
9044           {
9045             // TODO: Theoretically we can run out of registers here on x86.
9046             // The delay slot can allocate up to six, and we need to check
9047             // CSREG before executing the delay slot.  Possibly we can drop
9048             // the cycle count and then reload it after checking that the
9049             // FPU is in a usable state, or simply not do out-of-order execution.
9050             alloc_cc(&current,i);
9051             dirty_reg(&current,CCREG);
9052             alloc_reg(&current,i,FSREG);
9053             alloc_reg(&current,i,CSREG);
9054             if(itype[i+1]==FCOMP) {
9055               // The delay slot overwrites the branch condition.
9056               // Allocate the branch condition registers instead.
9057               alloc_cc(&current,i);
9058               dirty_reg(&current,CCREG);
9059               alloc_reg(&current,i,CSREG);
9060               alloc_reg(&current,i,FSREG);
9061             }
9062             else {
9063               ooo[i]=1;
9064               delayslot_alloc(&current,i+1);
9065               alloc_reg(&current,i+1,CSREG);
9066             }
9067           }
9068           else
9069           // Don't alloc the delay slot yet because we might not execute it
9070           if(likely[i]) // BC1FL/BC1TL
9071           {
9072             alloc_cc(&current,i);
9073             dirty_reg(&current,CCREG);
9074             alloc_reg(&current,i,CSREG);
9075             alloc_reg(&current,i,FSREG);
9076           }
9077           ds=1;
9078           current.isconst=0;
9079           break;
9080         case IMM16:
9081           imm16_alloc(&current,i);
9082           break;
9083         case LOAD:
9084         case LOADLR:
9085           load_alloc(&current,i);
9086           break;
9087         case STORE:
9088         case STORELR:
9089           store_alloc(&current,i);
9090           break;
9091         case ALU:
9092           alu_alloc(&current,i);
9093           break;
9094         case SHIFT:
9095           shift_alloc(&current,i);
9096           break;
9097         case MULTDIV:
9098           multdiv_alloc(&current,i);
9099           break;
9100         case SHIFTIMM:
9101           shiftimm_alloc(&current,i);
9102           break;
9103         case MOV:
9104           mov_alloc(&current,i);
9105           break;
9106         case COP0:
9107           cop0_alloc(&current,i);
9108           break;
9109         case COP1:
9110         case COP2:
9111           cop1_alloc(&current,i);
9112           break;
9113         case C1LS:
9114           c1ls_alloc(&current,i);
9115           break;
9116         case C2LS:
9117           c2ls_alloc(&current,i);
9118           break;
9119         case C2OP:
9120           c2op_alloc(&current,i);
9121           break;
9122         case FCONV:
9123           fconv_alloc(&current,i);
9124           break;
9125         case FLOAT:
9126           float_alloc(&current,i);
9127           break;
9128         case FCOMP:
9129           fcomp_alloc(&current,i);
9130           break;
9131         case SYSCALL:
9132         case HLECALL:
9133         case INTCALL:
9134           syscall_alloc(&current,i);
9135           break;
9136         case SPAN:
9137           pagespan_alloc(&current,i);
9138           break;
9139       }
9140       
9141       // Drop the upper half of registers that have become 32-bit
9142       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
9143       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9144         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9145         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9146         current.uu|=1;
9147       } else {
9148         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
9149         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9150         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9151         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9152         current.uu|=1;
9153       }
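      // Note: once an instruction produces a 32-bit (sign-extended) result,
      // the upper half of its destination no longer has to be preserved, so
      // the corresponding current.uu bits are set from current.is32; the
      // branch variant applies the same rule to the delay-slot destination
      // while keeping the upper halves it still reads (us1/us2, dep1/dep2).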
9154
9155       // Create entry (branch target) regmap
9156       for(hr=0;hr<HOST_REGS;hr++)
9157       {
9158         int r,or,er;
9159         r=current.regmap[hr];
9160         if(r>=0) {
9161           if(r!=regmap_pre[i][hr]) {
9162             // TODO: delay slot (?)
9163             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
9164             if(or<0||(r&63)>=TEMPREG){
9165               regs[i].regmap_entry[hr]=-1;
9166             }
9167             else
9168             {
9169               // Just move it to a different register
9170               regs[i].regmap_entry[hr]=r;
9171               // If it was dirty before, it's still dirty
9172               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
9173             }
9174           }
9175           else
9176           {
9177             // Unneeded
9178             if(r==0){
9179               regs[i].regmap_entry[hr]=0;
9180             }
9181             else
9182             if(r<64){
9183               if((current.u>>r)&1) {
9184                 regs[i].regmap_entry[hr]=-1;
9185                 //regs[i].regmap[hr]=-1;
9186                 current.regmap[hr]=-1;
9187               }else
9188                 regs[i].regmap_entry[hr]=r;
9189             }
9190             else {
9191               if((current.uu>>(r&63))&1) {
9192                 regs[i].regmap_entry[hr]=-1;
9193                 //regs[i].regmap[hr]=-1;
9194                 current.regmap[hr]=-1;
9195               }else
9196                 regs[i].regmap_entry[hr]=r;
9197             }
9198           }
9199         } else {
9200           // Branches expect CCREG to be allocated at the target
9201           if(regmap_pre[i][hr]==CCREG) 
9202             regs[i].regmap_entry[hr]=CCREG;
9203           else
9204             regs[i].regmap_entry[hr]=-1;
9205         }
9206       }
9207       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
9208     }
9209     /* Branch post-alloc */
9210     if(i>0)
9211     {
9212       current.was32=current.is32;
9213       current.wasdirty=current.dirty;
9214       switch(itype[i-1]) {
9215         case UJUMP:
9216           memcpy(&branch_regs[i-1],&current,sizeof(current));
9217           branch_regs[i-1].isconst=0;
9218           branch_regs[i-1].wasconst=0;
9219           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9220           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9221           alloc_cc(&branch_regs[i-1],i-1);
9222           dirty_reg(&branch_regs[i-1],CCREG);
9223           if(rt1[i-1]==31) { // JAL
9224             alloc_reg(&branch_regs[i-1],i-1,31);
9225             dirty_reg(&branch_regs[i-1],31);
9226             branch_regs[i-1].is32|=1LL<<31;
9227           }
9228           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9229           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9230           break;
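        // Note: branch_regs[i-1] appears to capture the register state assumed
        // on the branch's exit path, i.e. after the delay slot has executed;
        // it is what the branch assembler and exit stubs use, while regs[i-1]
        // describes the state while the branch itself is being assembled.
        // For UJUMP/RJUMP the link register (r31 or rd) is allocated and
        // marked dirty here so the return address is written back on exit.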
9231         case RJUMP:
9232           memcpy(&branch_regs[i-1],&current,sizeof(current));
9233           branch_regs[i-1].isconst=0;
9234           branch_regs[i-1].wasconst=0;
9235           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9236           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9237           alloc_cc(&branch_regs[i-1],i-1);
9238           dirty_reg(&branch_regs[i-1],CCREG);
9239           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
9240           if(rt1[i-1]!=0) { // JALR
9241             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
9242             dirty_reg(&branch_regs[i-1],rt1[i-1]);
9243             branch_regs[i-1].is32|=1LL<<rt1[i-1];
9244           }
9245           #ifdef USE_MINI_HT
9246           if(rs1[i-1]==31) { // JALR
9247             alloc_reg(&branch_regs[i-1],i-1,RHASH);
9248             #ifndef HOST_IMM_ADDR32
9249             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
9250             #endif
9251           }
9252           #endif
9253           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9254           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9255           break;
9256         case CJUMP:
9257           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
9258           {
9259             alloc_cc(&current,i-1);
9260             dirty_reg(&current,CCREG);
9261             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
9262                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
9263               // The delay slot overwrote one of our conditions
9264               // Delay slot goes after the test (in order)
9265               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9266               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9267               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9268               current.u|=1;
9269               current.uu|=1;
9270               delayslot_alloc(&current,i);
9271               current.isconst=0;
9272             }
9273             else
9274             {
9275               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9276               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9277               // Alloc the branch condition registers
9278               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
9279               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
9280               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
9281               {
9282                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
9283                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
9284               }
9285             }
9286             memcpy(&branch_regs[i-1],&current,sizeof(current));
9287             branch_regs[i-1].isconst=0;
9288             branch_regs[i-1].wasconst=0;
9289             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9290             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9291           }
9292           else
9293           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
9294           {
9295             alloc_cc(&current,i-1);
9296             dirty_reg(&current,CCREG);
9297             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9298               // The delay slot overwrote the branch condition
9299               // Delay slot goes after the test (in order)
9300               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9301               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9302               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9303               current.u|=1;
9304               current.uu|=1;
9305               delayslot_alloc(&current,i);
9306               current.isconst=0;
9307             }
9308             else
9309             {
9310               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9311               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9312               // Alloc the branch condition register
9313               alloc_reg(&current,i-1,rs1[i-1]);
9314               if(!(current.is32>>rs1[i-1]&1))
9315               {
9316                 alloc_reg64(&current,i-1,rs1[i-1]);
9317               }
9318             }
9319             memcpy(&branch_regs[i-1],&current,sizeof(current));
9320             branch_regs[i-1].isconst=0;
9321             branch_regs[i-1].wasconst=0;
9322             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9323             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9324           }
9325           else
9326           // Alloc the delay slot in case the branch is taken
9327           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
9328           {
9329             memcpy(&branch_regs[i-1],&current,sizeof(current));
9330             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9331             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9332             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9333             alloc_cc(&branch_regs[i-1],i);
9334             dirty_reg(&branch_regs[i-1],CCREG);
9335             delayslot_alloc(&branch_regs[i-1],i);
9336             branch_regs[i-1].isconst=0;
9337             alloc_reg(&current,i,CCREG); // Not taken path
9338             dirty_reg(&current,CCREG);
9339             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9340           }
9341           else
9342           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
9343           {
9344             memcpy(&branch_regs[i-1],&current,sizeof(current));
9345             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9346             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9347             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9348             alloc_cc(&branch_regs[i-1],i);
9349             dirty_reg(&branch_regs[i-1],CCREG);
9350             delayslot_alloc(&branch_regs[i-1],i);
9351             branch_regs[i-1].isconst=0;
9352             alloc_reg(&current,i,CCREG); // Not taken path
9353             dirty_reg(&current,CCREG);
9354             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9355           }
9356           break;
9357         case SJUMP:
9358           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
9359           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
9360           {
9361             alloc_cc(&current,i-1);
9362             dirty_reg(&current,CCREG);
9363             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9364               // The delay slot overwrote the branch condition
9365               // Delay slot goes after the test (in order)
9366               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9367               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9368               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9369               current.u|=1;
9370               current.uu|=1;
9371               delayslot_alloc(&current,i);
9372               current.isconst=0;
9373             }
9374             else
9375             {
9376               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9377               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9378               // Alloc the branch condition register
9379               alloc_reg(&current,i-1,rs1[i-1]);
9380               if(!(current.is32>>rs1[i-1]&1))
9381               {
9382                 alloc_reg64(&current,i-1,rs1[i-1]);
9383               }
9384             }
9385             memcpy(&branch_regs[i-1],&current,sizeof(current));
9386             branch_regs[i-1].isconst=0;
9387             branch_regs[i-1].wasconst=0;
9388             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9389             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9390           }
9391           else
9392           // Alloc the delay slot in case the branch is taken
9393           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
9394           {
9395             memcpy(&branch_regs[i-1],&current,sizeof(current));
9396             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9397             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9398             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9399             alloc_cc(&branch_regs[i-1],i);
9400             dirty_reg(&branch_regs[i-1],CCREG);
9401             delayslot_alloc(&branch_regs[i-1],i);
9402             branch_regs[i-1].isconst=0;
9403             alloc_reg(&current,i,CCREG); // Not taken path
9404             dirty_reg(&current,CCREG);
9405             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9406           }
9407           // FIXME: BLTZAL/BGEZAL
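          // The and-link forms write the return address to r31 whether or not
          // the branch is taken, so r31 is allocated in the branch register
          // set, marked dirty, and flagged as a 32-bit value here.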
9408           if(opcode2[i-1]&0x10) { // BxxZAL
9409             alloc_reg(&branch_regs[i-1],i-1,31);
9410             dirty_reg(&branch_regs[i-1],31);
9411             branch_regs[i-1].is32|=1LL<<31;
9412           }
9413           break;
9414         case FJUMP:
9415           if(likely[i-1]==0) // BC1F/BC1T
9416           {
9417             alloc_cc(&current,i-1);
9418             dirty_reg(&current,CCREG);
9419             if(itype[i]==FCOMP) {
9420               // The delay slot overwrote the branch condition
9421               // Delay slot goes after the test (in order)
9422               delayslot_alloc(&current,i);
9423               current.isconst=0;
9424             }
9425             else
9426             {
9427               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9428               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9429               // Alloc the branch condition register
9430               alloc_reg(&current,i-1,FSREG);
9431             }
9432             memcpy(&branch_regs[i-1],&current,sizeof(current));
9433             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9434           }
9435           else // BC1FL/BC1TL
9436           {
9437             // Alloc the delay slot in case the branch is taken
9438             memcpy(&branch_regs[i-1],&current,sizeof(current));
9439             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9440             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9441             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9442             alloc_cc(&branch_regs[i-1],i);
9443             dirty_reg(&branch_regs[i-1],CCREG);
9444             delayslot_alloc(&branch_regs[i-1],i);
9445             branch_regs[i-1].isconst=0;
9446             alloc_reg(&current,i,CCREG); // Not taken path
9447             dirty_reg(&current,CCREG);
9448             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9449           }
9450           break;
9451       }
9452
9453       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
9454       {
9455         if(rt1[i-1]==31) // JAL/JALR
9456         {
9457           // Subroutine call will return here, don't alloc any registers
9458           current.is32=1;
9459           current.dirty=0;
9460           clear_all_regs(current.regmap);
9461           alloc_reg(&current,i,CCREG);
9462           dirty_reg(&current,CCREG);
9463         }
9464         else if(i+1<slen)
9465         {
9466           // Internal branch will jump here, match registers to caller
9467           current.is32=0x3FFFFFFFFLL;
9468           current.dirty=0;
9469           clear_all_regs(current.regmap);
9470           alloc_reg(&current,i,CCREG);
9471           dirty_reg(&current,CCREG);
9472           for(j=i-1;j>=0;j--)
9473           {
9474             if(ba[j]==start+i*4+4) {
9475               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9476               current.is32=branch_regs[j].is32;
9477               current.dirty=branch_regs[j].dirty;
9478               break;
9479             }
9480           }
9481           while(j>=0) {
9482             if(ba[j]==start+i*4+4) {
9483               for(hr=0;hr<HOST_REGS;hr++) {
9484                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9485                   current.regmap[hr]=-1;
9486                 }
9487                 current.is32&=branch_regs[j].is32;
9488                 current.dirty&=branch_regs[j].dirty;
9489               }
9490             }
9491             j--;
9492           }
9493         }
9494       }
9495     }
9496
9497     // Count cycles in between branches
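    // ccadj[i] records how many cycles have accumulated since the previous
    // branch/syscall; the assembler uses it later when updating CCREG.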
9498     ccadj[i]=cc;
9499     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
9500     {
9501       cc=0;
9502     }
9503 #ifdef PCSX
9504     else if(/*itype[i]==LOAD||*/itype[i]==STORE||itype[i]==C1LS) // load causes weird timing issues
9505     {
9506       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
9507     }
9508     else if(itype[i]==C2LS)
9509     {
9510       cc+=4;
9511     }
9512 #endif
9513     else
9514     {
9515       cc++;
9516     }
9517
9518     flush_dirty_uppers(&current);
9519     if(!is_ds[i]) {
9520       regs[i].is32=current.is32;
9521       regs[i].dirty=current.dirty;
9522       regs[i].isconst=current.isconst;
9523       memcpy(constmap[i],current.constmap,sizeof(current.constmap));
9524     }
9525     for(hr=0;hr<HOST_REGS;hr++) {
9526       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9527         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9528           regs[i].wasconst&=~(1<<hr);
9529         }
9530       }
9531     }
9532     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9533   }
9534   
9535   /* Pass 4 - Cull unused host registers */
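  // Walk the block backwards with a bitmap (nr) of host registers whose
  // contents are still needed.  At internal branches the set is seeded from
  // whatever matches the target's entry map; registers not in the set are
  // then dropped from the maps so later passes can reuse them.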
9536   
9537   uint64_t nr=0;
9538   
9539   for (i=slen-1;i>=0;i--)
9540   {
9541     int hr;
9542     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9543     {
9544       if(ba[i]<start || ba[i]>=(start+slen*4))
9545       {
9546         // Branch out of this block, don't need anything
9547         nr=0;
9548       }
9549       else
9550       {
9551         // Internal branch
9552         // Need whatever matches the target
9553         nr=0;
9554         int t=(ba[i]-start)>>2;
9555         for(hr=0;hr<HOST_REGS;hr++)
9556         {
9557           if(regs[i].regmap_entry[hr]>=0) {
9558             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9559           }
9560         }
9561       }
9562       // Conditional branch may need registers for following instructions
9563       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9564       {
9565         if(i<slen-2) {
9566           nr|=needed_reg[i+2];
9567           for(hr=0;hr<HOST_REGS;hr++)
9568           {
9569             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9570             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9571           }
9572         }
9573       }
9574       // Don't need stuff which is overwritten
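      // NB hr is stale at this point (these two checks are not inside a
      // per-register loop, unlike the equivalent checks in the non-branch
      // path below)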
9575       if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9576       if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9577       // Merge in delay slot
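      // Registers read by the delay slot instruction must stay live across
      // the branch; whether its destinations count as overwritten here
      // depends on the likely flag (see below).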
9578       for(hr=0;hr<HOST_REGS;hr++)
9579       {
9580         if(!likely[i]) {
9581           // These are overwritten unless the branch is "likely"
9582           // and the delay slot is nullified if not taken
9583           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9584           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9585         }
9586         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9587         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9588         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9589         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9590         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9591         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9592         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9593         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9594         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9595           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9596           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9597         }
9598         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9599           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9600           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9601         }
9602         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
9603           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9604           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9605         }
9606       }
9607     }
9608     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
9609     {
9610       // SYSCALL instruction (software interrupt)
9611       nr=0;
9612     }
9613     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9614     {
9615       // ERET instruction (return from interrupt)
9616       nr=0;
9617     }
9618     else // Non-branch
9619     {
9620       if(i<slen-1) {
9621         for(hr=0;hr<HOST_REGS;hr++) {
9622           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9623           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9624           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9625           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9626         }
9627       }
9628     }
9629     for(hr=0;hr<HOST_REGS;hr++)
9630     {
9631       // Overwritten registers are not needed
9632       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9633       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9634       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9635       // Source registers are needed
9636       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9637       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9638       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9639       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9640       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9641       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9642       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9643       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9644       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9645         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9646         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9647       }
9648       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9649         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9650         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9651       }
9652       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
9653         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9654         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9655       }
9656       // Don't store a register immediately after writing it,
9657       // may prevent dual-issue.
9658       // But do so if this is a branch target, otherwise we
9659       // might have to load the register before the branch.
9660       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9661         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9662            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9663           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9664           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9665         }
9666         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9667            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9668           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9669           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9670         }
9671       }
9672     }
9673     // Cycle count is needed at branches.  Assume it is needed at the target too.
9674     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9675       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9676       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9677     }
9678     // Save it
9679     needed_reg[i]=nr;
9680     
9681     // Deallocate unneeded registers
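    // Anything not flagged in nr can be released from the register maps,
    // except registers still referenced by this instruction (or its delay
    // slot when this is a branch) and the special temporaries checked below.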
9682     for(hr=0;hr<HOST_REGS;hr++)
9683     {
9684       if(!((nr>>hr)&1)) {
9685         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9686         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9687            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9688            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9689         {
9690           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9691           {
9692             if(likely[i]) {
9693               regs[i].regmap[hr]=-1;
9694               regs[i].isconst&=~(1<<hr);
9695               if(i<slen-2) regmap_pre[i+2][hr]=-1;
9696             }
9697           }
9698         }
9699         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9700         {
9701           int d1=0,d2=0,map=0,temp=0;
9702           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9703           {
9704             d1=dep1[i+1];
9705             d2=dep2[i+1];
9706           }
9707           if(using_tlb) {
9708             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
9709                itype[i+1]==STORE || itype[i+1]==STORELR ||
9710                itype[i+1]==C1LS || itype[i+1]==C2LS)
9711             map=TLREG;
9712           } else
9713           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
9714              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9715             map=INVCP;
9716           }
9717           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9718              itype[i+1]==C1LS || itype[i+1]==C2LS)
9719             temp=FTEMP;
9720           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9721              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9722              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9723              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9724              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9725              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9726              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9727              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9728              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9729              regs[i].regmap[hr]!=map )
9730           {
9731             regs[i].regmap[hr]=-1;
9732             regs[i].isconst&=~(1<<hr);
9733             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9734                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9735                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9736                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9737                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9738                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9739                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9740                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9741                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9742                branch_regs[i].regmap[hr]!=map)
9743             {
9744               branch_regs[i].regmap[hr]=-1;
9745               branch_regs[i].regmap_entry[hr]=-1;
9746               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9747               {
9748                 if(!likely[i]&&i<slen-2) {
9749                   regmap_pre[i+2][hr]=-1;
9750                 }
9751               }
9752             }
9753           }
9754         }
9755         else
9756         {
9757           // Non-branch
9758           if(i>0)
9759           {
9760             int d1=0,d2=0,map=-1,temp=-1;
9761             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9762             {
9763               d1=dep1[i];
9764               d2=dep2[i];
9765             }
9766             if(using_tlb) {
9767               if(itype[i]==LOAD || itype[i]==LOADLR ||
9768                  itype[i]==STORE || itype[i]==STORELR ||
9769                  itype[i]==C1LS || itype[i]==C2LS)
9770               map=TLREG;
9771             } else if(itype[i]==STORE || itype[i]==STORELR ||
9772                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9773               map=INVCP;
9774             }
9775             if(itype[i]==LOADLR || itype[i]==STORELR ||
9776                itype[i]==C1LS || itype[i]==C2LS)
9777               temp=FTEMP;
9778             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9779                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9780                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9781                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9782                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9783                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9784             {
9785               if(i<slen-1&&!is_ds[i]) {
9786                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9787                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9788                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9789                 {
9790                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9791                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9792                 }
9793                 regmap_pre[i+1][hr]=-1;
9794                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9795               }
9796               regs[i].regmap[hr]=-1;
9797               regs[i].isconst&=~(1<<hr);
9798             }
9799           }
9800         }
9801       }
9802     }
9803   }
9804   
9805   /* Pass 5 - Pre-allocate registers */
9806   
9807   // If a register is allocated during a loop, try to allocate it for the
9808   // entire loop, if possible.  This avoids loading/storing registers
9809   // inside of the loop.
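  // For example, if every iteration of a loop keeps the same guest register
  // in the same host register, the mapping is extended across the whole loop
  // body so the value is loaded once before the loop rather than on each pass.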
9810
9811   signed char f_regmap[HOST_REGS];
9812   clear_all_regs(f_regmap);
9813   for(i=0;i<slen-1;i++)
9814   {
9815     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9816     {
9817       if(ba[i]>=start && ba[i]<(start+i*4)) 
9818       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9819       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9820       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9821       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9822       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9823       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9824       {
9825         int t=(ba[i]-start)>>2;
9826         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9827         if(t<2||(itype[t-2]!=UJUMP)) // call/ret assumes no registers allocated
9828         for(hr=0;hr<HOST_REGS;hr++)
9829         {
9830           if(regs[i].regmap[hr]>64) {
9831             if(!((regs[i].dirty>>hr)&1))
9832               f_regmap[hr]=regs[i].regmap[hr];
9833             else f_regmap[hr]=-1;
9834           }
9835           else if(regs[i].regmap[hr]>=0) {
9836             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9837               // dealloc old register
9838               int n;
9839               for(n=0;n<HOST_REGS;n++)
9840               {
9841                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9842               }
9843               // and alloc new one
9844               f_regmap[hr]=regs[i].regmap[hr];
9845             }
9846           }
9847           if(branch_regs[i].regmap[hr]>64) {
9848             if(!((branch_regs[i].dirty>>hr)&1))
9849               f_regmap[hr]=branch_regs[i].regmap[hr];
9850             else f_regmap[hr]=-1;
9851           }
9852           else if(branch_regs[i].regmap[hr]>=0) {
9853             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9854               // dealloc old register
9855               int n;
9856               for(n=0;n<HOST_REGS;n++)
9857               {
9858                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9859               }
9860               // and alloc new one
9861               f_regmap[hr]=branch_regs[i].regmap[hr];
9862             }
9863           }
9864           if(ooo[i]) {
9865             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) 
9866               f_regmap[hr]=branch_regs[i].regmap[hr];
9867           }else{
9868             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) 
9869               f_regmap[hr]=branch_regs[i].regmap[hr];
9870           }
9871           // Avoid dirty->clean transition
9872           #ifdef DESTRUCTIVE_WRITEBACK
9873           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9874           #endif
9875           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9876           // case above, but it's always a good idea.  We can't hoist the
9877           // load if the register was already allocated, so there's no point
9878           // wasting time analyzing most of these cases.  It only "succeeds"
9879           // when the mapping was different and the load can be replaced with
9880           // a mov, which is of negligible benefit.  So such cases are
9881           // skipped below.
9882           if(f_regmap[hr]>0) {
9883             if(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0) {
9884               int r=f_regmap[hr];
9885               for(j=t;j<=i;j++)
9886               {
9887                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9888                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9889                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9890                 if(r>63) {
9891                   // NB This can exclude the case where the upper-half
9892                   // register is lower numbered than the lower-half
9893                   // register.  Not sure if it's worth fixing...
9894                   if(get_reg(regs[j].regmap,r&63)<0) break;
9895                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9896                   if(regs[j].is32&(1LL<<(r&63))) break;
9897                 }
9898                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9899                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9900                   int k;
9901                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9902                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9903                     if(r>63) {
9904                       if(get_reg(regs[i].regmap,r&63)<0) break;
9905                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9906                     }
9907                     k=i;
9908                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9909                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9910                         //printf("no free regs for store %x\n",start+(k-1)*4);
9911                         break;
9912                       }
9913                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9914                         //printf("no-match due to different register\n");
9915                         break;
9916                       }
9917                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9918                         //printf("no-match due to branch\n");
9919                         break;
9920                       }
9921                       // call/ret fast path assumes no registers allocated
9922                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)) {
9923                         break;
9924                       }
9925                       if(r>63) {
9926                         // NB This can exclude the case where the upper-half
9927                         // register is lower numbered than the lower-half
9928                         // register.  Not sure if it's worth fixing...
9929                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9930                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9931                       }
9932                       k--;
9933                     }
9934                     if(i<slen-1) {
9935                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9936                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9937                         //printf("bad match after branch\n");
9938                         break;
9939                       }
9940                     }
9941                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9942                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9943                       while(k<i) {
9944                         regs[k].regmap_entry[hr]=f_regmap[hr];
9945                         regs[k].regmap[hr]=f_regmap[hr];
9946                         regmap_pre[k+1][hr]=f_regmap[hr];
9947                         regs[k].wasdirty&=~(1<<hr);
9948                         regs[k].dirty&=~(1<<hr);
9949                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9950                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9951                         regs[k].wasconst&=~(1<<hr);
9952                         regs[k].isconst&=~(1<<hr);
9953                         k++;
9954                       }
9955                     }
9956                     else {
9957                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9958                       break;
9959                     }
9960                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9961                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9962                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9963                       regs[i].regmap_entry[hr]=f_regmap[hr];
9964                       regs[i].regmap[hr]=f_regmap[hr];
9965                       regs[i].wasdirty&=~(1<<hr);
9966                       regs[i].dirty&=~(1<<hr);
9967                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9968                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9969                       regs[i].wasconst&=~(1<<hr);
9970                       regs[i].isconst&=~(1<<hr);
9971                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9972                       branch_regs[i].wasdirty&=~(1<<hr);
9973                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9974                       branch_regs[i].regmap[hr]=f_regmap[hr];
9975                       branch_regs[i].dirty&=~(1<<hr);
9976                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9977                       branch_regs[i].wasconst&=~(1<<hr);
9978                       branch_regs[i].isconst&=~(1<<hr);
9979                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9980                         regmap_pre[i+2][hr]=f_regmap[hr];
9981                         regs[i+2].wasdirty&=~(1<<hr);
9982                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9983                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9984                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9985                       }
9986                     }
9987                   }
9988                   for(k=t;k<j;k++) {
9989                     // Alloc register clean at beginning of loop,
9990                     // but may dirty it in pass 6
9991                     regs[k].regmap_entry[hr]=f_regmap[hr];
9992                     regs[k].regmap[hr]=f_regmap[hr];
9993                     regs[k].dirty&=~(1<<hr);
9994                     regs[k].wasconst&=~(1<<hr);
9995                     regs[k].isconst&=~(1<<hr);
9996                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
9997                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
9998                       branch_regs[k].regmap[hr]=f_regmap[hr];
9999                       branch_regs[k].dirty&=~(1<<hr);
10000                       branch_regs[k].wasconst&=~(1<<hr);
10001                       branch_regs[k].isconst&=~(1<<hr);
10002                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
10003                         regmap_pre[k+2][hr]=f_regmap[hr];
10004                         regs[k+2].wasdirty&=~(1<<hr);
10005                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
10006                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
10007                       }
10008                     }
10009                     else
10010                     {
10011                       regmap_pre[k+1][hr]=f_regmap[hr];
10012                       regs[k+1].wasdirty&=~(1<<hr);
10013                     }
10014                   }
10015                   if(regs[j].regmap[hr]==f_regmap[hr])
10016                     regs[j].regmap_entry[hr]=f_regmap[hr];
10017                   break;
10018                 }
10019                 if(j==i) break;
10020                 if(regs[j].regmap[hr]>=0)
10021                   break;
10022                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
10023                   //printf("no-match due to different register\n");
10024                   break;
10025                 }
10026                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
10027                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
10028                   break;
10029                 }
10030                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10031                 {
10032                   // Stop on unconditional branch
10033                   break;
10034                 }
10035                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
10036                 {
10037                   if(ooo[j]) {
10038                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) 
10039                       break;
10040                   }else{
10041                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) 
10042                       break;
10043                   }
10044                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
10045                     //printf("no-match due to different register (branch)\n");
10046                     break;
10047                   }
10048                 }
10049                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10050                   //printf("No free regs for store %x\n",start+j*4);
10051                   break;
10052                 }
10053                 if(f_regmap[hr]>=64) {
10054                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
10055                     break;
10056                   }
10057                   else
10058                   {
10059                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
10060                       break;
10061                     }
10062                   }
10063                 }
10064               }
10065             }
10066           }
10067         }
10068       }
10069     }else{
10070       int count=0;
10071       for(hr=0;hr<HOST_REGS;hr++)
10072       {
10073         if(hr!=EXCLUDE_REG) {
10074           if(regs[i].regmap[hr]>64) {
10075             if(!((regs[i].dirty>>hr)&1))
10076               f_regmap[hr]=regs[i].regmap[hr];
10077           }
10078           else if(regs[i].regmap[hr]>=0) {
10079             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10080               // dealloc old register
10081               int n;
10082               for(n=0;n<HOST_REGS;n++)
10083               {
10084                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10085               }
10086               // and alloc new one
10087               f_regmap[hr]=regs[i].regmap[hr];
10088             }
10089           }
10090           else if(regs[i].regmap[hr]<0) count++;
10091         }
10092       }
10093       // Try to restore cycle count at branch targets
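      // If CCREG is live again in a following instruction, extend its
      // allocation back to this branch target so the cycle counter does not
      // have to be reloaded here.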
10094       if(bt[i]) {
10095         for(j=i;j<slen-1;j++) {
10096           if(regs[j].regmap[HOST_CCREG]!=-1) break;
10097           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10098             //printf("no free regs for store %x\n",start+j*4);
10099             break;
10100           }
10101         }
10102         if(regs[j].regmap[HOST_CCREG]==CCREG) {
10103           int k=i;
10104           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
10105           while(k<j) {
10106             regs[k].regmap_entry[HOST_CCREG]=CCREG;
10107             regs[k].regmap[HOST_CCREG]=CCREG;
10108             regmap_pre[k+1][HOST_CCREG]=CCREG;
10109             regs[k+1].wasdirty|=1<<HOST_CCREG;
10110             regs[k].dirty|=1<<HOST_CCREG;
10111             regs[k].wasconst&=~(1<<HOST_CCREG);
10112             regs[k].isconst&=~(1<<HOST_CCREG);
10113             k++;
10114           }
10115           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
10116         }
10117         // Work backwards from the branch target
10118         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
10119         {
10120           //printf("Extend backwards\n");
10121           int k;
10122           k=i;
10123           while(regs[k-1].regmap[HOST_CCREG]==-1) {
10124             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10125               //printf("no free regs for store %x\n",start+(k-1)*4);
10126               break;
10127             }
10128             k--;
10129           }
10130           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
10131             //printf("Extend CC, %x ->\n",start+k*4);
10132             while(k<=i) {
10133               regs[k].regmap_entry[HOST_CCREG]=CCREG;
10134               regs[k].regmap[HOST_CCREG]=CCREG;
10135               regmap_pre[k+1][HOST_CCREG]=CCREG;
10136               regs[k+1].wasdirty|=1<<HOST_CCREG;
10137               regs[k].dirty|=1<<HOST_CCREG;
10138               regs[k].wasconst&=~(1<<HOST_CCREG);
10139               regs[k].isconst&=~(1<<HOST_CCREG);
10140               k++;
10141             }
10142           }
10143           else {
10144             //printf("Fail Extend CC, %x ->\n",start+k*4);
10145           }
10146         }
10147       }
10148       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
10149          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
10150          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
10151          itype[i]!=FCONV&&itype[i]!=FCOMP)
10152       {
10153         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
10154       }
10155     }
10156   }
10157   
10158   // This allocates registers (if possible) one instruction prior
10159   // to use, which can avoid a load-use penalty on certain CPUs.
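  // Roughly: if the next instruction will load one of its source guest
  // registers into a host register that is still free here, claim that host
  // register now so the load is issued one instruction early, e.g.
  //   i  : addu  v0,v1,a1     <- a0 is fetched into its host reg here
  //   i+1: lw    t0,0(a0)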
10160   for(i=0;i<slen-1;i++)
10161   {
10162     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
10163     {
10164       if(!bt[i+1])
10165       {
10166         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
10167            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
10168         {
10169           if(rs1[i+1]) {
10170             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
10171             {
10172               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10173               {
10174                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10175                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10176                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10177                 regs[i].isconst&=~(1<<hr);
10178                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10179                 constmap[i][hr]=constmap[i+1][hr];
10180                 regs[i+1].wasdirty&=~(1<<hr);
10181                 regs[i].dirty&=~(1<<hr);
10182               }
10183             }
10184           }
10185           if(rs2[i+1]) {
10186             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
10187             {
10188               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10189               {
10190                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10191                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10192                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10193                 regs[i].isconst&=~(1<<hr);
10194                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10195                 constmap[i][hr]=constmap[i+1][hr];
10196                 regs[i+1].wasdirty&=~(1<<hr);
10197                 regs[i].dirty&=~(1<<hr);
10198               }
10199             }
10200           }
10201           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10202             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10203             {
10204               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10205               {
10206                 regs[i].regmap[hr]=rs1[i+1];
10207                 regmap_pre[i+1][hr]=rs1[i+1];
10208                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10209                 regs[i].isconst&=~(1<<hr);
10210                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10211                 constmap[i][hr]=constmap[i+1][hr];
10212                 regs[i+1].wasdirty&=~(1<<hr);
10213                 regs[i].dirty&=~(1<<hr);
10214               }
10215             }
10216           }
10217           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10218             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10219             {
10220               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10221               {
10222                 regs[i].regmap[hr]=rs1[i+1];
10223                 regmap_pre[i+1][hr]=rs1[i+1];
10224                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10225                 regs[i].isconst&=~(1<<hr);
10226                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10227                 constmap[i][hr]=constmap[i+1][hr];
10228                 regs[i+1].wasdirty&=~(1<<hr);
10229                 regs[i].dirty&=~(1<<hr);
10230               }
10231             }
10232           }
10233           #ifndef HOST_IMM_ADDR32
10234           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
10235             hr=get_reg(regs[i+1].regmap,TLREG);
10236             if(hr>=0) {
10237               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
10238               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
10239                 int nr;
10240                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10241                 {
10242                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
10243                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
10244                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
10245                   regs[i].isconst&=~(1<<hr);
10246                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10247                   constmap[i][hr]=constmap[i+1][hr];
10248                   regs[i+1].wasdirty&=~(1<<hr);
10249                   regs[i].dirty&=~(1<<hr);
10250                 }
10251                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10252                 {
10253                   // move it to another register
10254                   regs[i+1].regmap[hr]=-1;
10255                   regmap_pre[i+2][hr]=-1;
10256                   regs[i+1].regmap[nr]=TLREG;
10257                   regmap_pre[i+2][nr]=TLREG;
10258                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
10259                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
10260                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
10261                   regs[i].isconst&=~(1<<nr);
10262                   regs[i+1].isconst&=~(1<<nr);
10263                   regs[i].dirty&=~(1<<nr);
10264                   regs[i+1].wasdirty&=~(1<<nr);
10265                   regs[i+1].dirty&=~(1<<nr);
10266                   regs[i+2].wasdirty&=~(1<<nr);
10267                 }
10268               }
10269             }
10270           }
10271           #endif
10272           if(itype[i+1]==STORE||itype[i+1]==STORELR
10273              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
10274             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10275               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
10276               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10277               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
10278               assert(hr>=0);
10279               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10280               {
10281                 regs[i].regmap[hr]=rs1[i+1];
10282                 regmap_pre[i+1][hr]=rs1[i+1];
10283                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10284                 regs[i].isconst&=~(1<<hr);
10285                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10286                 constmap[i][hr]=constmap[i+1][hr];
10287                 regs[i+1].wasdirty&=~(1<<hr);
10288                 regs[i].dirty&=~(1<<hr);
10289               }
10290             }
10291           }
10292           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
10293             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10294               int nr;
10295               hr=get_reg(regs[i+1].regmap,FTEMP);
10296               assert(hr>=0);
10297               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10298               {
10299                 regs[i].regmap[hr]=rs1[i+1];
10300                 regmap_pre[i+1][hr]=rs1[i+1];
10301                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10302                 regs[i].isconst&=~(1<<hr);
10303                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10304                 constmap[i][hr]=constmap[i+1][hr];
10305                 regs[i+1].wasdirty&=~(1<<hr);
10306                 regs[i].dirty&=~(1<<hr);
10307               }
10308               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10309               {
10310                 // move it to another register
10311                 regs[i+1].regmap[hr]=-1;
10312                 regmap_pre[i+2][hr]=-1;
10313                 regs[i+1].regmap[nr]=FTEMP;
10314                 regmap_pre[i+2][nr]=FTEMP;
10315                 regs[i].regmap[nr]=rs1[i+1];
10316                 regmap_pre[i+1][nr]=rs1[i+1];
10317                 regs[i+1].regmap_entry[nr]=rs1[i+1];
10318                 regs[i].isconst&=~(1<<nr);
10319                 regs[i+1].isconst&=~(1<<nr);
10320                 regs[i].dirty&=~(1<<nr);
10321                 regs[i+1].wasdirty&=~(1<<nr);
10322                 regs[i+1].dirty&=~(1<<nr);
10323                 regs[i+2].wasdirty&=~(1<<nr);
10324               }
10325             }
10326           }
10327           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
10328             if(itype[i+1]==LOAD) 
10329               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
10330             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
10331               hr=get_reg(regs[i+1].regmap,FTEMP);
10332             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
10333               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
10334               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10335             }
10336             if(hr>=0&&regs[i].regmap[hr]<0) {
10337               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
10338               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
10339                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
10340                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
10341                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
10342                 regs[i].isconst&=~(1<<hr);
10343                 regs[i+1].wasdirty&=~(1<<hr);
10344                 regs[i].dirty&=~(1<<hr);
10345               }
10346             }
10347           }
10348         }
10349       }
10350     }
10351   }
10352   
10353   /* Pass 6 - Optimize clean/dirty state */
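  // Roughly: work out where dirty (modified) registers actually need to be
  // written back to the guest register file, so write-backs are not emitted
  // after every instruction; see clean_registers().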
10354   clean_registers(0,slen-1,1);
10355   
10356   /* Pass 7 - Identify 32-bit registers */
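  // Track which guest registers must hold properly sign-extended 32-bit
  // values (requires_32bit), walking the block backwards much like pass 4.
  // Under FORCE32 (32-bit-only targets such as the PSX) this pass is skipped.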
10357 #ifndef FORCE32
10358   provisional_r32();
10359
10360   u_int r32=0;
10361   
10362   for (i=slen-1;i>=0;i--)
10363   {
10364     int hr;
10365     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10366     {
10367       if(ba[i]<start || ba[i]>=(start+slen*4))
10368       {
10369         // Branch out of this block, don't need anything
10370         r32=0;
10371       }
10372       else
10373       {
10374         // Internal branch
10375         // Need whatever matches the target
10376         // (and doesn't get overwritten by the delay slot instruction)
10377         r32=0;
10378         int t=(ba[i]-start)>>2;
10379         if(ba[i]>start+i*4) {
10380           // Forward branch
10381           if(!(requires_32bit[t]&~regs[i].was32))
10382             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10383         }else{
10384           // Backward branch
10385           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
10386           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10387           if(!(pr32[t]&~regs[i].was32))
10388             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10389         }
10390       }
10391       // Conditional branch may need registers for following instructions
10392       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10393       {
10394         if(i<slen-2) {
10395           r32|=requires_32bit[i+2];
10396           r32&=regs[i].was32;
10397           // Mark this address as a branch target since it may be called
10398           // upon return from interrupt
10399           bt[i+2]=1;
10400         }
10401       }
10402       // Merge in delay slot
10403       if(!likely[i]) {
10404         // These are overwritten unless the branch is "likely"
10405         // and the delay slot is nullified if not taken
10406         r32&=~(1LL<<rt1[i+1]);
10407         r32&=~(1LL<<rt2[i+1]);
10408       }
10409       // Assume these are needed (delay slot)
10410       if(us1[i+1]>0)
10411       {
10412         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
10413       }
10414       if(us2[i+1]>0)
10415       {
10416         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
10417       }
10418       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
10419       {
10420         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
10421       }
10422       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
10423       {
10424         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
10425       }
10426     }
10427     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
10428     {
10429       // SYSCALL instruction (software interrupt)
10430       r32=0;
10431     }
10432     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
10433     {
10434       // ERET instruction (return from interrupt)
10435       r32=0;
10436     }
10437     // Check 32 bits
10438     r32&=~(1LL<<rt1[i]);
10439     r32&=~(1LL<<rt2[i]);
10440     if(us1[i]>0)
10441     {
10442       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
10443     }
10444     if(us2[i]>0)
10445     {
10446       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
10447     }
10448     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
10449     {
10450       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
10451     }
10452     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
10453     {
10454       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
10455     }
10456     requires_32bit[i]=r32;
10457     
10458     // Dirty registers which are 32-bit require 32-bit input,
10459     // as they will be written back as 32-bit values
10460     for(hr=0;hr<HOST_REGS;hr++)
10461     {
10462       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
10463         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
10464           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
10465           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
10466         }
10467       }
10468     }
10469     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
10470   }
10471 #endif
10472
10473   if(itype[slen-1]==SPAN) {
10474     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
10475   }
10476   
10477   /* Debug/disassembly */
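  // Only produced when assem_debug is routed to printf: for each instruction
  // this dumps the unneeded/32-bit register sets, the pre/entry/exit host
  // register maps, the dirty bits and any registers known to hold constants.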
10478   if((void*)assem_debug==(void*)printf) 
10479   for(i=0;i<slen;i++)
10480   {
10481     printf("U:");
10482     int r;
10483     for(r=1;r<=CCREG;r++) {
10484       if((unneeded_reg[i]>>r)&1) {
10485         if(r==HIREG) printf(" HI");
10486         else if(r==LOREG) printf(" LO");
10487         else printf(" r%d",r);
10488       }
10489     }
10490 #ifndef FORCE32
10491     printf(" UU:");
10492     for(r=1;r<=CCREG;r++) {
10493       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
10494         if(r==HIREG) printf(" HI");
10495         else if(r==LOREG) printf(" LO");
10496         else printf(" r%d",r);
10497       }
10498     }
10499     printf(" 32:");
10500     for(r=0;r<=CCREG;r++) {
10501       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10502       if((regs[i].was32>>r)&1) {
10503         if(r==CCREG) printf(" CC");
10504         else if(r==HIREG) printf(" HI");
10505         else if(r==LOREG) printf(" LO");
10506         else printf(" r%d",r);
10507       }
10508     }
10509 #endif
10510     printf("\n");
10511     #if defined(__i386__) || defined(__x86_64__)
10512     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
10513     #endif
10514     #ifdef __arm__
10515     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
10516     #endif
10517     printf("needs: ");
10518     if(needed_reg[i]&1) printf("eax ");
10519     if((needed_reg[i]>>1)&1) printf("ecx ");
10520     if((needed_reg[i]>>2)&1) printf("edx ");
10521     if((needed_reg[i]>>3)&1) printf("ebx ");
10522     if((needed_reg[i]>>5)&1) printf("ebp ");
10523     if((needed_reg[i]>>6)&1) printf("esi ");
10524     if((needed_reg[i]>>7)&1) printf("edi ");
10525     printf("r:");
10526     for(r=0;r<=CCREG;r++) {
10527       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10528       if((requires_32bit[i]>>r)&1) {
10529         if(r==CCREG) printf(" CC");
10530         else if(r==HIREG) printf(" HI");
10531         else if(r==LOREG) printf(" LO");
10532         else printf(" r%d",r);
10533       }
10534     }
10535     printf("\n");
10536     /*printf("pr:");
10537     for(r=0;r<=CCREG;r++) {
10538       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10539       if((pr32[i]>>r)&1) {
10540         if(r==CCREG) printf(" CC");
10541         else if(r==HIREG) printf(" HI");
10542         else if(r==LOREG) printf(" LO");
10543         else printf(" r%d",r);
10544       }
10545     }
10546     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
10547     printf("\n");*/
10548     #if defined(__i386__) || defined(__x86_64__)
10549     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
10550     printf("dirty: ");
10551     if(regs[i].wasdirty&1) printf("eax ");
10552     if((regs[i].wasdirty>>1)&1) printf("ecx ");
10553     if((regs[i].wasdirty>>2)&1) printf("edx ");
10554     if((regs[i].wasdirty>>3)&1) printf("ebx ");
10555     if((regs[i].wasdirty>>5)&1) printf("ebp ");
10556     if((regs[i].wasdirty>>6)&1) printf("esi ");
10557     if((regs[i].wasdirty>>7)&1) printf("edi ");
10558     #endif
10559     #ifdef __arm__
10560     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
10561     printf("dirty: ");
10562     if(regs[i].wasdirty&1) printf("r0 ");
10563     if((regs[i].wasdirty>>1)&1) printf("r1 ");
10564     if((regs[i].wasdirty>>2)&1) printf("r2 ");
10565     if((regs[i].wasdirty>>3)&1) printf("r3 ");
10566     if((regs[i].wasdirty>>4)&1) printf("r4 ");
10567     if((regs[i].wasdirty>>5)&1) printf("r5 ");
10568     if((regs[i].wasdirty>>6)&1) printf("r6 ");
10569     if((regs[i].wasdirty>>7)&1) printf("r7 ");
10570     if((regs[i].wasdirty>>8)&1) printf("r8 ");
10571     if((regs[i].wasdirty>>9)&1) printf("r9 ");
10572     if((regs[i].wasdirty>>10)&1) printf("r10 ");
10573     if((regs[i].wasdirty>>12)&1) printf("r12 ");
10574     #endif
10575     printf("\n");
10576     disassemble_inst(i);
10577     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
10578     #if defined(__i386__) || defined(__x86_64__)
10579     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
10580     if(regs[i].dirty&1) printf("eax ");
10581     if((regs[i].dirty>>1)&1) printf("ecx ");
10582     if((regs[i].dirty>>2)&1) printf("edx ");
10583     if((regs[i].dirty>>3)&1) printf("ebx ");
10584     if((regs[i].dirty>>5)&1) printf("ebp ");
10585     if((regs[i].dirty>>6)&1) printf("esi ");
10586     if((regs[i].dirty>>7)&1) printf("edi ");
10587     #endif
10588     #ifdef __arm__
10589     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
10590     if(regs[i].dirty&1) printf("r0 ");
10591     if((regs[i].dirty>>1)&1) printf("r1 ");
10592     if((regs[i].dirty>>2)&1) printf("r2 ");
10593     if((regs[i].dirty>>3)&1) printf("r3 ");
10594     if((regs[i].dirty>>4)&1) printf("r4 ");
10595     if((regs[i].dirty>>5)&1) printf("r5 ");
10596     if((regs[i].dirty>>6)&1) printf("r6 ");
10597     if((regs[i].dirty>>7)&1) printf("r7 ");
10598     if((regs[i].dirty>>8)&1) printf("r8 ");
10599     if((regs[i].dirty>>9)&1) printf("r9 ");
10600     if((regs[i].dirty>>10)&1) printf("r10 ");
10601     if((regs[i].dirty>>12)&1) printf("r12 ");
10602     #endif
10603     printf("\n");
10604     if(regs[i].isconst) {
10605       printf("constants: ");
10606       #if defined(__i386__) || defined(__x86_64__)
10607       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
10608       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
10609       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
10610       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
10611       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
10612       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
10613       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
10614       #endif
10615       #ifdef __arm__
10616       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
10617       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
10618       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
10619       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
10620       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
10621       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
10622       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
10623       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
10624       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
10625       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
10626       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
10627       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
10628       #endif
10629       printf("\n");
10630     }
10631 #ifndef FORCE32
10632     printf(" 32:");
10633     for(r=0;r<=CCREG;r++) {
10634       if((regs[i].is32>>r)&1) {
10635         if(r==CCREG) printf(" CC");
10636         else if(r==HIREG) printf(" HI");
10637         else if(r==LOREG) printf(" LO");
10638         else printf(" r%d",r);
10639       }
10640     }
10641     printf("\n");
10642 #endif
10643     /*printf(" p32:");
10644     for(r=0;r<=CCREG;r++) {
10645       if((p32[i]>>r)&1) {
10646         if(r==CCREG) printf(" CC");
10647         else if(r==HIREG) printf(" HI");
10648         else if(r==LOREG) printf(" LO");
10649         else printf(" r%d",r);
10650       }
10651     }
10652     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
10653     else printf("\n");*/
10654     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10655       #if defined(__i386__) || defined(__x86_64__)
10656       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10657       if(branch_regs[i].dirty&1) printf("eax ");
10658       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10659       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10660       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10661       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10662       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10663       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10664       #endif
10665       #ifdef __arm__
10666       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10667       if(branch_regs[i].dirty&1) printf("r0 ");
10668       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10669       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10670       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10671       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10672       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10673       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10674       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10675       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10676       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10677       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10678       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10679       #endif
10680 #ifndef FORCE32
10681       printf(" 32:");
10682       for(r=0;r<=CCREG;r++) {
10683         if((branch_regs[i].is32>>r)&1) {
10684           if(r==CCREG) printf(" CC");
10685           else if(r==HIREG) printf(" HI");
10686           else if(r==LOREG) printf(" LO");
10687           else printf(" r%d",r);
10688         }
10689       }
10690       printf("\n");
10691 #endif
10692     }
10693   }
10694
10695   /* Pass 8 - Assembly */
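        // Walk the decoded instructions in order: reconcile the cached host
        // register state with each instruction's allocation, then dispatch to
        // the per-type assembler, which emits native code at 'out'.  The host
        // address of every emitted instruction is recorded in instr_addr[] so
        // branches and entry points can be linked up in the later passes.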
10696   linkcount=0;stubcount=0;
10697   ds=0;is_delayslot=0;
10698   cop1_usable=0;
10699   uint64_t is32_pre=0;
10700   u_int dirty_pre=0;
10701   u_int beginning=(u_int)out;
10702   if((u_int)addr&1) {
10703     ds=1;
10704     pagespan_ds();
10705   }
10706   u_int instr_addr0_override=0;
10707
10708 #ifdef PCSX
10709   if (start == 0x80030000) {
10710     // hack for the fast boot (BIOS skip) option: on entry to the block at 0x80030000 (BIOS shell entry), set pcaddr to this address and leave the dynarec unless pcaddr already pointed here
10711     instr_addr0_override=(u_int)out;
10712     emit_movimm(start,0);
10713     emit_readword((int)&pcaddr,1);
10714     emit_writeword(0,(int)&pcaddr);
10715     emit_cmp(0,1);
10716     emit_jne((int)new_dyna_leave);
10717   }
10718 #endif
10719   for(i=0;i<slen;i++)
10720   {
10721     //if(ds) printf("ds: ");
10722     if((void*)assem_debug==(void*)printf) disassemble_inst(i);
10723     if(ds) {
10724       ds=0; // Skip delay slot
10725       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10726       instr_addr[i]=0;
10727     } else {
10728       #ifndef DESTRUCTIVE_WRITEBACK
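            // Lazy (non-destructive) writeback: bring the cached register state
            // in line with what this instruction expects, based on the dirty and
            // 32-bit status carried over from the previous instruction
            // (is32_pre/dirty_pre); wb_sx/wb_valid implement the exact rules.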
10729       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10730       {
10731         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
10732               unneeded_reg[i],unneeded_reg_upper[i]);
10733         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10734               unneeded_reg[i],unneeded_reg_upper[i]);
10735       }
10736       is32_pre=regs[i].is32;
10737       dirty_pre=regs[i].dirty;
10738       #endif
10739       // write back
10740       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10741       {
10742         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10743                       unneeded_reg[i],unneeded_reg_upper[i]);
10744         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10745       }
10746       // branch target entry point
10747       instr_addr[i]=(u_int)out;
10748       assem_debug("<->\n");
10749       // load regs
10750       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10751         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10752       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10753       address_generation(i,&regs[i],regs[i].regmap_entry);
10754       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10755       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10756       {
10757         // Load the delay slot registers if necessary
10758         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10759           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10760         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10761           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10762         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) // SWC1/SDC1/SWC2/SDC2 stores
10763           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10764       }
10765       else if(i+1<slen)
10766       {
10767         // Preload registers for following instruction
10768         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10769           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10770             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10771         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10772           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10773             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10774       }
10775       // TODO: if(is_ooo(i)) address_generation(i+1);
10776       if(itype[i]==CJUMP||itype[i]==FJUMP)
10777         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10778       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a) // SWC1/SDC1/SWC2/SDC2 stores
10779         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10780       if(bt[i]) cop1_usable=0;
10781       // assemble: emit host code for this instruction via its type-specific handler
10782       switch(itype[i]) {
10783         case ALU:
10784           alu_assemble(i,&regs[i]);break;
10785         case IMM16:
10786           imm16_assemble(i,&regs[i]);break;
10787         case SHIFT:
10788           shift_assemble(i,&regs[i]);break;
10789         case SHIFTIMM:
10790           shiftimm_assemble(i,&regs[i]);break;
10791         case LOAD:
10792           load_assemble(i,&regs[i]);break;
10793         case LOADLR:
10794           loadlr_assemble(i,&regs[i]);break;
10795         case STORE:
10796           store_assemble(i,&regs[i]);break;
10797         case STORELR:
10798           storelr_assemble(i,&regs[i]);break;
10799         case COP0:
10800           cop0_assemble(i,&regs[i]);break;
10801         case COP1:
10802           cop1_assemble(i,&regs[i]);break;
10803         case C1LS:
10804           c1ls_assemble(i,&regs[i]);break;
10805         case COP2:
10806           cop2_assemble(i,&regs[i]);break;
10807         case C2LS:
10808           c2ls_assemble(i,&regs[i]);break;
10809         case C2OP:
10810           c2op_assemble(i,&regs[i]);break;
10811         case FCONV:
10812           fconv_assemble(i,&regs[i]);break;
10813         case FLOAT:
10814           float_assemble(i,&regs[i]);break;
10815         case FCOMP:
10816           fcomp_assemble(i,&regs[i]);break;
10817         case MULTDIV:
10818           multdiv_assemble(i,&regs[i]);break;
10819         case MOV:
10820           mov_assemble(i,&regs[i]);break;
10821         case SYSCALL:
10822           syscall_assemble(i,&regs[i]);break;
10823         case HLECALL:
10824           hlecall_assemble(i,&regs[i]);break;
10825         case INTCALL:
10826           intcall_assemble(i,&regs[i]);break;
10827         case UJUMP:
10828           ujump_assemble(i,&regs[i]);ds=1;break;
10829         case RJUMP:
10830           rjump_assemble(i,&regs[i]);ds=1;break;
10831         case CJUMP:
10832           cjump_assemble(i,&regs[i]);ds=1;break;
10833         case SJUMP:
10834           sjump_assemble(i,&regs[i]);ds=1;break;
10835         case FJUMP:
10836           fjump_assemble(i,&regs[i]);ds=1;break;
10837         case SPAN:
10838           pagespan_assemble(i,&regs[i]);break;
10839       }
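            // Flush the constant (literal) pool used by the ARM backend: after
            // an unconditional jump it can be emitted inline, otherwise only
            // emit it once it has grown large, branching over the pool data.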
10840       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10841         literal_pool(1024);
10842       else
10843         literal_pool_jumpover(256);
10844     }
10845   }
10846   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10847   // If the block did not end with an unconditional branch,
10848   // add a jump to the next instruction.
10849   if(i>1) {
10850     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10851       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10852       assert(i==slen);
10853       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10854         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10855         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10856           emit_loadreg(CCREG,HOST_CCREG);
10857         emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10858       }
10859       else if(!likely[i-2])
10860       {
10861         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10862         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10863       }
10864       else
10865       {
10866         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10867         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10868       }
10869       add_to_linker((int)out,start+i*4,0);
10870       emit_jmp(0);
10871     }
10872   }
10873   else
10874   {
10875     assert(i>0);
10876     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10877     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10878     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10879       emit_loadreg(CCREG,HOST_CCREG);
10880     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10881     add_to_linker((int)out,start+i*4,0);
10882     emit_jmp(0);
10883   }
10884
10885   // TODO: delay slot stubs?
10886   // Stubs
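        // Emit the out-of-line code recorded while assembling the block:
        // slow-path loads/stores, cycle-count (CC) checks, invalid-code
        // checks, unaligned-store handlers and FPU-unusable (COP1) traps.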
10887   for(i=0;i<stubcount;i++)
10888   {
10889     switch(stubs[i][0])
10890     {
10891       case LOADB_STUB:
10892       case LOADH_STUB:
10893       case LOADW_STUB:
10894       case LOADD_STUB:
10895       case LOADBU_STUB:
10896       case LOADHU_STUB:
10897         do_readstub(i);break;
10898       case STOREB_STUB:
10899       case STOREH_STUB:
10900       case STOREW_STUB:
10901       case STORED_STUB:
10902         do_writestub(i);break;
10903       case CC_STUB:
10904         do_ccstub(i);break;
10905       case INVCODE_STUB:
10906         do_invstub(i);break;
10907       case FP_STUB:
10908         do_cop1stub(i);break;
10909       case STORELR_STUB:
10910         do_unalignedwritestub(i);break;
10911     }
10912   }
10913
10914   if (instr_addr0_override)
10915     instr_addr[0] = instr_addr0_override;
10916
10917   /* Pass 9 - Linker */
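        // Resolve the branches recorded by add_to_linker(): link_addr[i] holds
        // the patch location, the target virtual address and an "internal"
        // flag.  Internal branches are pointed straight at instr_addr[target];
        // external targets get an extjump stub, bypassed immediately when
        // check_addr() already knows the destination.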
10918   for(i=0;i<linkcount;i++)
10919   {
10920     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10921     literal_pool(64);
10922     if(!link_addr[i][2])
10923     {
10924       void *stub=out;
10925       void *addr=check_addr(link_addr[i][1]);
10926       emit_extjump(link_addr[i][0],link_addr[i][1]);
10927       if(addr) {
10928         set_jump_target(link_addr[i][0],(int)addr);
10929         add_link(link_addr[i][1],stub);
10930       }
10931       else set_jump_target(link_addr[i][0],(int)stub);
10932     }
10933     else
10934     {
10935       // Internal branch
10936       int target=(link_addr[i][1]-start)>>2;
10937       assert(target>=0&&target<slen);
10938       assert(instr_addr[target]);
10939       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10940       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10941       //#else
10942       set_jump_target(link_addr[i][0],instr_addr[target]);
10943       //#endif
10944     }
10945   }
10946   // External Branch Targets (jump_in)
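        // Register the block entry and every branch target so code outside this
        // block can jump in: each gets a dirty-check stub and is added to the
        // jump_in/jump_dirty lists for its page.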
10947   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
10948   for(i=0;i<slen;i++)
10949   {
10950     if(bt[i]||i==0)
10951     {
10952       if(instr_addr[i]) // TODO - delay slots (=null)
10953       {
10954         u_int vaddr=start+i*4;
10955         u_int page=get_page(vaddr);
10956         u_int vpage=get_vpage(vaddr);
10957         literal_pool(256);
10958         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
10959 #ifndef FORCE32
10960         if(!requires_32bit[i])
10961 #else
10962         if(1)
10963 #endif
10964         {
10965           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10966           assem_debug("jump_in: %x\n",start+i*4);
10967           ll_add(jump_dirty+vpage,vaddr,(void *)out);
10968           int entry_point=do_dirty_stub(i);
10969           ll_add(jump_in+page,vaddr,(void *)entry_point);
10970           // If there was an existing entry in the hash table,
10971           // replace it with the new address.
10972           // Don't add new entries.  We'll insert the
10973           // ones that actually get used in check_addr().
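                  // Each hash bin holds two entries, laid out as
                  // { vaddr0, entry_point0, vaddr1, entry_point1 }.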
10974           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
10975           if(ht_bin[0]==vaddr) {
10976             ht_bin[1]=entry_point;
10977           }
10978           if(ht_bin[2]==vaddr) {
10979             ht_bin[3]=entry_point;
10980           }
10981         }
10982         else
10983         {
10984           u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
10985           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10986           assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
10987           //int entry_point=(int)out;
10988           ////assem_debug("entry_point: %x\n",entry_point);
10989           //load_regs_entry(i);
10990           //if(entry_point==(int)out)
10991           //  entry_point=instr_addr[i];
10992           //else
10993           //  emit_jmp(instr_addr[i]);
10994           //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
10995           ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
10996           int entry_point=do_dirty_stub(i);
10997           ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
10998         }
10999       }
11000     }
11001   }
11002   // Write out the literal pool if necessary
11003   literal_pool(0);
11004   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11005   // Align code to an 8-byte boundary (Cortex-A8 branch prediction workaround)
11006   if(((u_int)out)&7) emit_addnop(13);
11007   #endif
11008   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
11009   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
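        // Keep a copy of the source MIPS code in the shadow buffer so the
        // dirty stubs emitted above can later check whether the original code
        // has been modified and the block needs to be recompiled.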
11010   memcpy(copy,source,slen*4);
11011   copy+=slen*4;
11012   
11013   #ifdef __arm__
11014   __clear_cache((void *)beginning,out);
11015   #endif
11016   
11017   // If we're within 256K (MAX_OUTPUT_BLOCK_SIZE) of the end of the translation
11018   // cache, wrap to the beginning so the next block is guaranteed to fit.
11019   if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
11020   
11021   // Trap writes to any of the pages we compiled
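        // invalid_code[page]=0 marks the page as holding compiled code; in the
        // TLB build, bit 30 of memory_map write-protects the page so that
        // stores to it go through the invalidation path.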
11022   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
11023     invalid_code[i]=0;
11024 #ifndef DISABLE_TLB
11025     memory_map[i]|=0x40000000;
11026     if((signed int)start>=(signed int)0xC0000000) {
11027       assert(using_tlb);
11028       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
11029       invalid_code[j]=0;
11030       memory_map[j]|=0x40000000;
11031       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
11032     }
11033 #endif
11034   }
11035   
11036   /* Pass 10 - Free memory by expiring oldest blocks */
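        // The translation cache is treated as a ring buffer: 'expirep' sweeps a
        // fixed distance ahead of the current output position, and each step
        // clears one kind of stale reference (jump_in/jump_dirty lists,
        // jump_out pointers, hash-table entries, jump_out lists) for the region
        // that is about to be overwritten, so old blocks fall out of reach.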
11037   
11038   int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
11039   while(expirep!=end)
11040   {
11041     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
11042     int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
11043     inv_debug("EXP: Phase %d\n",expirep);
11044     switch((expirep>>11)&3)
11045     {
11046       case 0:
11047         // Clear jump_in and jump_dirty
11048         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
11049         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
11050         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
11051         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
11052         break;
11053       case 1:
11054         // Clear pointers (unlink jumps that target the expiring region)
11055         ll_kill_pointers(jump_out[expirep&2047],base,shift);
11056         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
11057         break;
11058       case 2:
11059         // Clear hash table
11060         for(i=0;i<32;i++) {
11061           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
11062           if((ht_bin[3]>>shift)==(base>>shift) ||
11063              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11064             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
11065             ht_bin[2]=ht_bin[3]=-1;
11066           }
11067           if((ht_bin[1]>>shift)==(base>>shift) ||
11068              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11069             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
11070             ht_bin[0]=ht_bin[2];
11071             ht_bin[1]=ht_bin[3];
11072             ht_bin[2]=ht_bin[3]=-1;
11073           }
11074         }
11075         break;
11076       case 3:
11077         // Clear jump_out
11078         #ifdef __arm__
11079         if((expirep&2047)==0) 
11080           do_clear_cache();
11081         #endif
11082         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
11083         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
11084         break;
11085     }
11086     expirep=(expirep+1)&65535;
11087   }
11088   return 0;
11089 }
11090
11091 // vim:shiftwidth=2:expandtab